# ==== README ====
# input data for this script are modified output files from Tandem Repeat Finder
# the header from original .dat files was manually removed and replaced with header for individual collumns as follows:
# Indices1 Indices2 Period_Size Copy_Number Consensus_Size Percent_Matches Percent_Indels Score A C G T X Consensus_sequence Sequence

# ==== read data and combine them into one table ===

A_indata=read.table("amlnn.fasta.2.7.7.80.10.50.1000.dat", header=T)
Y_indata=read.table("ymlnn.fasta.2.7.7.80.10.50.1000.dat", header=T)
X_indata=read.table("xmlnn.fasta.2.7.7.80.10.50.1000.dat", header=T)

A_indata$chromozom<-factor(rep("A",nrow(A_indata)), levels = c("A","X","Y"))
Y_indata$chromozom<-factor(rep("Y",nrow(Y_indata)), levels = c("A","X","Y"))
X_indata$chromozom<-factor(rep("X",nrow(X_indata)), levels = c("A","X","Y"))

all<-rbind(A_indata, Y_indata, X_indata)
rm(A_indata, Y_indata, X_indata)

# ==== Assign consistent tandem repeat identifiers ====

# Tandem repeat finder outputs repeat consensus for each identified array somewat
# inconsistently. For example satelite "CAA" is equivalent to "ACA", "AAC", "TTG",
# "TGT" and "TTG". All of thease variants can be found in the data. To recognize
# all permutations as equivalent, we create new collumn with aphabeticly first
# permutation which will serve to identify ocurences of the same satelite. So for
# example "CAA" and all its permutations repeat wil be labeled as "AAC".


library(stringi)
all$revcomp <- stri_reverse(chartr("ATCG","TAGC",all$Consensus_sequence))

library(stringr)
first_repeat<-function(x,xrev)
{
  con<-paste(x,x,sep="")
  conrev<-paste(xrev,xrev,sep="")
  l1<-rep(NA, 2*str_length(x))
  for(i in 1:str_length(x))
  {
    l1[i]<-substring(con,i,i+str_length(x)-1)
  }
  for(i in 1:str_length(x))
  {
    l1[i+str_length(x)]<-substring(conrev,i,i+str_length(x)-1)
  }
  return(sort(l1)[1])
}

all$unique<-mapply( first_repeat, all$Consensus_sequence, all$revcomp)
all$funique<-as.factor(all$unique)


# ==== Create graphs of array length periodicity for selected statelites ====

# calculate array lengths
all$bp<-nchar(x=as.character(all$Sequence))

dir.create("work")
#for(i in levels(all$funique)) # produce graphs for all satelites OR only for selected few:
for(i in c("AAC","AACACACCC","AAG","AACCCT","AACAACAAG","AG",
           "AAAAACGAGCG",
           "AAAAAATCGTCATCGAGCTCAAAAACGTGTTTGATGACATTATTTCGAGCTTGATGACGTT",
           "AAACACACCC","ACACTGTGATG","AAAACTCTACACG","AAAAACGAGC","AAACACACC",
           "AAACACACCCAACACAACC","AAACACACCCAACACACC"))
{
  if(sum(all$funique==i)>100)
  {
    data=all[all$funique==i,c("bp","chromozom")]
    data$bp=factor(data$bp, levels=0:151)
    
    tab=as.matrix(table(data))
    pdf(paste("work/",i, ".pdf", sep = ""))
    plot(y=c(tab[,1]),
         x=row.names(tab),
         col="green4",
         type="l",
         xlab="Base pairs",
         ylab="Abundance per milion reads",
         main=i,
         ylim=c(0, max(tab))
    )
    lines(y=c(tab[,2]),
          x=row.names(tab), col="red")
    lines(y=c(tab[,3]),
          x=row.names(tab), col="blue")
    dev.off()
  }
}


# ==== Calculate abundances for each chromosome library ====

# Add number of bp (and number of ocurences = reads) for each satelite separately for each chromosome library
bp_sums = aggregate(list(bp = all$bp), by = list(funique = all$funique, chromozom = all$chromozom), FUN = sum)
read_sums = aggregate(list(reads = rep(1, nrow(all))), by = list(funique = all$funique, chromozom = all$chromozom), FUN = sum)
sums = merge(bp_sums, read_sums)

# Table manipulatios to get one line per one satelite and abudances neatly next to each other...
# Split to three tables for each chromosome and skip chromosome collumn
A_sums = sums[sums$chromozom == "A", colnames(sums) != "chromozom"]
X_sums = sums[sums$chromozom == "X", colnames(sums) != "chromozom"]
Y_sums = sums[sums$chromozom == "Y", colnames(sums) != "chromozom"]
# Name the count collumns acording to chromosome
colnames(A_sums)[2] = "bp_A"
colnames(X_sums)[2] = "bp_X"
colnames(Y_sums)[2] = "bp_Y"
colnames(A_sums)[3] = "reads_A"
colnames(X_sums)[3] = "reads_X"
colnames(Y_sums)[3] = "reads_Y"

# Merge into one overview table
overview = merge(merge(A_sums, X_sums, all = T), Y_sums, all = T)
rm(A_sums, X_sums, Y_sums)
# Replace NAs with zeroes
overview[is.na(overview)] = 0

# Convert counts to percentages
overview$percent_A = overview$bp_A*100/(2000000*151)
overview$percent_X = overview$bp_X*100/(2000000*151)
overview$percent_Y = overview$bp_Y*100/(2000000*151)
overview$percent_reads_A = overview$reads_A*100/2000000 
overview$percent_reads_X = overview$reads_X*100/2000000 
overview$percent_reads_Y = overview$reads_Y*100/2000000 


# Reorder overview table to have the most interesting (most abundant) at the top
overview_abundance = overview[order(-(10.8*overview$percent_A + 1.8*overview$percent_X + 2.35*overview$percent_Y)), ]
write.csv(overview_abundance, file = "satelites_procentages.csv")


# ==== Plot Rank abundace curve ====

# For further analysis use only satelites with perios size larger then 1
AAA = overview$percent_A[overview$Period_Size>1]
XXX = overview$percent_X[overview$Period_Size>1]
YYY = overview$percent_Y[overview$Period_Size>1]


# Plot
svg("C_Rank_abundance_curves.svg", height = 6, width = 6)
par(mar=c(4.1,4.1,1.1,1.1))
plot(0, type = "n",
     xlim = c(0,log10(Nbiggest)),
     ylim = sort(c(log10(min(c(AAA[AAA != 0], XXX[XXX != 0], YYY[YYY != 0]))),
                   log10(max(c(AAA, XXX, YYY))))),
     xlab = "Rank",
     ylab = "Abundance [% of genome]",
     yaxt = "n",
     xaxt = "n")
yznacky = c(1,0.1,0.01,0.001,0.0001,0.00001)
axis(side = 2,
     labels = yznacky,
     at = log10(yznacky),
     las = 1)
xznacky = c(1,10,100,1000,10000,100000)
axis(side = 1,
     labels = as.character(xznacky),
     at = log10(xznacky),
     las = 1)
lines(y = log10(sort(AAA, decreasing = T)),
      x = log10(1:length(AAA)),
      col = "green4")
lines(y = log10(sort(XXX, decreasing = T)),
      x = log10(1:length(XXX)),
      col = "red")
lines(y = log10(sort(YYY, decreasing = T)),
      x = log10(1:length(YYY)),
      col = "blue")
legend("topright",
       legend = c("X chromosome", "Y chroosome", "Autosomes"),
       col = c("red", "blue", "green4"),
       lty = 1,
       bty = "n")
dev.off()


# ==== Few general statisticks ====

# total procentages
sum(AAA)
sum(XXX)
sum(YYY)
# mismatch rate
selected = !grepl("NNN", all$Sequence) & all$Period_Size > 1
100 - weighted.mean(all$Percent_Matches[all$chromozom == "A" & selected], all$bp[all$chromozom == "A" & selected])
100 - weighted.mean(all$Percent_Matches[all$chromozom == "X" & selected], all$bp[all$chromozom == "X" & selected])
100 - weighted.mean(all$Percent_Matches[all$chromozom == "Y" & selected], all$bp[all$chromozom == "Y" & selected])
# indel rate
weighted.mean(all$Percent_Indels[all$chromozom == "A" & selected], all$bp[all$chromozom == "A" & selected])
weighted.mean(all$Percent_Indels[all$chromozom == "X" & selected], all$bp[all$chromozom == "X" & selected])
weighted.mean(all$Percent_Indels[all$chromozom == "Y" & selected], all$bp[all$chromozom == "Y" & selected])
# mean array lengths
mean(all$bp[all$chromozom == "A" & selected])
mean(all$bp[all$chromozom == "X" & selected])
mean(all$bp[all$chromozom == "Y" & selected])


# ==== Histogram AC content ====

library(stringr)
overview$A = str_count(as.character(overview$funique), "A")
overview$C = str_count(as.character(overview$funique), "C")
overview$G = str_count(as.character(overview$funique), "G")
overview$T = str_count(as.character(overview$funique), "T")
bining = 0.05
overview$AC_bins = cut((overview$A+overview$C)/overview$Period_Size, seq(0, 1, bining))

AC_data = aggregate(overview[,c("percent_A","percent_X","percent_Y")],
                    by = list(AC_bins = overview$AC_bins),
                    sum,
                    drop = F)
AC_data[is.na(AC_data)] = 0
AC_data$Mbp_A = AC_data$percent_A*(10.8/14.95*7335)
AC_data$Mbp_X = AC_data$percent_X*(1.8/14.95*7335)
AC_data$Mbp_Y = AC_data$percent_Y*(2.35/14.95*7335)
AC_matrix = as.matrix(AC_data[,c("Mbp_A","Mbp_X","Mbp_Y")])
AC_matrix = t(AC_matrix)

# Specifically AAC
AAC_Mbp = colSums(overview[overview$funique %in% c("AAC","AACAAC","AACAACAAC","AACAACAACAAC", "AACAACAACAACAAC","AACAACAACAACAACAAC","AACAACAACAACAACAACAAC"),
                           c("percent_A","percent_X","percent_Y")]) * c((10.8/14.95*7335),(1.8/14.95*7335),(2.35/14.95*7335))

# Plot
svg(filename = "CAA_histogram.svg")
barplot(AC_matrix, 
        #col = c("green4", "red", "blue"),
        col = c("#7fc47f", "#ff7f7f", "#7f7fff"),
        border = "black", 
        space = 0, 
        xlab = "Portion of A + C in monomer sequence",
        ylab = "Tandem repeat abundance in male genome [Mbp]",
        ylim = c(0,2000))
axis(side = 1,
     labels = seq(0, 1, 0.1),
     at = seq(0,1/bining,0.1/bining))
# AAC emphasis
ytops = c(sum(AC_matrix[1,20]), sum(AC_matrix[c(1,2),20]),sum(AC_matrix[,20]))
ybottoms = ytops - AAC_Mbp
rect(xleft = 1/bining-1,
     xright = 1/bining,
     ytop = ytops,
     ybottom = ybottoms,
     angle = 45,
     density = 25,
     border = NA)
legend("topleft",
       legend = c("Autosome", "X chromosome", "Y chromosome", "CAA repeat"),
       fill = c("#7fc47f", "#ff7f7f", "#7f7fff", NA),
       border = c("black","black","black","gray"),
       angle = c(NA,NA,NA,45),
       density = c(NA,NA,NA,25),
       bty = "n"
)
dev.off()