# ==== README ==== # input data for this script are modified output files from Tandem Repeat Finder # the header from original .dat files was manually removed and replaced with header for individual collumns as follows: # Indices1 Indices2 Period_Size Copy_Number Consensus_Size Percent_Matches Percent_Indels Score A C G T X Consensus_sequence Sequence # ==== read data and combine them into one table === A_indata=read.table("amlnn.fasta.2.7.7.80.10.50.1000.dat", header=T) Y_indata=read.table("ymlnn.fasta.2.7.7.80.10.50.1000.dat", header=T) X_indata=read.table("xmlnn.fasta.2.7.7.80.10.50.1000.dat", header=T) A_indata$chromozom<-factor(rep("A",nrow(A_indata)), levels = c("A","X","Y")) Y_indata$chromozom<-factor(rep("Y",nrow(Y_indata)), levels = c("A","X","Y")) X_indata$chromozom<-factor(rep("X",nrow(X_indata)), levels = c("A","X","Y")) all<-rbind(A_indata, Y_indata, X_indata) rm(A_indata, Y_indata, X_indata) # ==== Assign consistent tandem repeat identifiers ==== # Tandem repeat finder outputs repeat consensus for each identified array somewat # inconsistently. For example satelite "CAA" is equivalent to "ACA", "AAC", "TTG", # "TGT" and "TTG". All of thease variants can be found in the data. To recognize # all permutations as equivalent, we create new collumn with aphabeticly first # permutation which will serve to identify ocurences of the same satelite. So for # example "CAA" and all its permutations repeat wil be labeled as "AAC". library(stringi) all$revcomp <- stri_reverse(chartr("ATCG","TAGC",all$Consensus_sequence)) library(stringr) first_repeat<-function(x,xrev) { con<-paste(x,x,sep="") conrev<-paste(xrev,xrev,sep="") l1<-rep(NA, 2*str_length(x)) for(i in 1:str_length(x)) { l1[i]<-substring(con,i,i+str_length(x)-1) } for(i in 1:str_length(x)) { l1[i+str_length(x)]<-substring(conrev,i,i+str_length(x)-1) } return(sort(l1)[1]) } all$unique<-mapply( first_repeat, all$Consensus_sequence, all$revcomp) all$funique<-as.factor(all$unique) # ==== Create graphs of array length periodicity for selected statelites ==== # calculate array lengths all$bp<-nchar(x=as.character(all$Sequence)) dir.create("work") #for(i in levels(all$funique)) # produce graphs for all satelites OR only for selected few: for(i in c("AAC","AACACACCC","AAG","AACCCT","AACAACAAG","AG", "AAAAACGAGCG", "AAAAAATCGTCATCGAGCTCAAAAACGTGTTTGATGACATTATTTCGAGCTTGATGACGTT", "AAACACACCC","ACACTGTGATG","AAAACTCTACACG","AAAAACGAGC","AAACACACC", "AAACACACCCAACACAACC","AAACACACCCAACACACC")) { if(sum(all$funique==i)>100) { data=all[all$funique==i,c("bp","chromozom")] data$bp=factor(data$bp, levels=0:151) tab=as.matrix(table(data)) pdf(paste("work/",i, ".pdf", sep = "")) plot(y=c(tab[,1]), x=row.names(tab), col="green4", type="l", xlab="Base pairs", ylab="Abundance per milion reads", main=i, ylim=c(0, max(tab)) ) lines(y=c(tab[,2]), x=row.names(tab), col="red") lines(y=c(tab[,3]), x=row.names(tab), col="blue") dev.off() } } # ==== Calculate abundances for each chromosome library ==== # Add number of bp (and number of ocurences = reads) for each satelite separately for each chromosome library bp_sums = aggregate(list(bp = all$bp), by = list(funique = all$funique, chromozom = all$chromozom), FUN = sum) read_sums = aggregate(list(reads = rep(1, nrow(all))), by = list(funique = all$funique, chromozom = all$chromozom), FUN = sum) sums = merge(bp_sums, read_sums) # Table manipulatios to get one line per one satelite and abudances neatly next to each other... # Split to three tables for each chromosome and skip chromosome collumn A_sums = sums[sums$chromozom == "A", colnames(sums) != "chromozom"] X_sums = sums[sums$chromozom == "X", colnames(sums) != "chromozom"] Y_sums = sums[sums$chromozom == "Y", colnames(sums) != "chromozom"] # Name the count collumns acording to chromosome colnames(A_sums)[2] = "bp_A" colnames(X_sums)[2] = "bp_X" colnames(Y_sums)[2] = "bp_Y" colnames(A_sums)[3] = "reads_A" colnames(X_sums)[3] = "reads_X" colnames(Y_sums)[3] = "reads_Y" # Merge into one overview table overview = merge(merge(A_sums, X_sums, all = T), Y_sums, all = T) rm(A_sums, X_sums, Y_sums) # Replace NAs with zeroes overview[is.na(overview)] = 0 # Convert counts to percentages overview$percent_A = overview$bp_A*100/(2000000*151) overview$percent_X = overview$bp_X*100/(2000000*151) overview$percent_Y = overview$bp_Y*100/(2000000*151) overview$percent_reads_A = overview$reads_A*100/2000000 overview$percent_reads_X = overview$reads_X*100/2000000 overview$percent_reads_Y = overview$reads_Y*100/2000000 # Reorder overview table to have the most interesting (most abundant) at the top overview_abundance = overview[order(-(10.8*overview$percent_A + 1.8*overview$percent_X + 2.35*overview$percent_Y)), ] write.csv(overview_abundance, file = "satelites_procentages.csv") # ==== Plot Rank abundace curve ==== # For further analysis use only satelites with perios size larger then 1 AAA = overview$percent_A[overview$Period_Size>1] XXX = overview$percent_X[overview$Period_Size>1] YYY = overview$percent_Y[overview$Period_Size>1] # Plot svg("C_Rank_abundance_curves.svg", height = 6, width = 6) par(mar=c(4.1,4.1,1.1,1.1)) plot(0, type = "n", xlim = c(0,log10(Nbiggest)), ylim = sort(c(log10(min(c(AAA[AAA != 0], XXX[XXX != 0], YYY[YYY != 0]))), log10(max(c(AAA, XXX, YYY))))), xlab = "Rank", ylab = "Abundance [% of genome]", yaxt = "n", xaxt = "n") yznacky = c(1,0.1,0.01,0.001,0.0001,0.00001) axis(side = 2, labels = yznacky, at = log10(yznacky), las = 1) xznacky = c(1,10,100,1000,10000,100000) axis(side = 1, labels = as.character(xznacky), at = log10(xznacky), las = 1) lines(y = log10(sort(AAA, decreasing = T)), x = log10(1:length(AAA)), col = "green4") lines(y = log10(sort(XXX, decreasing = T)), x = log10(1:length(XXX)), col = "red") lines(y = log10(sort(YYY, decreasing = T)), x = log10(1:length(YYY)), col = "blue") legend("topright", legend = c("X chromosome", "Y chroosome", "Autosomes"), col = c("red", "blue", "green4"), lty = 1, bty = "n") dev.off() # ==== Few general statisticks ==== # total procentages sum(AAA) sum(XXX) sum(YYY) # mismatch rate selected = !grepl("NNN", all$Sequence) & all$Period_Size > 1 100 - weighted.mean(all$Percent_Matches[all$chromozom == "A" & selected], all$bp[all$chromozom == "A" & selected]) 100 - weighted.mean(all$Percent_Matches[all$chromozom == "X" & selected], all$bp[all$chromozom == "X" & selected]) 100 - weighted.mean(all$Percent_Matches[all$chromozom == "Y" & selected], all$bp[all$chromozom == "Y" & selected]) # indel rate weighted.mean(all$Percent_Indels[all$chromozom == "A" & selected], all$bp[all$chromozom == "A" & selected]) weighted.mean(all$Percent_Indels[all$chromozom == "X" & selected], all$bp[all$chromozom == "X" & selected]) weighted.mean(all$Percent_Indels[all$chromozom == "Y" & selected], all$bp[all$chromozom == "Y" & selected]) # mean array lengths mean(all$bp[all$chromozom == "A" & selected]) mean(all$bp[all$chromozom == "X" & selected]) mean(all$bp[all$chromozom == "Y" & selected]) # ==== Histogram AC content ==== library(stringr) overview$A = str_count(as.character(overview$funique), "A") overview$C = str_count(as.character(overview$funique), "C") overview$G = str_count(as.character(overview$funique), "G") overview$T = str_count(as.character(overview$funique), "T") bining = 0.05 overview$AC_bins = cut((overview$A+overview$C)/overview$Period_Size, seq(0, 1, bining)) AC_data = aggregate(overview[,c("percent_A","percent_X","percent_Y")], by = list(AC_bins = overview$AC_bins), sum, drop = F) AC_data[is.na(AC_data)] = 0 AC_data$Mbp_A = AC_data$percent_A*(10.8/14.95*7335) AC_data$Mbp_X = AC_data$percent_X*(1.8/14.95*7335) AC_data$Mbp_Y = AC_data$percent_Y*(2.35/14.95*7335) AC_matrix = as.matrix(AC_data[,c("Mbp_A","Mbp_X","Mbp_Y")]) AC_matrix = t(AC_matrix) # Specifically AAC AAC_Mbp = colSums(overview[overview$funique %in% c("AAC","AACAAC","AACAACAAC","AACAACAACAAC", "AACAACAACAACAAC","AACAACAACAACAACAAC","AACAACAACAACAACAACAAC"), c("percent_A","percent_X","percent_Y")]) * c((10.8/14.95*7335),(1.8/14.95*7335),(2.35/14.95*7335)) # Plot svg(filename = "CAA_histogram.svg") barplot(AC_matrix, #col = c("green4", "red", "blue"), col = c("#7fc47f", "#ff7f7f", "#7f7fff"), border = "black", space = 0, xlab = "Portion of A + C in monomer sequence", ylab = "Tandem repeat abundance in male genome [Mbp]", ylim = c(0,2000)) axis(side = 1, labels = seq(0, 1, 0.1), at = seq(0,1/bining,0.1/bining)) # AAC emphasis ytops = c(sum(AC_matrix[1,20]), sum(AC_matrix[c(1,2),20]),sum(AC_matrix[,20])) ybottoms = ytops - AAC_Mbp rect(xleft = 1/bining-1, xright = 1/bining, ytop = ytops, ybottom = ybottoms, angle = 45, density = 25, border = NA) legend("topleft", legend = c("Autosome", "X chromosome", "Y chromosome", "CAA repeat"), fill = c("#7fc47f", "#ff7f7f", "#7f7fff", NA), border = c("black","black","black","gray"), angle = c(NA,NA,NA,45), density = c(NA,NA,NA,25), bty = "n" ) dev.off()