options(warn =-1)  # suppress all warnings for the rest of the session
# R script for haplotype calling.
# This script is used by S3_file.sh.
# It also subsets the read sequence according to a BED file.
# It clusters the most populous reads by similarity, using Levenshtein
# distance.
# Input may be Illumina 150 bp single-end or paired-end reads.
# Annotation used in the output: missing as *, deletion as _, padding as N,
# low-score base as U.
#
# Command-line arguments (positional):
#   1. address  - path of the alignment file to read (SAM-style table)
#   2. SNPsite  - path of the SNP-site table (first line is skipped on read)
#   3. read_num - number of reads per fragment (converted to numeric)
#   4. read_len - read length in bp (converted to numeric)
#
# NOTE(review): in the text this was reviewed from, part of the script is
# missing inside the per-read loop (see the marked line below). The code
# tokens are reproduced exactly as found; restore the lost span from the
# original script before running.

# Clears every object in the global workspace before starting.
rm(list=ls())
args <- commandArgs(trailingOnly = TRUE)
print(args)
address <- args[1]
SNPsite <- args[2]
read_num <- args[3]
read_len <- args[4]
read_num = as.numeric(as.character(read_num))
read_len = as.numeric(as.character(read_len))
# Echo the Illumina read layout, e.g. "1*150bp".
print(paste(read_num, "*",read_len,"bp", sep=""))
# How many N are allowed as padding when paired-end reads do not overlap
# (default 10).
padding_len=10
# Minimum accepted read length: Illumina read length plus the N padding.
min_length_m=read_len+padding_len
# Cap on the number of reads processed per amplicon (default 5000).
sample_size=5000
# Adjusts how many SNPs are displayed in the output.
primter_length=25
# Threshold for each leaf; below, processing of an amplicon requires more
# than this many reads.
min_read=10
# Threshold for each node; below, it caps how many entries are clustered.
min_type=10
# Minimum mapping quality; reads that map poorly are filtered out.
MAPQ_min=30
# Base-quality cutoff for each read.
BaseQ_min=13

# Load the SNP sites (the first line of the file is skipped).
SNPSiteAll=read.table(SNPsite,skip=1,sep="")
# Count the SNP rows per amplicon (column 1 holds the amplicon name).
SNP_chr=table(SNPSiteAll[,1])
# Read the SAM file, skipping as many leading lines as there are amplicons.
# NOTE(review): presumably this skips the SAM header lines - confirm that the
# header line count always equals the number of amplicons.
t=read.table(address,skip = nrow(SNP_chr),sep="",fill=T,row.names=NULL)
# Remove rows that contain any missing value.
t1=t[complete.cases(t),]
# Remove unmapped reads (column 3 equal to "*").
t1=t1[t1[,3]!="*",]
# Blank out columns 14-16; they are read back as per-read vectors below.
t1[,14:16]=""
# Store the read length (character count of column 10) in column 17.
t1[,17]=nchar(as.character(t1[,10]))
# Keep only reads at least min_length_m long.
t1=t1[t1[,17]>=min_length_m,]
# Keep only reads whose column 5 (compared against MAPQ_min) is high enough.
t1=t1[t1[,5]>=MAPQ_min,]
t_all=t1
rm(t)

# Process one amplicon at a time.
# NOTE(review): `1:nrow(SNP_chr)` iterates over c(1, 0) when the table is
# empty; seq_len() would be the safe form.
for (chr_num in 1:nrow(SNP_chr)){
  print(c(chr_num, SNP_chr[chr_num]))
  # Extract the reads and the SNP sites that belong to the current amplicon.
  t1=t_all[t_all[,3]==rownames(SNP_chr)[chr_num],]
  SNPSite=SNPSiteAll[SNPSiteAll[,1]==rownames(SNP_chr)[chr_num],]
  SNPSite_start=SNPSite[,2]
  SNPSite_stop=SNPSite[,3]
  # Minimum length for this amplicon: last SNP stop minus first SNP start.
  min_length=min(SNPSite_stop[length(SNPSite_stop)]-SNPSite_start[1])
  # Minimum length used when displaying SNPs.
  min_length_SNP=min_length
  # Raise min_length_SNP to at least read_num*read_len - primter_length
  # (e.g. 150 - 25 bp).
  if (min_length_SNP<(read_num*read_len)-primter_length){
    min_length_SNP=(read_num*read_len)-primter_length
  }
  row_num_t1=nrow(t1)
  print(row_num_t1)
  # Adjust the minimum lengths when the median read length is below 135:
  # min_length falls back to min_length_m and min_length_SNP becomes the
  # geometric mean of the two values.
  if (row_num_t1>0){
    if (median(t1[,17],na.rm=TRUE)<135) {
      min_length=min_length_m
      #min_length_SNP=mean(c(min_length_SNP,min_length))
      min_length_SNP=sqrt(min_length_SNP * min_length)
    }
  }
  # Filter read length according to each chromosome.
  # NOTE(review): no filtering statement is visible here; the row count is
  # only recomputed and printed.
  row_num_t1=nrow(t1)
  print(row_num_t1)
  # Skip the amplicons named HID, HID-Y and Disease.
  if (!(rownames(SNP_chr)[chr_num] %in% c("HID","HID-Y","Disease"))){
    # Downsample to at most sample_size reads, without replacement.
    if (row_num_t1>sample_size) t1=t1[sample(row_num_t1,size=sample_size,replace=FALSE),]
    row_num_t1d=nrow(t1)
    print(row_num_t1d)
    # Pull the needed columns out of the data frame as vectors.
    t1_chr=t1[,3]
    t1_pos=t1[,4]
    t1_cig=t1[,6]
    t1_seq=t1[,10]
    t1_seq_col=t1[,14]
    t1_seq_ex=t1[,15]
    t1_seq_exnum=t1[,16]
    t1_seq_score=t1[,11]
    # Require more than min_read reads to work with.
    if (row_num_t1d>min_read) {
      # Start extracting sequence according to the current amplicon.
      for (i in 1:row_num_t1d){
        # Split the CIGAR string into its numbers and its operation letters.
        # NOTE(review): `[aA-zZ]` is an unusual character class; `[A-Za-z]`
        # was probably intended - confirm.
        cigar_num=gsub("([aA-zZ]+)", ",", as.character(t1_cig[i]))
        cigar_chr=gsub("([0-9]+)", "", as.character(t1_cig[i]))
        cigar_num1=strsplit(cigar_num,",")
        cigar_chr1=strsplit(cigar_chr,"")
        cigar_num1=unlist(cigar_num1)
        cigar_chr1=unlist(cigar_chr1)
        # CIGAR index start and stop: deletions ("D") are given length 0 so
        # they do not advance the cumulative index; start_pos/stop_pos keep
        # the first and last index of every "M" segment.
        cigar_num2=cigar_num1
        cigar_num2[cigar_chr1=="D"]=0
        ci_stop=cumsum(cigar_num2)
        ci_start=ci_stop-as.numeric(cigar_num2)+1
        start_pos=ci_start[cigar_chr1=="M"]
        stop_pos=ci_stop[cigar_chr1=="M"]
        # Identify the padding SNP position in the reference, including a
        # 5 bp buffer zone at each end. amp_match_len is the summed length of
        # the "M" and "D" operations.
        amp_match_len=sum(as.numeric(cigar_num1[cigar_chr1 %in% c("M","D")]))
        Padding_start=t1_pos[i]+read_len-5
        Padding_stop=t1_pos[i]+amp_match_len-1-read_len+5
        # Indices of SNP sites whose start / stop lies in the padding window.
        PadSNP_start=which(SNPSite_start >=Padding_start & SNPSite_start <=Padding_stop)
        PadSNP_stop=which(SNPSite_stop >=Padding_start & SNPSite_stop <=Padding_stop)
        # Base quality scores: byte value of each quality character minus 33.
        temp_seq_score=as.numeric(charToRaw(as.character(t1_seq_score[i])))-33
        # Split the read sequence into single characters.
        temp_seq=unlist(strsplit(as.character(t1_seq[i]),""))
        # NOTE(review): the next line is damaged - the script text between
        # `temp_seq[temp_seq_score` and `2) {` is missing from this copy.
        # The lost span presumably contains the rest of the per-read loop
        # and the definition of `a` used below - TODO restore from the
        # original file. Indentation from here on follows the brace count of
        # the visible text only and may not match the real nesting.
        temp_seq[temp_seq_score2) {
          # Levenshtein distance between the row names of the first top_num
          # entries of `a`, followed by hierarchical clustering.
          top_num=min(nrow(a),min_type)
          d<-adist(rownames(a[1:top_num]))
          rownames(d)=rownames(a[1:top_num])
          hc<-hclust(as.dist(d))
          # Data frame of the first top_num entries of `a` together with
          # their cluster assignment; the tree is cut into
          # max(2, ceiling(top_num/3)) groups. (top_num is capped at
          # min_type.)
          df<-data.frame(a[1:top_num],cutree(hc,k=max(2,ceiling(top_num/3))))
          # Column 3 becomes each row's share of the column-1 total.
          # NOTE(review): which columns hold counts vs. cluster ids depends
          # on the type of `a`, which is not visible here - confirm.
          df[,3]=df[,1]/sum(df[,1])
          # Save the result into a folder named after the amplicon, next to
          # the input file.
          address_folder=paste(dirname(address),"/",rownames(SNP_chr)[chr_num],sep="")
          dir.create(address_folder,showWarnings =F)
          write.csv(df,file=paste(address_folder,"/",basename(address),"_",rownames(SNP_chr)[chr_num],"_cluster.csv",sep=""),row.names=TRUE)
          # Draw the cluster dendrogram above a bar plot in one 6 x 6 inch,
          # 300 dpi PNG (two stacked panels).
          png(paste(address_folder,"/",basename(address),"_",rownames(SNP_chr)[chr_num],".png",sep=""), width = 6, height = 6, units = "in", res = 300)
          par(oma = c(1, 1, 1, 1))
          par(mfrow=c(2,1))
          par(mar = c(0, 1.5, 0.5, 1.5))
          plot(hc,cex=0.6,main=paste(basename(address),"_",rownames(SNP_chr)[chr_num],sep=""), cex.main=0.75, axes=T, hang=-1,family = "mono", font = 2)
          # Outline the same groups that cutree() produced above.
          rect.hclust(hc,k=max(2,ceiling(top_num/3)))
          #dev.new(width=5, height=4)
          # Bar plot of `a`, ordered as in the dendrogram.
          par(mar = c(0, 1, 0.2, 1))
          barplot(a[hc$order],las=3,cex.names=1)
          # Draw the next plot on top of the current one.
          par(new = TRUE)
          # Overlay the share values (df column 3) with their own right-hand
          # axis.
          barplot(df[hc$order,3],axes=F,xlab=NA, ylab=NA,las=3,cex.names=1)
          axis(4, las=3)
          dev.off()
        }
      }
    }
  }
}