###First run utility functions below. #################################### #Reads .fastq files. Returns a list of vectorized sequences as in fasta import. read.fastq<-function(file){ infile<-readLines(con=file) z<-length(infile) if(z%%4!=0)stop('Line nr. is not a multiple of 4') fqnames<-infile[seq(1,length(infile),by=4)] if(length(fqnames)!=(length(infile)/4)){ stop('Nr. sequence names does not match nr. lines in file') } fqseqs<-infile[seq(2,length(infile),by=4)] if(length(fqseqs)!=(length(infile)/4)){ stop('Nr. sequences does not match nr. lines in file') } fqnames<-substring(fqnames,2) out<-lapply(fqseqs,function(x)paste(x)) out<-sapply(out,function(x)strsplit(x,split='')) names(out)<-fqnames return(out) } ######################################## #The following function finds oligos in a sequence. The query should be of #the form 'gaagg', i.e. a continuous string. The sequence to be searched #should have the form of a vector. The routine is not case sensitive. #It returns the positions of the of the sites matching the query sequence. #The output is in matrix form. If the query matches more than one segment #positions of all matches are returned. find.seq<-function(query,seq){ query<-toupper(query);seq<-toupper(seq) seq<-c(seq,'-') x<-unlist(strsplit(query,split='')) sites<-as.matrix(grep(x[1],seq)) for(i in 2:length(x)){ nsit<-(seq[sites[,(i-1)]+1]==x[i])*sites[,(i-1)] #if(sum(nsit)==0)stop('No match for query') if(sum(nsit)==0)print('No match for query') if(sum(nsit)==0)return(0) sites<-matrix(sites[nsit!=0,],ncol=i-1) nsit<-nsit[nsit!=0] sites<-cbind(sites,nsit+1) } #if(nrow(sites)>1)stop('Multiple matches to query') if(nrow(sites)>1)print('Multiple matches to query') if(nrow(sites)>1)return(0) return(sites) } ##################################################### #Function for looping through sequences in a list (e.g. imported fasta file). #The arguments are the sequences flanking the microsats (flank1 and flank2), #and the name of the list containing the sequence data. #Returns a new list containing the microsat sequences. extr.mi.sat<-function(flank1,flank2,algn){ out<-list() for(i in 1:length(algn)){ sequence<-algn[[i]] start<-max(find.seq(flank1,sequence)) if(start==0)next stop<-min(find.seq(flank2,sequence)) if(stop==0)next mi.sat<-sequence[(start+1):(stop-1)] out<-c(out,list(mi.sat)) } return(out) } ############################################## #Function for reverse complementing a sequence. The string option lets you #decide if you want the function to return a text string rather than a character #vector (default). rev.comp<-function(seq,string=F){ seq<-toupper(seq) seq<-unlist(strsplit(seq,split='')) comp<-rep('N',length(seq)) comp[seq=='A']<-'T' comp[seq=='C']<-'G' comp[seq=='G']<-'C' comp[seq=='T']<-'A' rc<-rev(comp) if(string==T)rc<-paste(rc,collapse='') return(rc) } ####################################### #Function for demultiplexing sequence reads. 'barcodes' should be a two #column table with the first column containing the sample name and the #second columng the barcode sequences. 'seqs' is the list containing #the sequences to be multiplexed. 'mm' is the maximum number of mismatches #allowed. #Returns a list of length nr. barcodes with each element containing #another list with the demultiplexed sequences. demultiplex<-function(barcodes,seqs,mm=2){ bc<-as.character(barcodes[,2]) names_bc<-as.character(barcodes[,1]) bc_size<-nchar(bc) start_f<-lapply(seqs,function(x)x[1:bc_size[1]]) start_f<-lapply(start_f,toupper) start_rc<-lapply(seqs,function(x)x[(length(x)-(bc_size[1]-1)):length(x)]) start_rc<-lapply(start_rc,toupper) out<-list() for(i in 1:nrow(barcodes)){ bc_seq<-unlist(strsplit(bc[i],split='')) bc_match_f<-unlist(lapply(start_f,function(x)sum(x==bc_seq))) bc_match_rc<-unlist(lapply(start_rc,function(x)sum(x==rev.comp(bc_seq)))) fwd<-all_seqs[which(bc_match_f>=bc_size[1]-mm)] revc<-all_seqs[which(bc_match_rc>=bc_size[1]-mm)] rev_f<-lapply(revc,rev.comp) merged<-c(fwd,rev_f) out[[i]]<-merged } names(out)<-names_bc return(out) } ############################################ fix.var<-function(x,fix)abs(fix-x) ########################################## shannon<-function(x){ pos<-x[x>0] pos<-table(pos)/length(pos) temp<-sum(pos*log(pos)) si<--temp return(si) } ############################################## ###one nucleotide slippage, fwd vs. bwd fwd.slip<-function(x,n,prop=T){ tot<-length(x) fwd<-sum(x==(n+1)) bwd<-sum(x==(n-1)) if(prop==T) fb<-c((fwd/tot)*100,(bwd/tot)*100) else fb<-c(fwd,bwd) return(fb) } ############################################### ###percent correct allele size pct.corr<-function(x,n){ tot<-length(x) target<-sum(x==n) prc<-(target/tot)*100 return(prc) } ############################################# ###compare backward vs. forward slippage fbc<-function(x,target,lms){ fwd<-x[x>target] bwd<-x[x1) x1<-x1[index2];x2<-x2[index2] x2<-x2/sum(x2) out<-rbind(x1,x2) return(out) } ############################################# ###Function is same as extr.mi.sat except it includes the flanking search ###sequence in return sequence extr.spacer<-function(flank1,flank2,algn){ out<-list() for(i in 1:length(algn)){ sequence<-algn[[i]] start<-min(find.seq(flank1,sequence)) if(start==0)next stop<-max(find.seq(flank2,sequence)) if(stop==0)next spacer<-sequence[(start):(stop)] out<-c(out,list(spacer)) } return(out) } ############################################ ###compare backward vs. forward slippage indel.rate<-function(x,target){ n<-length(x) single_ins<-sum(x==(target+1)) single_ins<-(single_ins/n)*100 single_del<-sum(x==(target-1)) single_del<-(single_del/n)*100 fwd<-sum(x>target) bwd<-sum(x