###First run utility functions below.
####################################
#Reads .fastq files. Returns a list of vectorized sequences as in fasta import.
read.fastq<-function(file){
infile<-readLines(con=file)
z<-length(infile)
if(z%%4!=0)stop('Line nr. is not a multiple of 4')
fqnames<-infile[seq(1,length(infile),by=4)]
if(length(fqnames)!=(length(infile)/4)){
stop('Nr. sequence names does not match nr. lines in file')
}
fqseqs<-infile[seq(2,length(infile),by=4)]
if(length(fqseqs)!=(length(infile)/4)){
stop('Nr. sequences does not match nr. lines in file')
}
fqnames<-substring(fqnames,2)
out<-lapply(fqseqs,function(x)paste(x))
out<-sapply(out,function(x)strsplit(x,split=''))
names(out)<-fqnames
return(out)
}
########################################
#The following function finds oligos in a sequence. The query should be of
#the form 'gaagg', i.e. a continuous string. The sequence to be searched 
#should have the form of a vector. The routine is not case sensitive.
#It returns the positions of the of the sites matching the query sequence.
#The output is in matrix form. If the query matches more than one segment
#positions of all matches are returned. 
find.seq<-function(query,seq){
query<-toupper(query);seq<-toupper(seq)
seq<-c(seq,'-')
x<-unlist(strsplit(query,split=''))
sites<-as.matrix(grep(x[1],seq))
for(i in 2:length(x)){
nsit<-(seq[sites[,(i-1)]+1]==x[i])*sites[,(i-1)]
#if(sum(nsit)==0)stop('No match for query')
if(sum(nsit)==0)print('No match for query')
if(sum(nsit)==0)return(0)
sites<-matrix(sites[nsit!=0,],ncol=i-1)
nsit<-nsit[nsit!=0]
sites<-cbind(sites,nsit+1)
}
#if(nrow(sites)>1)stop('Multiple matches to query')
if(nrow(sites)>1)print('Multiple matches to query')
if(nrow(sites)>1)return(0)
return(sites)
}
#####################################################
#Function for looping through sequences in a list (e.g. imported fasta file).
#The arguments are the sequences flanking the microsats (flank1 and flank2),
#and the name of the list containing the sequence data.
#Returns a new list containing the microsat sequences.
extr.mi.sat<-function(flank1,flank2,algn){
out<-list()
for(i in 1:length(algn)){
sequence<-algn[[i]]
start<-max(find.seq(flank1,sequence))
if(start==0)next
stop<-min(find.seq(flank2,sequence))
if(stop==0)next
mi.sat<-sequence[(start+1):(stop-1)]
out<-c(out,list(mi.sat))
}
return(out)
}
##############################################
#Function for reverse complementing a sequence. The string option lets you
#decide if you want the function to return a text string rather than a character
#vector (default).
rev.comp<-function(seq,string=F){
seq<-toupper(seq)
seq<-unlist(strsplit(seq,split=''))
comp<-rep('N',length(seq))
comp[seq=='A']<-'T'
comp[seq=='C']<-'G'
comp[seq=='G']<-'C'
comp[seq=='T']<-'A'
rc<-rev(comp)
if(string==T)rc<-paste(rc,collapse='')
return(rc)
}
#######################################
#Function for demultiplexing sequence reads. 'barcodes' should be a two 
#column table with the first column containing the sample name and the 
#second columng the barcode sequences. 'seqs' is the list containing
#the sequences to be multiplexed. 'mm' is the maximum number of mismatches
#allowed.
#Returns a list of length nr. barcodes with each element containing 
#another list with the demultiplexed sequences.
demultiplex<-function(barcodes,seqs,mm=2){
bc<-as.character(barcodes[,2])
names_bc<-as.character(barcodes[,1])
bc_size<-nchar(bc)
start_f<-lapply(seqs,function(x)x[1:bc_size[1]])
start_f<-lapply(start_f,toupper)
start_rc<-lapply(seqs,function(x)x[(length(x)-(bc_size[1]-1)):length(x)])
start_rc<-lapply(start_rc,toupper)
out<-list()
for(i in 1:nrow(barcodes)){
bc_seq<-unlist(strsplit(bc[i],split=''))
bc_match_f<-unlist(lapply(start_f,function(x)sum(x==bc_seq)))
bc_match_rc<-unlist(lapply(start_rc,function(x)sum(x==rev.comp(bc_seq))))
fwd<-all_seqs[which(bc_match_f>=bc_size[1]-mm)]
revc<-all_seqs[which(bc_match_rc>=bc_size[1]-mm)]
rev_f<-lapply(revc,rev.comp)
merged<-c(fwd,rev_f)
out[[i]]<-merged
}
names(out)<-names_bc
return(out)
}
############################################
fix.var<-function(x,fix)abs(fix-x)
##########################################
shannon<-function(x){
pos<-x[x>0]
pos<-table(pos)/length(pos)
temp<-sum(pos*log(pos))
si<--temp
return(si)
}
##############################################
###one nucleotide slippage, fwd vs. bwd
fwd.slip<-function(x,n,prop=T){
tot<-length(x)
fwd<-sum(x==(n+1))
bwd<-sum(x==(n-1))
if(prop==T) fb<-c((fwd/tot)*100,(bwd/tot)*100) else fb<-c(fwd,bwd)
return(fb)
}
###############################################
###percent correct allele size
pct.corr<-function(x,n){
tot<-length(x)
target<-sum(x==n)
prc<-(target/tot)*100
return(prc)
}
#############################################
###compare backward vs. forward slippage
fbc<-function(x,target,lms){
fwd<-x[x>target]
bwd<-x[x<target]
x1<-fwd[which(fwd%%lms==0)]
x2<-fwd[which(fwd%%lms!=0)]
y1<-bwd[which(bwd%%lms==0)]
y2<-bwd[which(bwd%%lms!=0)]
o1<-c(length(fwd),length(bwd))
o2<-c(length(x1),length(y1))
o3<-c(length(x2),length(y2))
out<-rbind(o1,o2,o3)
out<-out/length(x)
out<-out*100
return(out)
}
###########################################
###look at densities
ret.perc<-function(x1,x2=NULL,x3=NULL,x4=NULL,lms){
x<-c(x1,x2,x3,x4)
temp<-table(x)
x1<-as.numeric(names(temp))
x2<-as.vector(temp)
index<-which(x1%%lms==0)
x1<-x1[index];x2<-x2[index]
#x1<-round(x1,0)
index2<-which(x2>1)
x1<-x1[index2];x2<-x2[index2]
x2<-x2/sum(x2)
out<-rbind(x1,x2)
return(out)
}
#############################################
###Function is same as extr.mi.sat except it includes the flanking search
###sequence in return sequence
extr.spacer<-function(flank1,flank2,algn){
out<-list()
for(i in 1:length(algn)){
sequence<-algn[[i]]
start<-min(find.seq(flank1,sequence))
if(start==0)next
stop<-max(find.seq(flank2,sequence))
if(stop==0)next
spacer<-sequence[(start):(stop)]
out<-c(out,list(spacer))
}
return(out)
}
############################################
###compare backward vs. forward slippage
indel.rate<-function(x,target){
n<-length(x)
single_ins<-sum(x==(target+1))
single_ins<-(single_ins/n)*100
single_del<-sum(x==(target-1))
single_del<-(single_del/n)*100
fwd<-sum(x>target)
bwd<-sum(x<target)
offtarget<-fwd+bwd
f_prop<-(fwd/n)*100
b_prop<-(bwd/n)*100
tot_prop<-(offtarget/n)*100
f_pb<-(fwd/n)/target
b_pb<-(bwd/n)/target
tot_pb<-(offtarget/n)/target
out<-c(f_prop,b_prop,tot_prop,f_pb,b_pb,tot_pb,single_ins,single_del)
return(out)
}
#############################################
###find good tick marks for figures 2 and 3
find.tix<-function(x,int=4,target){
v1<-as.numeric(names(x))
index<-range(v1[(v1%%2)==0])
if(((target-index[1])%%int)==0) start<-index[1] else start<-index[1]+2
if(((target-index[2])%%int)==0) stop<-index[2] else stop<-index[2]-2
tix<-seq(start,stop,by=int)
return(tix)
}