#### Author - Vikas Bansal
#### Email - vikas.bansal@charite.de
#### Created - October 2013
#### R 2.15.1

library("outliers")
library("HMM")

##---------------------------------------------------------------------------------------------------------------------------------
## modified code for Dixon's Q test from "outliers" package, which returns sample names, p-values and outlier type (gain, loss or normal) 
##


my.dixon.test <-  function (x, type = 0, opposite = FALSE, two.sided = TRUE) {
	DNAME <- deparse(substitute(x))
	x <- sort(x[complete.cases(x)])
	n <- length(x)
	if ((type == 10 || type == 0) & (n < 3 || n > 30))
		stop("Sample size must be in range 3-30 for type10")
	if (type == 20 & (n < 4 || n > 30))
		stop("Sample size must be in range 4-30 for type20")
	if (xor(((x[n] - mean(x)) < (mean(x) - x[1])), opposite)) {
		alt = paste("lowest value", x[1], "is an outlier")
		number="Loss"
		if (type == 10) {
		Q = (x[2] - x[1])/(x[n] - x[1])
		out.patient=names(x[1])
		}
		else {
		Q = (x[3] - x[1])/(x[n] - x[1])
		out.patient=paste(names(x[1]),names(x[2]),sep=";")
		}
	}
	else {
		alt = paste("highest value", x[n], "is an outlier")
		number="Gain"
		if (type == 10) {
		Q = (x[n] - x[n - 1])/(x[n] - x[1])
		out.patient=names(x[n])
		}
		else {
		Q = (x[n] - x[n - 2])/(x[n] - x[1])
		out.patient=paste(names(x[n]),names(x[n-1]),sep=";")
		}
	}
	pval <- pdixon(Q, n, type)
	if (two.sided) {
		pval <- 2 * pval
		if (pval > 1)
		pval <- 2 - pval
	}
	RVAL <- list(statistic = c(Q = Q), alternative = alt, p.value = pval,
		method = "Dixon test for outliers", data.name = DNAME , x=out.patient, num=number)
	class(RVAL) <- "htest"
	return(RVAL)
}

##--------------------------------------------------------------------------------------------------------------------------------------------------
## main function - calling CNVs
## input data frame contains first 4 columns - CHROM, START, END, GC% and 5th, 6th, 7th, ... 34th column contains copy number value for each sample 
## above input data frame can be created from the output of mrCaNaVar "out_prefix.copynumber.bed" output file (first step of the method)
##

exomeCNA <- function(df.var, type = 0, w.size = 100, p.cutoff = 0.01, two.sided = FALSE, conti.win = 5 ) {
	col <- ncol(df.var)
	if (type == 0) {
		if (col < 12 & col >6) {
			type <- 10
		}
		else if (col < 35 & col >11){
			type <- 20
		}
		else {
			stop("Sample size must be in range 3-30")
		}
	}
	else if (type != 10 && type != 20) {
		stop("Type should be 10 or 20")
	}	
	
	## read in the data frame
	all <- df.var
	colnames(all)[1:3] <- c("CHROM","START","END")
	end.all <- df.var[apply(df.var[,5:col],1,function(v)sum(v!=0,na.rm=TRUE)>=((col-4)/2)),]
	colnames(end.all)[1:3] <- c("CHROM","START","END")
	not.same <- apply(end.all[,5:col],1,function(i) length (unique(i)) > 1 )
	end.all <- end.all[not.same,]
	one <- col+1   
	two <- col+2
	three <- col+3
	
	## apply type20 Dixon test if type is equal to 20 (second step of the method)
	if (type == 20) {
		for  (chak in c(10,20)) {  
			ko <- apply(end.all[,5:col],1, function(test){
				to <- my.dixon.test(test, type=chak ,two.sided= two.sided) 
			})
			end.all[,one] <- sapply(ko,function(la){la$p.value})
			end.all[,two] <- sapply(ko,function(la){la$x})
			end.all[,three] <- sapply(ko,function(la){la$num})
			colnames(end.all)[one:three] <- c(paste("p.value,type",chak,sep=""), paste("patients.type",chak,sep=""), paste("copynum.type",chak,sep=""))
			one <- one+3  
			two <- two+3
			three <- three+3
		}
		
		## return the outlying windows which has p-value less than p.cutoff 
		filtered <- (end.all[which(end.all[,col+1] <= p.cutoff | end.all[,col+4] <= p.cutoff ),])		
		if(length(filtered)==0 || nrow(filtered) == 0 ){
			stop("No significant regions found")
		}
		else{
			filtered[which(filtered[,col+1] <= p.cutoff),ncol(filtered)+1] <- "type10" 
			filtered[is.na(filtered)]<- "type20"
			colnames(filtered)[ncol(filtered)] <- "No. of patients"
			filtered <- (filtered[which(filtered[,3]-filtered[,2] == w.size),])
			if (length(unique(filtered[,col+3]) ) > 1){
				filtergain <- (filtered[which(filtered[,col+3]== "Gain"),])  
				filterloss <- (filtered[which(filtered[,col+3]!= "Gain"),])
				Patient1 <- vector()
				Patient2 <- vector()
				for (chak in 1:nrow(filtergain)){
					if(filtergain[chak,ncol(filtergain)] == "type10"){  
						Patient1[chak] <- filtergain[chak,col+2]
						Patient2[chak] <- "NA"
					} 
					else if (filtergain[chak,ncol(filtergain)] == "type20"){
						test <- unlist(strsplit(filtergain[chak,col+5],";"))  
						Patient1[chak] <- test[1]
						Patient2[chak] <- test[2]
					}
				} 
				gain <- filtergain[,c(1,2,3)]   
				gain[,4:5] <- c(Patient1,Patient2)
				colnames(gain)[4:5] <- c("Patient1","Patient2")
				Patient1 <- vector()
				Patient2 <- vector()
				for (chak in 1:nrow(filterloss)){
					if(filterloss[chak,ncol(filterloss)] == "type10"){  
						Patient1[chak] <- filterloss[chak,col+2]
						Patient2[chak] <- "NA"
					} 
					else if (filterloss[chak,ncol(filterloss)] == "type20"){
						test <- unlist(strsplit(filterloss[chak,col+5],";"))  
						Patient1[chak] <- test[1]
						Patient2[chak] <- test[2]
					}
				}  
				loss <- filterloss[,c(1,2,3)]
				loss[,4:5] <- c(Patient1,Patient2)
				colnames(loss)[4:5] <- c("Patient1","Patient2")
			}
			else if(unique(filtered[,col+3])[1] == "Gain") {
				filtergain <- (filtered[which(filtered[,col+3]== "Gain"),])
				Patient1 <- vector()
				Patient2 <- vector()
				for (chak in 1:nrow(filtergain)){
					if(filtergain[chak,ncol(filtergain)] == "type10"){  
						Patient1[chak] <- filtergain[chak,col+2]
						Patient2[chak] <- "NA"
					} 
					else if (filtergain[chak,ncol(filtergain)] == "type20"){
						test <- unlist(strsplit(filtergain[chak,col+5],";"))  
						Patient1[chak] <- test[1]
						Patient2[chak] <- test[2]
					}
				} 
				gain <- filtergain[,c(1,2,3)]   
				gain[,4:5] <- c(Patient1,Patient2)
				colnames(gain)[4:5] <- c("Patient1","Patient2")
			}
			else {
				filterloss <- (filtered[which(filtered[,col+3]!= "Gain"),])
				Patient1 <- vector()
				Patient2 <- vector()
				for (chak in 1:nrow(filterloss)){
					if(filterloss[chak,ncol(filterloss)] == "type10"){  
						Patient1[chak] <- filterloss[chak,col+2]
						Patient2[chak] <- "NA"
					} 
					else if (filterloss[chak,ncol(filterloss)] == "type20"){
						test <- unlist(strsplit(filterloss[chak,col+5],";"))  
						Patient1[chak] <- test[1]
						Patient2[chak] <- test[2]
					}
				}  
				loss <- filterloss[,c(1,2,3)]
				loss[,4:5] <- c(Patient1,Patient2)
				colnames(loss)[4:5] <- c("Patient1","Patient2")
			}
		}
	}
	
	## apply type10 Dixon test if type is equal to 10 (second step of the method)
	else{  
		chak=10
		ko <- apply(end.all[,5:col],1, function(test){
			to <- my.dixon.test(test, type=chak ,two.sided= two.sided) 
		})
		end.all[,one] <- sapply(ko,function(la){la$p.value})
		end.all[,two] <- sapply(ko,function(la){la$x})
		end.all[,three] <- sapply(ko,function(la){la$num})
		colnames(end.all)[one:three] <- c(paste("p.value,type",chak,sep=""), paste("patients.type",chak,sep=""), paste("copynum.type",chak,sep=""))
		filtered <- (end.all[which(end.all[,col+1] <= p.cutoff  ),])		
		if(length(filtered)==0 || nrow(filtered) == 0 ){
			stop("No significant regions found")
		}
		else{
			filtered[,ncol(filtered)+1] <- "type10" 
			colnames(filtered)[ncol(filtered)] <- "No. of patients"
			filtered <- (filtered[which(filtered[,3]-filtered[,2] == w.size),])
			if (length(unique(filtered[,col+3]) ) > 1){
				filtergain <- (filtered[which(filtered[,col+3]== "Gain"),])  
				filterloss <- (filtered[which(filtered[,col+3]!= "Gain"),])
				gain <- filtergain[,c(1,2,3, col+2, 4)]
				loss <- filterloss[,c(1,2,3, col+2, 4)]
				colnames(gain)[4:5] <- c("Patient1","Patient2")
				colnames(loss)[4:5] <- c("Patient1","Patient2")
			}
			else if(unique(filtered[,col+3])[1] == "Gain") {
				filtergain <- (filtered[which(filtered[,col+3]== "Gain"),])
				gain <- filtergain[,c(1,2,3, col+2, 4)]
				colnames(gain)[4:5] <- c("Patient1","Patient2")
			}
			else {
				filterloss <- (filtered[which(filtered[,col+3]!= "Gain"),])
				loss <- filterloss[,c(1,2,3, col+2, 4)]
				colnames(loss)[4:5] <- c("Patient1","Patient2")
			}
		}
	}
	
	## apply HMM for each sample separately (third step of the method)
	pat.id <- colnames(end.all)[5:col]
	for(file in pat.id){
		if(exists("gain")){
			gain.sff <- gain[which(gain[,4] == file | gain[,5] == file ),1:5]
		}
		else {
			gain.sff <- data.frame(a=character(0))
		}
		if(exists("loss")){
			loss.sff <- loss[which(loss[,4] == file | loss[,5] == file  ),1:5]
		}
		else {
			loss.sff <- data.frame(a=character(0))
		}
		all.win <- all[,1:4] 
		if(length(gain.sff)==0 || nrow(gain.sff) == 0 ){ 
			 if(length(loss.sff)==0 || nrow(loss.sff) == 0 ){
			 	next
			 }
			 else{
				loss.sff[,6] <- "loss"
				lossgain78 <- loss.sff
			 }
		}
		else if (length(loss.sff)==0 || nrow(loss.sff) == 0 ) {
			gain.sff[,6] <- "gain"
			lossgain78 <- gain.sff
		}
		else {
			loss.sff[,6] <- "loss"
			gain.sff[,6] <- "gain"
			lossgain78 <- (rbind(gain.sff,loss.sff))  
		}
		merge78 <- (merge(all.win,lossgain78,by = c("CHROM","START","END"),all.x=TRUE)) 
		merge78[is.na(merge78)] <- "normal"  
		forhmm78 <- merge78[,c(1:3,7)]
		forhmm78[,5] <- "wait"
		colnames(forhmm78)[5] <- "After.HMM"
		
		## initial transition and emission probabilities
		hmm  <- initHMM(c("gain","loss","normal"), c("gain","loss","normal"), transProbs=matrix(c(.6,.2,.2,.2,.6,.2,.2,.2,.6),3),emissionProbs=matrix(c(.6,.2,.2,.2,.6,.2,.2,.2,.6),3))
		
		## recomputing transition and emission probabilities using the Baum-Welch algorithm
		## finding most likely sequence of the hidden states by the Viterbi algorithm
		for(jo in unique(forhmm78[,1])){
			cat("\r", paste(jo,"-",file) , "\n")
			observations <- forhmm78[forhmm78[,1] == jo ,4]
			bw <- baumWelch(hmm,observations,10)
			viterbi <- viterbi(bw$hmm,observations)
			forhmm78[forhmm78[,1]==jo,5] <- viterbi
			colnames(forhmm78)[4] <- "Before.HMM"
		}
		
		## calling CNV if 5 continuous windows (default conti.win = 5) are present with same copy number type
		forhmm78=forhmm78[,-4]
		forhmm78$conseq <-cumsum(c(1, forhmm78$After.HMM[-1] != forhmm78$After.HMM[-length(forhmm78$After.HMM)] ) )
		final <- do.call( rbind, 
		by(forhmm78, list(forhmm78$CHROM, forhmm78$conseq), 
			function(df)
			if( NROW(df) >= conti.win & df$After.HMM[1] %in% c("gain", "loss") ) {
				cbind(df[1, c("CHROM", "START")] , df[NROW(df), c("END", "After.HMM")] ) 
				} else{NULL} ) )

		## output CNVs for each sample if present
		if (length(final)==0 || nrow(final) == 0) {
			next
		}
		else {
			colnames(final)[4] <- "TYPE"
			write.table(final, file=paste("hmm.",file,sep=""), sep="\t", quote=FALSE, row.names=FALSE)
		}
	}
}