#### Author - Vikas Bansal #### Email - vikas.bansal@charite.de #### Created - October 2013 #### R 2.15.1 library("outliers") library("HMM") ##--------------------------------------------------------------------------------------------------------------------------------- ## modified code for Dixon's Q test from "outliers" package, which returns sample names, p-values and outlier type (gain, loss or normal) ## my.dixon.test <- function (x, type = 0, opposite = FALSE, two.sided = TRUE) { DNAME <- deparse(substitute(x)) x <- sort(x[complete.cases(x)]) n <- length(x) if ((type == 10 || type == 0) & (n < 3 || n > 30)) stop("Sample size must be in range 3-30 for type10") if (type == 20 & (n < 4 || n > 30)) stop("Sample size must be in range 4-30 for type20") if (xor(((x[n] - mean(x)) < (mean(x) - x[1])), opposite)) { alt = paste("lowest value", x[1], "is an outlier") number="Loss" if (type == 10) { Q = (x[2] - x[1])/(x[n] - x[1]) out.patient=names(x[1]) } else { Q = (x[3] - x[1])/(x[n] - x[1]) out.patient=paste(names(x[1]),names(x[2]),sep=";") } } else { alt = paste("highest value", x[n], "is an outlier") number="Gain" if (type == 10) { Q = (x[n] - x[n - 1])/(x[n] - x[1]) out.patient=names(x[n]) } else { Q = (x[n] - x[n - 2])/(x[n] - x[1]) out.patient=paste(names(x[n]),names(x[n-1]),sep=";") } } pval <- pdixon(Q, n, type) if (two.sided) { pval <- 2 * pval if (pval > 1) pval <- 2 - pval } RVAL <- list(statistic = c(Q = Q), alternative = alt, p.value = pval, method = "Dixon test for outliers", data.name = DNAME , x=out.patient, num=number) class(RVAL) <- "htest" return(RVAL) } ##-------------------------------------------------------------------------------------------------------------------------------------------------- ## main function - calling CNVs ## input data frame contains first 4 columns - CHROM, START, END, GC% and 5th, 6th, 7th, ... 34th column contains copy number value for each sample ## above input data frame can be created from the output of mrCaNaVar "out_prefix.copynumber.bed" output file (first step of the method) ## exomeCNA <- function(df.var, type = 0, w.size = 100, p.cutoff = 0.01, two.sided = FALSE, conti.win = 5 ) { col <- ncol(df.var) if (type == 0) { if (col < 12 & col >6) { type <- 10 } else if (col < 35 & col >11){ type <- 20 } else { stop("Sample size must be in range 3-30") } } else if (type != 10 && type != 20) { stop("Type should be 10 or 20") } ## read in the data frame all <- df.var colnames(all)[1:3] <- c("CHROM","START","END") end.all <- df.var[apply(df.var[,5:col],1,function(v)sum(v!=0,na.rm=TRUE)>=((col-4)/2)),] colnames(end.all)[1:3] <- c("CHROM","START","END") not.same <- apply(end.all[,5:col],1,function(i) length (unique(i)) > 1 ) end.all <- end.all[not.same,] one <- col+1 two <- col+2 three <- col+3 ## apply type20 Dixon test if type is equal to 20 (second step of the method) if (type == 20) { for (chak in c(10,20)) { ko <- apply(end.all[,5:col],1, function(test){ to <- my.dixon.test(test, type=chak ,two.sided= two.sided) }) end.all[,one] <- sapply(ko,function(la){la$p.value}) end.all[,two] <- sapply(ko,function(la){la$x}) end.all[,three] <- sapply(ko,function(la){la$num}) colnames(end.all)[one:three] <- c(paste("p.value,type",chak,sep=""), paste("patients.type",chak,sep=""), paste("copynum.type",chak,sep="")) one <- one+3 two <- two+3 three <- three+3 } ## return the outlying windows which has p-value less than p.cutoff filtered <- (end.all[which(end.all[,col+1] <= p.cutoff | end.all[,col+4] <= p.cutoff ),]) if(length(filtered)==0 || nrow(filtered) == 0 ){ stop("No significant regions found") } else{ filtered[which(filtered[,col+1] <= p.cutoff),ncol(filtered)+1] <- "type10" filtered[is.na(filtered)]<- "type20" colnames(filtered)[ncol(filtered)] <- "No. of patients" filtered <- (filtered[which(filtered[,3]-filtered[,2] == w.size),]) if (length(unique(filtered[,col+3]) ) > 1){ filtergain <- (filtered[which(filtered[,col+3]== "Gain"),]) filterloss <- (filtered[which(filtered[,col+3]!= "Gain"),]) Patient1 <- vector() Patient2 <- vector() for (chak in 1:nrow(filtergain)){ if(filtergain[chak,ncol(filtergain)] == "type10"){ Patient1[chak] <- filtergain[chak,col+2] Patient2[chak] <- "NA" } else if (filtergain[chak,ncol(filtergain)] == "type20"){ test <- unlist(strsplit(filtergain[chak,col+5],";")) Patient1[chak] <- test[1] Patient2[chak] <- test[2] } } gain <- filtergain[,c(1,2,3)] gain[,4:5] <- c(Patient1,Patient2) colnames(gain)[4:5] <- c("Patient1","Patient2") Patient1 <- vector() Patient2 <- vector() for (chak in 1:nrow(filterloss)){ if(filterloss[chak,ncol(filterloss)] == "type10"){ Patient1[chak] <- filterloss[chak,col+2] Patient2[chak] <- "NA" } else if (filterloss[chak,ncol(filterloss)] == "type20"){ test <- unlist(strsplit(filterloss[chak,col+5],";")) Patient1[chak] <- test[1] Patient2[chak] <- test[2] } } loss <- filterloss[,c(1,2,3)] loss[,4:5] <- c(Patient1,Patient2) colnames(loss)[4:5] <- c("Patient1","Patient2") } else if(unique(filtered[,col+3])[1] == "Gain") { filtergain <- (filtered[which(filtered[,col+3]== "Gain"),]) Patient1 <- vector() Patient2 <- vector() for (chak in 1:nrow(filtergain)){ if(filtergain[chak,ncol(filtergain)] == "type10"){ Patient1[chak] <- filtergain[chak,col+2] Patient2[chak] <- "NA" } else if (filtergain[chak,ncol(filtergain)] == "type20"){ test <- unlist(strsplit(filtergain[chak,col+5],";")) Patient1[chak] <- test[1] Patient2[chak] <- test[2] } } gain <- filtergain[,c(1,2,3)] gain[,4:5] <- c(Patient1,Patient2) colnames(gain)[4:5] <- c("Patient1","Patient2") } else { filterloss <- (filtered[which(filtered[,col+3]!= "Gain"),]) Patient1 <- vector() Patient2 <- vector() for (chak in 1:nrow(filterloss)){ if(filterloss[chak,ncol(filterloss)] == "type10"){ Patient1[chak] <- filterloss[chak,col+2] Patient2[chak] <- "NA" } else if (filterloss[chak,ncol(filterloss)] == "type20"){ test <- unlist(strsplit(filterloss[chak,col+5],";")) Patient1[chak] <- test[1] Patient2[chak] <- test[2] } } loss <- filterloss[,c(1,2,3)] loss[,4:5] <- c(Patient1,Patient2) colnames(loss)[4:5] <- c("Patient1","Patient2") } } } ## apply type10 Dixon test if type is equal to 10 (second step of the method) else{ chak=10 ko <- apply(end.all[,5:col],1, function(test){ to <- my.dixon.test(test, type=chak ,two.sided= two.sided) }) end.all[,one] <- sapply(ko,function(la){la$p.value}) end.all[,two] <- sapply(ko,function(la){la$x}) end.all[,three] <- sapply(ko,function(la){la$num}) colnames(end.all)[one:three] <- c(paste("p.value,type",chak,sep=""), paste("patients.type",chak,sep=""), paste("copynum.type",chak,sep="")) filtered <- (end.all[which(end.all[,col+1] <= p.cutoff ),]) if(length(filtered)==0 || nrow(filtered) == 0 ){ stop("No significant regions found") } else{ filtered[,ncol(filtered)+1] <- "type10" colnames(filtered)[ncol(filtered)] <- "No. of patients" filtered <- (filtered[which(filtered[,3]-filtered[,2] == w.size),]) if (length(unique(filtered[,col+3]) ) > 1){ filtergain <- (filtered[which(filtered[,col+3]== "Gain"),]) filterloss <- (filtered[which(filtered[,col+3]!= "Gain"),]) gain <- filtergain[,c(1,2,3, col+2, 4)] loss <- filterloss[,c(1,2,3, col+2, 4)] colnames(gain)[4:5] <- c("Patient1","Patient2") colnames(loss)[4:5] <- c("Patient1","Patient2") } else if(unique(filtered[,col+3])[1] == "Gain") { filtergain <- (filtered[which(filtered[,col+3]== "Gain"),]) gain <- filtergain[,c(1,2,3, col+2, 4)] colnames(gain)[4:5] <- c("Patient1","Patient2") } else { filterloss <- (filtered[which(filtered[,col+3]!= "Gain"),]) loss <- filterloss[,c(1,2,3, col+2, 4)] colnames(loss)[4:5] <- c("Patient1","Patient2") } } } ## apply HMM for each sample separately (third step of the method) pat.id <- colnames(end.all)[5:col] for(file in pat.id){ if(exists("gain")){ gain.sff <- gain[which(gain[,4] == file | gain[,5] == file ),1:5] } else { gain.sff <- data.frame(a=character(0)) } if(exists("loss")){ loss.sff <- loss[which(loss[,4] == file | loss[,5] == file ),1:5] } else { loss.sff <- data.frame(a=character(0)) } all.win <- all[,1:4] if(length(gain.sff)==0 || nrow(gain.sff) == 0 ){ if(length(loss.sff)==0 || nrow(loss.sff) == 0 ){ next } else{ loss.sff[,6] <- "loss" lossgain78 <- loss.sff } } else if (length(loss.sff)==0 || nrow(loss.sff) == 0 ) { gain.sff[,6] <- "gain" lossgain78 <- gain.sff } else { loss.sff[,6] <- "loss" gain.sff[,6] <- "gain" lossgain78 <- (rbind(gain.sff,loss.sff)) } merge78 <- (merge(all.win,lossgain78,by = c("CHROM","START","END"),all.x=TRUE)) merge78[is.na(merge78)] <- "normal" forhmm78 <- merge78[,c(1:3,7)] forhmm78[,5] <- "wait" colnames(forhmm78)[5] <- "After.HMM" ## initial transition and emission probabilities hmm <- initHMM(c("gain","loss","normal"), c("gain","loss","normal"), transProbs=matrix(c(.6,.2,.2,.2,.6,.2,.2,.2,.6),3),emissionProbs=matrix(c(.6,.2,.2,.2,.6,.2,.2,.2,.6),3)) ## recomputing transition and emission probabilities using the Baum-Welch algorithm ## finding most likely sequence of the hidden states by the Viterbi algorithm for(jo in unique(forhmm78[,1])){ cat("\r", paste(jo,"-",file) , "\n") observations <- forhmm78[forhmm78[,1] == jo ,4] bw <- baumWelch(hmm,observations,10) viterbi <- viterbi(bw$hmm,observations) forhmm78[forhmm78[,1]==jo,5] <- viterbi colnames(forhmm78)[4] <- "Before.HMM" } ## calling CNV if 5 continuous windows (default conti.win = 5) are present with same copy number type forhmm78=forhmm78[,-4] forhmm78$conseq <-cumsum(c(1, forhmm78$After.HMM[-1] != forhmm78$After.HMM[-length(forhmm78$After.HMM)] ) ) final <- do.call( rbind, by(forhmm78, list(forhmm78$CHROM, forhmm78$conseq), function(df) if( NROW(df) >= conti.win & df$After.HMM[1] %in% c("gain", "loss") ) { cbind(df[1, c("CHROM", "START")] , df[NROW(df), c("END", "After.HMM")] ) } else{NULL} ) ) ## output CNVs for each sample if present if (length(final)==0 || nrow(final) == 0) { next } else { colnames(final)[4] <- "TYPE" write.table(final, file=paste("hmm.",file,sep=""), sep="\t", quote=FALSE, row.names=FALSE) } } }