##########LICENCE##########
# Copyright (c) 2015 Genome Research Ltd.
#
# Author: David Wedge <dw9@sanger.ac.uk>
#
# This is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
##########LICENCE##########

# To sample clustering of mutations from a Dirichlet process:
#   GS.data <- subclone.dirichlet.gibbs(mutCount=mutCount, WTCount=WTCount, ...)
#
# To make density plots of the clustering for each pair of samples i & j:
#   Gibbs.subclone.density.est(plot.data[,c(i,j)], GS.data, imageFile, ..., indices=c(i,j))
# where plot.data may be (1) variant allele frequency, (2) mutation copy number
# (multiplicity), or (3) cancer cell fraction (all unrounded doubles), depending on
# whether subclone.dirichlet.gibbs was run with (1) raw counts only, (2) raw counts
# plus total and normal copy numbers, or (3) raw counts plus total and normal copy
# numbers plus copyNumberAdjustment (the number of chromosome copies bearing each mutation).

# Convert observed mutation burden (VAF) to mutation copy number, adjusting for
# tumour cellularity and the local tumour and normal copy number.
mutationBurdenToMutationCopyNumber <- function(burden, totalCopyNumber, cellularity, normalCopyNumber=rep(2, length(burden))) {
    mutCopyNumber = burden / cellularity * (cellularity * totalCopyNumber + normalCopyNumber * (1 - cellularity))
    mutCopyNumber[is.nan(mutCopyNumber)] = 0
    return(mutCopyNumber)
}

# Inverse of the above: convert mutation copy number back to expected mutation
# burden (VAF), clamped away from 0 and 1 so that log-likelihoods stay finite.
mutationCopyNumberToMutationBurden <- function(copyNumber, totalCopyNumber, cellularity, normalCopyNumber=rep(2, length(copyNumber))) {
    burden = copyNumber * cellularity / (cellularity * totalCopyNumber + normalCopyNumber * (1 - cellularity))
    burden[is.nan(burden) | (burden < 0.000001)] = 0.000001
    burden[burden > 0.999999] = 0.999999
    return(burden)
}
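# A minimal, illustrative example of the two conversions above (the values are
# hypothetical, not from any real dataset), wrapped in if(FALSE) so that
# sourcing this file does not execute it. A mutation with VAF 0.25 in a diploid
# region at cellularity 0.8 has mutation copy number
# 0.25/0.8 * (0.8*2 + 2*(1-0.8)) = 0.625, and the conversion round-trips:
if(FALSE){
    example.vaf = 0.25
    example.mcn = mutationBurdenToMutationCopyNumber(example.vaf, totalCopyNumber=2, cellularity=0.8)
    # Converting back recovers the original VAF (up to the clamping tolerance)
    stopifnot(abs(mutationCopyNumberToMutationBurden(example.mcn, totalCopyNumber=2, cellularity=0.8) - example.vaf) < 1e-6)
}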
subclone.dirichlet.gibbs <- function(mutCount, WTCount, totalCopyNumber=array(2,dim(mutCount)),
                                     normalCopyNumber=array(2,dim(mutCount)),
                                     copyNumberAdjustment=array(1,dim(mutCount)), C=30,
                                     cellularity=rep(1,ncol(mutCount)), iter=1000,
                                     conc_param=1, cluster_conc=10) {
    # mutCount is a p-by-q matrix of the number of reads reporting each variant
    # (p = number of mutations, q = number of timepoints / related samples)
    # WTCount is a p-by-q matrix of the number of wild-type reads across the base
    # in question, in the same order as mutCount
    # C is the maximum number of clusters in the Dirichlet process
    # iter is the number of iterations of the Gibbs sampler
    num.muts <- NROW(mutCount)
    num.timepoints <- NCOL(mutCount)

    # Hyperparameters for alpha
    A = 1
    B = conc_param

    # Set up data formats for recording iterations
    pi.h <- array(NA, c(iter, C, num.timepoints))
    mutBurdens <- array(NA, c(C, num.timepoints, num.muts))
    V.h <- matrix(1, nrow=iter, ncol=C)
    S.i <- matrix(NA, nrow=iter, ncol=num.muts)
    Pr.S <- matrix(NA, nrow=num.muts, ncol=C)
    alpha <- rep(NA, iter)
    lower = array(NA, num.timepoints)
    upper = array(NA, num.timepoints)
    mutCopyNum = array(NA, c(num.muts, num.timepoints))
    for(t in 1:num.timepoints){
        mutCopyNum[,t] = mutationBurdenToMutationCopyNumber(mutCount[,t]/(mutCount[,t]+WTCount[,t]), totalCopyNumber[,t], cellularity[t], normalCopyNumber[,t]) / copyNumberAdjustment[,t]
        # Allow cluster positions to range slightly beyond the observed values
        lower[t] = min(mutCopyNum[,t])
        upper[t] = max(mutCopyNum[,t])
        difference = upper[t] - lower[t]
        lower[t] = lower[t] - difference/10
        upper[t] = upper[t] + difference/10
        # Randomise starting positions of clusters
        pi.h[1,,t] = runif(C, lower[t], upper[t])
        for(c in 1:C){
            mutBurdens[c,t,] = mutationCopyNumberToMutationBurden(pi.h[1,c,t] * copyNumberAdjustment[,t], totalCopyNumber[,t], cellularity[t], normalCopyNumber[,t])
        }
    }
    V.h[1,] <- c(rep(0.5, C-1), 1)
    S.i[1,] <- c(1, rep(0, num.muts-1))
    alpha[1] <- 1
    V.h[1:iter, C] <- rep(1, iter)

    print.freq = ceiling(iter/100)
    for (m in 2:iter) {
        if(m %% print.freq == 0){ print(m) }

        # Update cluster allocation for each individual mutation
        for (k in 1:num.muts) {
            # Use log-space to avoid problems with very high counts
            Pr.S[k,1] <- log(V.h[m-1,1])
            for(j in 2:C){
                Pr.S[k,j] <- log(V.h[m-1,j]) + sum(log(1-V.h[m-1,1:(j-1)]))
            }
            for(t in 1:num.timepoints){
                for(c in 1:C){
                    Pr.S[k,c] <- Pr.S[k,c] + mutCount[k,t]*log(mutBurdens[c,t,k]) + WTCount[k,t]*log(1-mutBurdens[c,t,k])
                }
            }
            Pr.S[k, is.na(Pr.S[k,])] = 0
            Pr.S[k,] = Pr.S[k,] - max(Pr.S[k,])
            Pr.S[k,] = exp(Pr.S[k,])
            Pr.S[k,] <- Pr.S[k,] / sum(Pr.S[k,])
        }
        if(sum(is.na(Pr.S)) > 0){
            print(paste("Pr.S=", Pr.S))
        }
        S.i[m,] <- sapply(1:num.muts, function(Pr, k) {sum(rmultinom(1,1,Pr[k,]) * (1:length(Pr[k,])))}, Pr=Pr.S)

        # Update stick-breaking weights
        V.h[m,1:(C-1)] <- sapply(1:(C-1), function(S, curr.m, curr.alpha, h) {rbeta(1, 1+sum(S[curr.m,] == h), curr.alpha+sum(S[curr.m,] > h))}, S=S.i, curr.m=m, curr.alpha=alpha[m-1])
        # Need to prevent one stick from taking all the remaining weight
        V.h[m, c(V.h[m,1:(C-1)] == 1, FALSE)] <- 0.999

        # Get expected number of mutant reads per mutation copy number
        countsPerCopyNum = array(NA, c(num.timepoints, num.muts))
        for(t in 1:num.timepoints){
            countsPerCopyNum[t,] = (mutCount[,t]+WTCount[,t]) * mutationCopyNumberToMutationBurden(copyNumberAdjustment[,t], totalCopyNumber[,t], cellularity[t], normalCopyNumber[,t])
        }

        # 190512: randomise unused pi.h
        for(t in 1:num.timepoints){
            pi.h[m,,t] = runif(C, lower[t], upper[t])
        }
        # Update positions of occupied clusters from a Gamma conditional
        for(c in unique(S.i[m,])){
            for(t in 1:num.timepoints){
                if(sum(countsPerCopyNum[t, S.i[m,]==c]) == 0){
                    pi.h[m,c,t] = 0
                }else{
                    pi.h[m,c,t] = rgamma(1, shape=sum(mutCount[S.i[m,]==c, t]), rate=sum(countsPerCopyNum[t, S.i[m,]==c]))
                }
            }
        }
        for(t in 1:num.timepoints){
            for(c in 1:C){
                mutBurdens[c,t,] = mutationCopyNumberToMutationBurden(pi.h[m,c,t] * copyNumberAdjustment[,t], totalCopyNumber[,t], cellularity[t], normalCopyNumber[,t])
            }
        }
        if(sum(is.na(pi.h[m,,]))){
            print(paste("pi.h=", pi.h[m,,]))
            print(paste("m=", m, sep=""))
        }

        # Update alpha
        alpha[m] <- rgamma(1, shape=C+A-1, rate=B-sum(log(1-V.h[m,1:(C-1)])))
    }
    return(list(S.i=S.i, V.h=V.h, pi.h=pi.h, mutBurdens=mutBurdens, alpha=alpha))
}
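# A minimal sketch of running the sampler on simulated data (all names and
# values below are hypothetical, and the if(FALSE) guard keeps it from running
# when this file is sourced). Two clusters of 50 mutations each at cancer cell
# fractions 1.0 and 0.4, one sample, fully clonal diploid copy number:
if(FALSE){
    set.seed(1)
    true.ccf = rep(c(1.0, 0.4), each=50)
    depth = 100
    # In a diploid, 100%-pure sample the expected VAF is CCF/2
    mutCount = matrix(rbinom(100, depth, true.ccf/2), ncol=1)
    WTCount = depth - mutCount
    GS.data = subclone.dirichlet.gibbs(mutCount=mutCount, WTCount=WTCount, iter=200)
    # Cluster allocations at the final iteration
    table(GS.data$S.i[200,])
}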
Gibbs.subclone.density.est <- function(burden, GS.data, pngFile, density.smooth=0.1,
                                       post.burn.in.start=3000, post.burn.in.stop=10000,
                                       samplenames=c("sample 1","sample 2"), indices=NA) {
    V.h.cols <- GS.data$V.h
    if("pi.h" %in% names(GS.data)){
        if(all(is.na(indices))){
            pi.h.cols <- GS.data$pi.h
        }else{
            pi.h.cols <- GS.data$pi.h[,,indices]
        }
    }else{
        pi.h.cols <- GS.data$theta$mu
    }

    # Convert stick-breaking fractions V.h into mixture weights:
    # wts[,i] = V.h[,i] * prod(1 - V.h[,1:(i-1)])
    wts <- matrix(NA, nrow=dim(V.h.cols)[1], ncol=dim(V.h.cols)[2])
    wts[,1] <- V.h.cols[,1]
    wts[,2] <- V.h.cols[,2] * (1-V.h.cols[,1])
    for (i in 3:dim(wts)[2]) {
        wts[,i] <- apply(1-V.h.cols[,1:(i-1)], MARGIN=1, FUN=prod) * V.h.cols[,i]
    }

    library(KernSmooth)
    gridsize = c(64L, 64L)

    # Exclude extreme values, because they distort the plot
    max.plot.value = 100
    burden = burden[burden[,1]<=max.plot.value & burden[,2]<=max.plot.value,]
    num.timepoints = NCOL(burden)

    # Don't use the initial (random) state of the sampler
    sampledIters = post.burn.in.start:post.burn.in.stop
    sampledIters = sampledIters[sampledIters != 1]

    if(num.timepoints==2){
        range = list(c(floor(min(burden[,1])*10)-1, ceiling(max(burden[,1])*10)+1)/10,
                     c(floor(min(burden[,2])*10)-1, ceiling(max(burden[,2])*10)+1)/10)
        # Choose a grid resolution that scales inversely with the plotted range
        get.gridsize <- function(width){
            if(width > 100){ round(4*width)+1
            }else if(width > 50){ round(10*width)+1
            }else if(width > 20){ round(20*width)+1
            }else if(width > 8){ round(50*width)+1
            }else if(width > 3){ round(100*width)+1
            }else{ round(200*width)+1 }
        }
        gridsize[1] = get.gridsize(range[[1]][2]-range[[1]][1])
        gridsize[2] = get.gridsize(range[[2]][2]-range[[2]][1])
        print(paste("gridsize=", gridsize, sep=""))

        # Cap the number of posterior samples used for density estimation at 1000
        if(length(sampledIters) > 1000){
            sampledIters = floor(post.burn.in.start + (1:1000) * (post.burn.in.stop - post.burn.in.start)/1000)
        }
        post.ints <- array(NA, c(gridsize[1], gridsize[2], length(sampledIters)))
    }else{
        post.ints <- array(NA, c(512, length(sampledIters)))
        range = c(floor(min(burden)*10)-1, ceiling(max(burden)*10)+1)/10
    }

    no.density.points = 10000
    no.clusters = ncol(V.h.cols)
    for(i in 1:length(sampledIters)){
        # For each sampled iteration, draw no.density.points points distributed
        # across clusters in proportion to the cluster weights
        density.data = array(NA, c(0, num.timepoints))
        for(j in 1:no.clusters){
            # Use pi.h from the previous generation
            density.data = rbind(density.data, t(array(pi.h.cols[sampledIters[i]-1,j,], c(num.timepoints, round(no.density.points*wts[sampledIters[i],j])))))
        }
        if(i==1){
            write.csv(density.data, gsub(".png", paste("_densityData",i,".csv",sep=""), pngFile))
        }
        if(num.timepoints==2){
            d = bkde2D(density.data, bandwidth=density.smooth, gridsize=gridsize, range.x=range)
            if(i==1){
                xvals = d$x1
                yvals = d$x2
            }
            post.ints[,,i] = d$fhat
        }else{
            d = bkde(density.data, bandwidth=density.smooth, gridsize=512L, range.x=range)
            post.ints[,i] = d$y
        }
    }

    # Summarise the posterior by the median density at each grid point
    if(num.timepoints==2){
        median.density = apply(post.ints, MARGIN=c(1,2), FUN=median)
    }else{
        median.density = apply(post.ints, MARGIN=1, FUN=median)
    }
    colours = colorRampPalette(c("white","red"))

    # Use lattice for plotting with axes scaled equally
    library(lattice)
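    # The two plotting blocks below render the posterior median density as a
    # lattice levelplot, first without and then with the observed mutations
    # overlaid; the panel.abline calls draw gridlines at whole mutation copy
    # numbers (or cancer cell fractions) on each axis.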
    png(filename=gsub(".png","_withoutMutations.png",pngFile), width=1500, height=1000)
    range = list(c(floor(min(burden[,1])*10)-1, ceiling(max(burden[,1])*10)+1)/10,
                 c(floor(min(burden[,2])*10)-1, ceiling(max(burden[,2])*10)+1)/10)
    image.wid = 500 * (range[[1]][2] - range[[1]][1])
    image.ht = 500 * (range[[2]][2] - range[[2]][1])
    fig = levelplot(median.density, row.values=xvals, column.values=yvals,
                    xlim=range[[1]], ylim=range[[2]],
                    xlab=list(label=samplenames[1],cex=2), ylab=list(label=samplenames[2],cex=2),
                    scales=list(x=list(cex=1.5),y=list(cex=1.5)),
                    col.regions=colours, colorkey=FALSE,
                    panel=function(...){
                        panel.levelplot(...)
                        panel.abline(h = 0:floor(max(burden[,2])))
                        panel.abline(v = 0:floor(max(burden[,1])))
                    })
    print(fig)
    dev.off()

    png(filename=gsub(".png","_withMutations.png",pngFile), width=1500, height=1000)
    fig = levelplot(median.density, row.values=xvals, column.values=yvals,
                    xlim=range[[1]], ylim=range[[2]],
                    xlab=list(label=samplenames[1],cex=2), ylab=list(label=samplenames[2],cex=2),
                    scales=list(x=list(cex=1.5),y=list(cex=1.5)),
                    col.regions=colours, colorkey=FALSE,
                    panel=function(...){
                        panel.levelplot(...)
                        panel.abline(h = 0:floor(max(burden[,2])))
                        panel.abline(v = 0:floor(max(burden[,1])))
                        # 070913: scale point size inversely with the number of mutations
                        if(nrow(burden) >= 500){
                            lpoints(burden, pch=".", cex=1, col="black")
                        }else if(nrow(burden) >= 100){
                            lpoints(burden, pch=".", cex=2, col="black")
                        }else{
                            lpoints(burden, pch=".", cex=4, col="black")
                        }
                    })
    print(fig)
    dev.off()

    write.csv(xvals, gsub(".png","_xvals.csv",pngFile))
    write.csv(yvals, gsub(".png","_yvals.csv",pngFile))
    write.csv(median.density, gsub(".png","_zvals.csv",pngFile))
}
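# A minimal sketch of summarising the sampler output after burn-in (assumes a
# GS.data object returned by subclone.dirichlet.gibbs above; the burn-in cutoff
# and all names are illustrative, and the if(FALSE) guard keeps it inert on source):
if(FALSE){
    burn.in = 500
    # Modal cluster assignment of each mutation across post-burn-in iterations
    assignments = apply(GS.data$S.i[-(1:burn.in),], MARGIN=2,
                        FUN=function(x) as.numeric(names(which.max(table(x)))))
    # Posterior mean position (e.g. mutation copy number) of each occupied cluster
    for(c in unique(assignments)){
        print(paste("cluster", c, "mean position:", mean(GS.data$pi.h[-(1:burn.in), c, ])))
    }
}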