##########LICENCE##########
# Copyright (c) 2015 Genome Research Ltd.
#
# Author: David Wedge <dw9@sanger.ac.uk>
#
# This is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
##########LICENCE##########

# To sample clustering of mutations from a Dirichlet process:
#   GS.data <- subclone.dirichlet.gibbs(mutCount=mutCount, WTCount=WTCount, ...)
#
# To make density plots of the clustering for each pair of samples i & j:
#   Gibbs.subclone.density.est(plot.data[,c(i,j)], GS.data, imageFile, ..., indices=c(i,j))
# where plot.data may be (1) variant allele frequency, (2) mutation copy number
# (multiplicity), or (3) cancer cell fraction (all unrounded doubles), depending on
# whether subclone.dirichlet.gibbs was run with (1) raw counts only, (2) raw counts
# plus total and normal copy numbers, or (3) raw counts plus total and normal copy
# numbers plus copyNumberAdjustment (the number of chromosome copies bearing each mutation).

# Convert observed mutation burden (VAF) to mutation copy number, adjusting for
# tumour cellularity and the local tumour and normal copy number.
mutationBurdenToMutationCopyNumber <- function(burden, totalCopyNumber, cellularity, normalCopyNumber=rep(2, length(burden))) {
    mutCopyNumber = burden / cellularity * (cellularity * totalCopyNumber + normalCopyNumber * (1 - cellularity))
    mutCopyNumber[is.nan(mutCopyNumber)] = 0
    return(mutCopyNumber)
}

# Inverse of the above: convert mutation copy number back to expected mutation
# burden (VAF), clamped away from 0 and 1 so that log-likelihoods stay finite.
mutationCopyNumberToMutationBurden <- function(copyNumber, totalCopyNumber, cellularity, normalCopyNumber=rep(2, length(copyNumber))) {
    burden = copyNumber * cellularity / (cellularity * totalCopyNumber + normalCopyNumber * (1 - cellularity))
    burden[is.nan(burden) | (burden < 0.000001)] = 0.000001
    burden[burden > 0.999999] = 0.999999
    return(burden)
}
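# A minimal, illustrative example of the two conversions above (the values are
# hypothetical, not from any real dataset), wrapped in if(FALSE) so that
# sourcing this file does not execute it. A mutation with VAF 0.25 in a diploid
# region at cellularity 0.8 has mutation copy number
# 0.25/0.8 * (0.8*2 + 2*(1-0.8)) = 0.625, and the conversion round-trips:
if(FALSE){
    example.vaf = 0.25
    example.mcn = mutationBurdenToMutationCopyNumber(example.vaf, totalCopyNumber=2, cellularity=0.8)
    # Converting back recovers the original VAF (up to the clamping tolerance)
    stopifnot(abs(mutationCopyNumberToMutationBurden(example.mcn, totalCopyNumber=2, cellularity=0.8) - example.vaf) < 1e-6)
}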
subclone.dirichlet.gibbs <- function(mutCount, WTCount, totalCopyNumber=array(2,dim(mutCount)),
                                     normalCopyNumber=array(2,dim(mutCount)),
                                     copyNumberAdjustment=array(1,dim(mutCount)), C=30,
                                     cellularity=rep(1,ncol(mutCount)), iter=1000,
                                     conc_param=1, cluster_conc=10) {
    # mutCount is a p-by-q matrix of the number of reads reporting each variant
    # (p = number of mutations, q = number of timepoints / related samples)
    # WTCount is a p-by-q matrix of the number of wild-type reads across the base
    # in question, in the same order as mutCount
    # C is the maximum number of clusters in the Dirichlet process
    # iter is the number of iterations of the Gibbs sampler
    num.muts <- NROW(mutCount)
    num.timepoints <- NCOL(mutCount)

    # Hyperparameters for alpha
    A = 1
    B = conc_param

    # Set up data formats for recording iterations
    pi.h <- array(NA, c(iter, C, num.timepoints))
    mutBurdens <- array(NA, c(C, num.timepoints, num.muts))
    V.h <- matrix(1, nrow=iter, ncol=C)
    S.i <- matrix(NA, nrow=iter, ncol=num.muts)
    Pr.S <- matrix(NA, nrow=num.muts, ncol=C)
    alpha <- rep(NA, iter)
    lower = array(NA, num.timepoints)
    upper = array(NA, num.timepoints)
    mutCopyNum = array(NA, c(num.muts, num.timepoints))
    for(t in 1:num.timepoints){
        mutCopyNum[,t] = mutationBurdenToMutationCopyNumber(mutCount[,t]/(mutCount[,t]+WTCount[,t]), totalCopyNumber[,t], cellularity[t], normalCopyNumber[,t]) / copyNumberAdjustment[,t]
        # Allow cluster positions to range slightly beyond the observed values
        lower[t] = min(mutCopyNum[,t])
        upper[t] = max(mutCopyNum[,t])
        difference = upper[t] - lower[t]
        lower[t] = lower[t] - difference/10
        upper[t] = upper[t] + difference/10
        # Randomise starting positions of clusters
        pi.h[1,,t] = runif(C, lower[t], upper[t])
        for(c in 1:C){
            mutBurdens[c,t,] = mutationCopyNumberToMutationBurden(pi.h[1,c,t] * copyNumberAdjustment[,t], totalCopyNumber[,t], cellularity[t], normalCopyNumber[,t])
        }
    }
    V.h[1,] <- c(rep(0.5, C-1), 1)
    S.i[1,] <- c(1, rep(0, num.muts-1))
    alpha[1] <- 1
    V.h[1:iter, C] <- rep(1, iter)

    print.freq = ceiling(iter/100)
    for (m in 2:iter) {
        if(m %% print.freq == 0){ print(m) }

        # Update cluster allocation for each individual mutation
        for (k in 1:num.muts) {
            # Use log-space to avoid problems with very high counts
            Pr.S[k,1] <- log(V.h[m-1,1])
            for(j in 2:C){
                Pr.S[k,j] <- log(V.h[m-1,j]) + sum(log(1-V.h[m-1,1:(j-1)]))
            }
            for(t in 1:num.timepoints){
                for(c in 1:C){
                    Pr.S[k,c] <- Pr.S[k,c] + mutCount[k,t]*log(mutBurdens[c,t,k]) + WTCount[k,t]*log(1-mutBurdens[c,t,k])
                }
            }
            Pr.S[k, is.na(Pr.S[k,])] = 0
            Pr.S[k,] = Pr.S[k,] - max(Pr.S[k,])
            Pr.S[k,] = exp(Pr.S[k,])
            Pr.S[k,] <- Pr.S[k,] / sum(Pr.S[k,])
        }
        if(sum(is.na(Pr.S)) > 0){
            print(paste("Pr.S=", Pr.S))
        }
        S.i[m,] <- sapply(1:num.muts, function(Pr, k) {sum(rmultinom(1,1,Pr[k,]) * (1:length(Pr[k,])))}, Pr=Pr.S)

        # Update stick-breaking weights
        V.h[m,1:(C-1)] <- sapply(1:(C-1), function(S, curr.m, curr.alpha, h) {rbeta(1, 1+sum(S[curr.m,] == h), curr.alpha+sum(S[curr.m,] > h))}, S=S.i, curr.m=m, curr.alpha=alpha[m-1])
        # Need to prevent one stick from taking all the remaining weight
        V.h[m, c(V.h[m,1:(C-1)] == 1, FALSE)] <- 0.999

        # Get expected number of mutant reads per mutation copy number
        countsPerCopyNum = array(NA, c(num.timepoints, num.muts))
        for(t in 1:num.timepoints){
            countsPerCopyNum[t,] = (mutCount[,t]+WTCount[,t]) * mutationCopyNumberToMutationBurden(copyNumberAdjustment[,t], totalCopyNumber[,t], cellularity[t], normalCopyNumber[,t])
        }

        # 190512: randomise unused pi.h
        for(t in 1:num.timepoints){
            pi.h[m,,t] = runif(C, lower[t], upper[t])
        }
        # Update positions of occupied clusters from a Gamma conditional
        for(c in unique(S.i[m,])){
            for(t in 1:num.timepoints){
                if(sum(countsPerCopyNum[t, S.i[m,]==c]) == 0){
                    pi.h[m,c,t] = 0
                }else{
                    pi.h[m,c,t] = rgamma(1, shape=sum(mutCount[S.i[m,]==c, t]), rate=sum(countsPerCopyNum[t, S.i[m,]==c]))
                }
            }
        }
        for(t in 1:num.timepoints){
            for(c in 1:C){
                mutBurdens[c,t,] = mutationCopyNumberToMutationBurden(pi.h[m,c,t] * copyNumberAdjustment[,t], totalCopyNumber[,t], cellularity[t], normalCopyNumber[,t])
            }
        }
        if(sum(is.na(pi.h[m,,]))){
            print(paste("pi.h=", pi.h[m,,]))
            print(paste("m=", m, sep=""))
        }

        # Update alpha
        alpha[m] <- rgamma(1, shape=C+A-1, rate=B-sum(log(1-V.h[m,1:(C-1)])))
    }
    return(list(S.i=S.i, V.h=V.h, pi.h=pi.h, mutBurdens=mutBurdens, alpha=alpha))
}
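# A minimal sketch of running the sampler on simulated data (all names and
# values below are hypothetical, and the if(FALSE) guard keeps it from running
# when this file is sourced). Two clusters of 50 mutations each at cancer cell
# fractions 1.0 and 0.4, one sample, fully clonal diploid copy number:
if(FALSE){
    set.seed(1)
    true.ccf = rep(c(1.0, 0.4), each=50)
    depth = 100
    # In a diploid, 100%-pure sample the expected VAF is CCF/2
    mutCount = matrix(rbinom(100, depth, true.ccf/2), ncol=1)
    WTCount = depth - mutCount
    GS.data = subclone.dirichlet.gibbs(mutCount=mutCount, WTCount=WTCount, iter=200)
    # Cluster allocations at the final iteration
    table(GS.data$S.i[200,])
}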
Gibbs.subclone.density.est <- function(burden, GS.data, pngFile, density.smooth=0.1,
                                       post.burn.in.start=3000, post.burn.in.stop=10000,
                                       samplenames=c("sample 1","sample 2"), indices=NA) {
    V.h.cols <- GS.data$V.h
    if("pi.h" %in% names(GS.data)){
        if(all(is.na(indices))){
            pi.h.cols <- GS.data$pi.h
        }else{
            pi.h.cols <- GS.data$pi.h[,,indices]
        }
    }else{
        pi.h.cols <- GS.data$theta$mu
    }

    # Convert stick-breaking fractions V.h into mixture weights:
    # wts[,i] = V.h[,i] * prod(1 - V.h[,1:(i-1)])
    wts <- matrix(NA, nrow=dim(V.h.cols)[1], ncol=dim(V.h.cols)[2])
    wts[,1] <- V.h.cols[,1]
    wts[,2] <- V.h.cols[,2] * (1-V.h.cols[,1])
    for (i in 3:dim(wts)[2]) {
        wts[,i] <- apply(1-V.h.cols[,1:(i-1)], MARGIN=1, FUN=prod) * V.h.cols[,i]
    }

    library(KernSmooth)
    gridsize = c(64L, 64L)

    # Exclude extreme values, because they distort the plot
    max.plot.value = 100
    burden = burden[burden[,1]<=max.plot.value & burden[,2]<=max.plot.value,]
    num.timepoints = NCOL(burden)

    # Don't use the initial (random) state of the sampler
    sampledIters = post.burn.in.start:post.burn.in.stop
    sampledIters = sampledIters[sampledIters != 1]

    if(num.timepoints==2){
        range = list(c(floor(min(burden[,1])*10)-1, ceiling(max(burden[,1])*10)+1)/10,
                     c(floor(min(burden[,2])*10)-1, ceiling(max(burden[,2])*10)+1)/10)
        # Choose a grid resolution that scales inversely with the plotted range
        get.gridsize <- function(width){
            if(width > 100){ round(4*width)+1
            }else if(width > 50){ round(10*width)+1
            }else if(width > 20){ round(20*width)+1
            }else if(width > 8){ round(50*width)+1
            }else if(width > 3){ round(100*width)+1
            }else{ round(200*width)+1 }
        }
        gridsize[1] = get.gridsize(range[[1]][2]-range[[1]][1])
        gridsize[2] = get.gridsize(range[[2]][2]-range[[2]][1])
        print(paste("gridsize=", gridsize, sep=""))

        # Cap the number of posterior samples used for density estimation at 1000
        if(length(sampledIters) > 1000){
            sampledIters = floor(post.burn.in.start + (1:1000) * (post.burn.in.stop - post.burn.in.start)/1000)
        }
        post.ints <- array(NA, c(gridsize[1], gridsize[2], length(sampledIters)))
    }else{
        post.ints <- array(NA, c(512, length(sampledIters)))
        range = c(floor(min(burden)*10)-1, ceiling(max(burden)*10)+1)/10
    }

    no.density.points = 10000
    no.clusters = ncol(V.h.cols)
    for(i in 1:length(sampledIters)){
        # For each sampled iteration, draw no.density.points points distributed
        # across clusters in proportion to the cluster weights
        density.data = array(NA, c(0, num.timepoints))
        for(j in 1:no.clusters){
            # Use pi.h from the previous generation
            density.data = rbind(density.data, t(array(pi.h.cols[sampledIters[i]-1,j,], c(num.timepoints, round(no.density.points*wts[sampledIters[i],j])))))
        }
        if(i==1){
            write.csv(density.data, gsub(".png", paste("_densityData",i,".csv",sep=""), pngFile))
        }
        if(num.timepoints==2){
            d = bkde2D(density.data, bandwidth=density.smooth, gridsize=gridsize, range.x=range)
            if(i==1){
                xvals = d$x1
                yvals = d$x2
            }
            post.ints[,,i] = d$fhat
        }else{
            d = bkde(density.data, bandwidth=density.smooth, gridsize=512L, range.x=range)
            post.ints[,i] = d$y
        }
    }

    # Summarise the posterior by the median density at each grid point
    if(num.timepoints==2){
        median.density = apply(post.ints, MARGIN=c(1,2), FUN=median)
    }else{
        median.density = apply(post.ints, MARGIN=1, FUN=median)
    }
    colours = colorRampPalette(c("white","red"))

    # Use lattice for plotting with axes scaled equally
    library(lattice)
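    # The two plotting blocks below render the posterior median density as a
    # lattice levelplot, first without and then with the observed mutations
    # overlaid; the panel.abline calls draw gridlines at whole mutation copy
    # numbers (or cancer cell fractions) on each axis.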
    png(filename=gsub(".png","_withoutMutations.png",pngFile), width=1500, height=1000)
    range = list(c(floor(min(burden[,1])*10)-1, ceiling(max(burden[,1])*10)+1)/10,
                 c(floor(min(burden[,2])*10)-1, ceiling(max(burden[,2])*10)+1)/10)
    image.wid = 500 * (range[[1]][2] - range[[1]][1])
    image.ht = 500 * (range[[2]][2] - range[[2]][1])
    fig = levelplot(median.density, row.values=xvals, column.values=yvals,
                    xlim=range[[1]], ylim=range[[2]],
                    xlab=list(label=samplenames[1],cex=2), ylab=list(label=samplenames[2],cex=2),
                    scales=list(x=list(cex=1.5),y=list(cex=1.5)),
                    col.regions=colours, colorkey=FALSE,
                    panel=function(...){
                        panel.levelplot(...)
                        panel.abline(h = 0:floor(max(burden[,2])))
                        panel.abline(v = 0:floor(max(burden[,1])))
                    })
    print(fig)
    dev.off()

    png(filename=gsub(".png","_withMutations.png",pngFile), width=1500, height=1000)
    fig = levelplot(median.density, row.values=xvals, column.values=yvals,
                    xlim=range[[1]], ylim=range[[2]],
                    xlab=list(label=samplenames[1],cex=2), ylab=list(label=samplenames[2],cex=2),
                    scales=list(x=list(cex=1.5),y=list(cex=1.5)),
                    col.regions=colours, colorkey=FALSE,
                    panel=function(...){
                        panel.levelplot(...)
                        panel.abline(h = 0:floor(max(burden[,2])))
                        panel.abline(v = 0:floor(max(burden[,1])))
                        # 070913: scale point size inversely with the number of mutations
                        if(nrow(burden) >= 500){
                            lpoints(burden, pch=".", cex=1, col="black")
                        }else if(nrow(burden) >= 100){
                            lpoints(burden, pch=".", cex=2, col="black")
                        }else{
                            lpoints(burden, pch=".", cex=4, col="black")
                        }
                    })
    print(fig)
    dev.off()

    write.csv(xvals, gsub(".png","_xvals.csv",pngFile))
    write.csv(yvals, gsub(".png","_yvals.csv",pngFile))
    write.csv(median.density, gsub(".png","_zvals.csv",pngFile))
}
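# A minimal sketch of summarising the sampler output after burn-in (assumes a
# GS.data object returned by subclone.dirichlet.gibbs above; the burn-in cutoff
# and all names are illustrative, and the if(FALSE) guard keeps it inert on source):
if(FALSE){
    burn.in = 500
    # Modal cluster assignment of each mutation across post-burn-in iterations
    assignments = apply(GS.data$S.i[-(1:burn.in),], MARGIN=2,
                        FUN=function(x) as.numeric(names(which.max(table(x)))))
    # Posterior mean position (e.g. mutation copy number) of each occupied cluster
    for(c in unique(assignments)){
        print(paste("cluster", c, "mean position:", mean(GS.data$pi.h[-(1:burn.in), c, ])))
    }
}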