# R code demonstrating comparative clustering analysis
#
# Author: robert lehmann
###############################################################################

################################
# required packages
################################
require(limma)
require(ggplot2)
library(gplots)
require(cluster)
require(mclust)
require(stats)
require(kohonen)
require(amap)
require(multicore)
require(clValid)
require(entropy)
require(fpc)

input.dir <- './data/'
output.dir <- './out/'
exp.file <- 'raw_expression_coding_genes.txt'

################################
# load data
################################
raw.data <- read.table(file.path(input.dir, exp.file))
ts.raw <- as.matrix(raw.data)

# apply median polishing
ts.MP.norm <- medpolish(log2(raw.data))$residuals
ts.MP.norm <- 2^(ts.MP.norm - min(ts.MP.norm))

# apply quantile normalization
ts.raw.norm <- normalizeBetweenArrays(ts.raw, method='quantile')

################################
# apply different data transformations
################################
# applies every transformation function in tfs to every data matrix in tss and
# returns a flat list named '<dataset>.<transformation>'
apply.tfs <- function(tss, tfs) {
	res <- list()
	for (i in 1:length(tfs)) {
		res.int <- lapply(tss, tfs[[i]])
		names(res.int) <- lapply(names(tss), paste, names(tfs[i]), sep='.')
		res <- append(res, res.int)
	}
	return(res)
}

# does not change the provided data matrix
orig <- function(ts) {
	return(ts)
}

# log2-transforms each row of the provided data matrix relative to its row mean
log2mr <- function(ts) {
	res <- log2(ts/rowMeans(ts))
	return(res)
}

# performs row-wise z-score transformation of the provided data matrix
std <- function(ts) {
	res <- (ts-rowMeans(ts)) / apply(ts,1,sd)
	return(res)
}

# performs a discrete Fourier transformation of the provided data matrix
dft <- function(ts) {
	# apply Fourier transform row-wise
	n <- ncol(ts)
	f.ts <- t(apply(ts,1,fft))
	f.ts <- f.ts[,1:(n/2)]
	# remove trend (DC) component
	f.ts <- f.ts[,-1]
	f.ts.re <- matrix(NA,ncol=ncol(f.ts),nrow=nrow(f.ts))
	f.ts.im <- f.ts.re
	# scale each component by the mean modulus of the remaining components
	for ( i in 1:ncol(f.ts) )
		f.ts.re[,i] <- Re(f.ts)[,i] / rowMeans(Mod(f.ts)[,-i])
	for ( i in 1:ncol(f.ts) )
		f.ts.im[,i] <- Im(f.ts)[,i] / rowMeans(Mod(f.ts)[,-i])
	# combine real and imaginary parts and return
	f.ts <- cbind(f.ts.re,f.ts.im)
	return(f.ts)
}

# three data sets (quantile normalized, median polished, raw) ...
data.sets <- list(quant.norm=ts.raw.norm, mp=ts.MP.norm, raw=ts.raw)
# ... and a list of transformations
tfs <- list(log2mr=log2mr, std=std, dft=dft, orig=orig)
# apply every transformation to every data set
data.chr.genes <- apply.tfs(data.sets, tfs)

################################
# diagnostic plots for provided data
################################

###################
# creates a plot of the provided expression time series, with
# a boxplot per sample (time step)
plot.hist.samplewise <- function(dat, out.file,
		width=par("din")[1], height=par("din")[2]) {
	n.samp <- ncol(dat)
	t <- data.frame()
	for(i in 1:n.samp) {
		tcur <- as.data.frame(list(dat=dat[,i]))
		tcur$sample <- colnames(dat)[i]
		t <- rbind(t,tcur)
	}
	# determine order of samples
	xlims <- colnames(dat)[1:length(unique(colnames(dat)))]
	# plot
	p1 <- ggplot(t, aes(sample, dat, fill = sample)) +
			geom_boxplot(position="dodge") +
			xlab('Sample (CT)') + ylab('Expression [a.u.]') +
			theme(legend.position = "none") + xlim(xlims)
	ggsave(out.file, plot = p1, width=width, height=height)
}

# plot samplewise (chipwise) distributions
plot.hist.samplewise(dat=data.chr.genes$raw.std[,1:6],
		out.file=file.path(output.dir,'chip_hist_raw.pdf'), width=3.5, height=3.5)
plot.hist.samplewise(dat=data.chr.genes$quant.norm.std[,1:6],
		out.file=file.path(output.dir,'chip_hist_qn.pdf'), width=3.5, height=3.5)
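# The clustering steps further below parallelize over datasets with mclapply
# (loaded from the multicore package above). The following serial fallback is an
# assumption added here, not part of the original pipeline, for systems where
# multicore could not be loaded:
if (!exists('mclapply')) {
	mclapply <- function(X, FUN, ...) lapply(X, FUN)
}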
## plot correlation structure
#cor.raw <- cor(t(data.chr.genes$raw.std), method='spearman')
#cor.raw.hist <- hist(cor.raw, breaks=50)
#cor.raw.hist.df <- data.frame(mids=cor.raw.hist$mids, density=cor.raw.hist$density,
#		counts=cor.raw.hist$counts / sum(cor.raw.hist$counts), type='raw')
#
#cor.qn <- cor(t(data.chr.genes$quant.norm.std), method='spearman')
#cor.qn.hist <- hist(cor.qn, breaks=50)
#cor.qn.hist.df <- data.frame(mids=cor.qn.hist$mids, density=cor.qn.hist$density,
#		counts=cor.qn.hist$counts / sum(cor.qn.hist$counts), type='qn')
#
#p1 <- ggplot(cor.raw.hist.df, aes(mids, counts)) +
#		geom_bar(fill="darkgray", stat="identity") +
#		xlab(expression(paste('Spearman ',gamma))) +
#		ylab(expression(paste(x[1],' < ',P(gamma),' < ',x[2])))
#ggsave(file.path(output.dir,'raw_corr_hist.pdf'), plot = p1, width=5, height=4)
#p1 <- ggplot(cor.qn.hist.df, aes(mids, counts)) +
#		geom_bar(fill="darkgray", stat="identity") +
#		xlab(expression(paste('Spearman ',gamma))) +
#		ylab(expression(paste(x[1],' < ',P(gamma),' < ',x[2])))
#ggsave(file.path(output.dir,'clustering/qn_corr_hist.pdf'), plot = p1, width=5, height=4)
#rm(cor.raw, cor.qn, p1)

################################
# obtain clustering results
################################

###
# call kmeans in a loop for all provided cluster numbers
# available distance measures: "euclidean", "maximum", "manhattan",
# "canberra", "binary", "pearson", "correlation", "spearman" or "kendall"
kmeans_loop <- function(data.list, dataset.idx, num.clust,
		dist.method='euclidean', iter.max = 1000, nstart=100) {
	data <- data.list[[dataset.idx]]
	data.name <- names(data.list[dataset.idx])
	method.name <- paste('kmeans','.',dist.method,sep='')
	cat('clustering dataset',data.name,'with',method.name,'\n')
	# make matrix for clustering results
	cm <- matrix(0,nrow=dim(data)[1], ncol=length(num.clust))
	colnames(cm) <- as.character(num.clust)
	rownames(cm) <- rownames(data)
	# kmeans results
	res <- vector(mode="list")
	for(idx in c(1:length(num.clust))){
		i <- num.clust[idx]
		cat('kmeans clustering with G=',i,'\n')
		km_cl <- Kmeans(data, centers=i, iter.max = iter.max, nstart=nstart,
				method=dist.method)
		cm[,idx] <- km_cl$cluster
		res <- append(res,list(kmeans=km_cl))
	}
	return(list(cm=cm,resList=res,data.name=data.name,method.name=method.name))
}

###
# available clustering methods: ward, single, complete, average, mcquitty, median, centroid
# available distance measures: euclidean, maximum, manhattan, canberra, binary,
# pearson, correlation, spearman, kendall
hclust_loop <- function(data.list, dataset.idx, num.clust,
		method.cl='complete', dist.method='euclidean') {
	require(stats)
	data <- data.list[[dataset.idx]]
	data.name <- names(data.list[dataset.idx])
	method.name <- paste('hclust',dist.method,method.cl,sep='.')
	cat('clustering dataset',data.name,'with',method.name,dim(data),'\n')
	# make matrix for clustering results
	cm <- matrix(0,nrow=dim(data)[1], ncol=length(num.clust))
	colnames(cm) <- as.character(num.clust)
	rownames(cm) <- rownames(data)
	# cluster once, then cut the tree at every requested cluster number
	res <- hclust(amap::Dist(data,method=dist.method), method=method.cl)
	for(idx in c(1:length(num.clust))){
		i <- num.clust[idx]
		cat('hclust cutting tree with G=',i,'\n')
		cm[,idx] <- cutree(res,k=i)
	}
	return(list(cm=cm,resList=res,data.name=data.name,method.name=method.name))
}
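###
# Model-based clustering with mclust.
# NOTE: mclust.res is collected in the comparison step below but is not computed
# anywhere in this script; the following loop is a hedged sketch (assumption:
# Gaussian mixture models fitted with mclust::Mclust, one fit per requested
# cluster number), written to mirror the *_loop helpers above rather than to
# reproduce the original code.
mclust_loop <- function(data.list, dataset.idx, num.clust) {
	require(mclust)
	data <- data.list[[dataset.idx]]
	data.name <- names(data.list[dataset.idx])
	method.name <- 'mclust'
	cat('clustering dataset',data.name,'with',method.name,'\n')
	# make matrix for clustering results
	cm <- matrix(0,nrow=dim(data)[1], ncol=length(num.clust))
	colnames(cm) <- as.character(num.clust)
	rownames(cm) <- rownames(data)
	res <- vector(mode='list')
	for(idx in c(1:length(num.clust))){
		G <- num.clust[idx]
		cat('mclust clustering with G=',G,'\n')
		mc <- Mclust(data, G=G)
		cm[,idx] <- mc$classification
		res <- append(res, list(mc))
	}
	return(list(cm=cm,resList=res,data.name=data.name,method.name=method.name))
}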
###
# SOM
som_loop <- function(data.list, dataset.idx, num.clust) {
	require(kohonen)
	data <- data.list[[dataset.idx]]
	data.name <- names(data.list[dataset.idx])
	method.name <- 'som'
	cat('SOM clustering dataset',data.name,'with',method.name,dim(data),'\n')
	# make matrix for clustering results
	cm <- matrix(0,nrow=dim(data)[1], ncol=length(num.clust))
	colnames(cm) <- as.character(num.clust)
	rownames(cm) <- rownames(data)
	res <- vector(mode='list')
	for(idx in c(1:length(num.clust))){
		G <- num.clust[idx]
		kmap <- kohonen::som(data, grid = somgrid(xdim=1, ydim=G, topo="rectangular"),
				rlen = 3000, alpha = c(0.5, 0.05))
		cm[,idx] <- kmap$unit.classif
		res <- append(res, list(kmap))
	}
	return(list(cm=cm,resList=res,data.name=data.name,method.name=method.name))
}

###
# SOTA
# available distance measures: euclidean, correlation
sota_loop <- function(data.list, dataset.idx, num.clust, dist.method='euclidean') {
	require(clValid)
	source('sota.R')
	data <- data.list[[dataset.idx]]
	data.name <- names(data.list[dataset.idx])
	method.name <- paste('sota',dist.method,sep='.')
	cat('clustering dataset',data.name,'with sota \n')
	# make matrix for clustering results
	cm <- matrix(NA,nrow=dim(data)[1], ncol=length(num.clust))
	colnames(cm) <- as.character(num.clust)
	rownames(cm) <- rownames(data)
	s.res <- clValid(data,num.clust,clMethods=c("sota"),validation="stability",
			metric=dist.method,maxitems=nrow(data))
	res <- vector(mode='list')
	for(idx in c(1:length(num.clust))){
		G <- num.clust[idx]
		cm[,idx] <- s.res@clusterObjs$sota[[idx]]$clust
		res <- append(res, list(s.res@clusterObjs$sota[[idx]]))
	}
	remove(s.res)
	return(list(cm=cm,resList=res,data.name=data.name,method.name=method.name))
}

###
# apply pam on provided dataset or distance matrix
pam_loop <- function(data.list, dataset.idx, method, dist.mat, num.clust) {
	require(cluster)
	if(!missing(dist.mat)) {
		cat('using provided dist matrix \n')
		method.name <- paste('pam',method,sep='.')
		data.name <- 'NA'
	} else {
		if(missing(data.list) | missing(dataset.idx) | missing(method))
			stop('you have to provide data, dataset index and distance method')
		dist.mat <- amap::Dist(data.list[[dataset.idx]],method)
		data.name <- names(data.list[dataset.idx])
		method.name <- paste('pam',method,sep='.')
	}
	cl.pcls <- vector(mode='list')
	cm <- matrix(0,nrow=dim(data.list[[1]])[1], ncol=length(num.clust))
	colnames(cm) <- num.clust
	rownames(cm) <- rownames(data.list[[1]])
	for(i in 1:length(num.clust)) {
		pam.res <- pam(dist.mat,num.clust[i])
		cl.pcls <- append(cl.pcls,list(pam.res))
		cm[,i] <- pam.res$clustering
	}
	names(cl.pcls) <- num.clust
	return(list(cm=cm,resList=cl.pcls,data.name=data.name,method.name=method.name))
}

#######
# cluster numbers k to evaluate
num.clust <- c(5,11)

#######
# compute kmeans clusterings
# different similarity measures
method.list <- list('spearman','euclidean')
kmeans.res <- list()
for(method in method.list) {
	kmr <- mclapply(1:length(data.chr.genes),
			function(x) kmeans_loop(data.chr.genes,x,num.clust, dist.method=method))
	names(kmr) <- names(data.chr.genes)
	kmeans.res <- append(kmeans.res, kmr)
}

#######
# compute hclust clusterings
method.cl <- list('ward')
method.list <- list('spearman','euclidean')
hclust.res <- list()
for(mcl in method.cl)
	for(method in method.list) {
		hcr <- mclapply(1:length(data.chr.genes),
				function(x) hclust_loop(data.list=data.chr.genes,dataset.idx=x,
						num.clust=num.clust, dist.method=method, method.cl=mcl) )
		names(hcr) <- names(data.chr.genes)
		hclust.res <- append(hclust.res, hcr)
	}

#######
# compute som clusterings
som.res <- mclapply(1:length(data.chr.genes),
		function(x) som_loop(data.list=data.chr.genes,dataset.idx=x,num.clust=num.clust))
names(som.res) <- names(data.chr.genes)

#######
# compute sota clusterings
dist.method <- list('euclidean','correlation')
sota.res <- vector(mode='list')
for(dm in dist.method) {
	sr <- mclapply(1:length(data.chr.genes),
			function(x) sota_loop(data.list=data.chr.genes,dataset.idx=x,
					num.clust=num.clust, dist.method=dm) )
	names(sr) <- names(data.chr.genes)
	sota.res <- append(sota.res, sr)
}

#######
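# compute mclust clusterings
# (hedged sketch: this call assumes the mclust_loop helper sketched above; the
# original excerpt collects mclust.res in the comparison step without computing it here)
mclust.res <- mclapply(1:length(data.chr.genes),
		function(x) mclust_loop(data.list=data.chr.genes,dataset.idx=x,num.clust=num.clust))
names(mclust.res) <- names(data.chr.genes)

#######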
# compute pam clusterings
dist.method <- list('euclidean','spearman')
pam.res <- vector(mode='list')
for(dm in dist.method) {
	pr <- mclapply(1:length(data.chr.genes),
			function(x) pam_loop(data.list=data.chr.genes,dataset.idx=x,
					num.clust=num.clust, method=dm) )
	names(pr) <- names(data.chr.genes)
	pam.res <- append(pam.res, pr)
}

################################
# compare clusterings
################################

########################################################
# compute similarity measures between clustering results
# results       - list of clustering result objects
# dist.mat      - distance matrix for the data
# k             - consider only clusterings with this number of clusters
# discard.degen - don't include clusterings in which a cluster consists of only 1 gene
#                 (i.e. the clustering algorithm failed to achieve the requested k)
comp.clustering.similarity <- function(results, dist.mat, k, discard.degen=F) {
	cm.supermat <- results[[1]]$cm
	nms <- list()
	for(ks in colnames(results[[1]]$cm))
		nms <- append(nms,paste(results[[1]]$method.name,results[[1]]$data.name,'k',ks,sep='.'))
	for(i in 2:length(results)) {
		res <- results[[i]]
		if (any(is.na(res$cm)))
			cat('warning: clustering matrix of result',res$method.name,res$data.name,'contains NAs\n')
		# unique names for matrix rows/cols giving all clustering parameters
		for(ks in colnames(res$cm))
			nms <- append(nms,paste(res$method.name,res$data.name,'k',ks,sep='.'))
		cm.supermat <- cbind(cm.supermat,res$cm)
	}
	colnames(cm.supermat) <- unlist(nms)
	# if k is specified, use only clusterings that possess k clusters
	if(!missing(k)) {
		k.counts <- apply(cm.supermat,2,function(x) length(unique(x)))
		col.idx <- which(k.counts == k)
		cm.supermat <- cm.supermat[,col.idx]
		cat('k=',k,'specified, choosing only clusterings with this k...\n')
	}
	# remove degenerate clusterings
	if (discard.degen) {
		to.keep <- c()
		for(i in 1:ncol(cm.supermat)) { # clusterings
			keep <- T
			for(j in unique(cm.supermat[,i])) { # clusters
				if(length(which(cm.supermat[,i]==j))<2) keep <- F
			}
			if (keep) to.keep <- append(to.keep, i)
		}
		cat('discarding',(ncol(cm.supermat)-length(to.keep)),'degenerate clusterings\n')
		cm.supermat <- cm.supermat[,to.keep]
	}
	n <- dim(cm.supermat)[2]
	# prepare similarity matrix for clustering results
	vi.mat <- matrix(NA,ncol=n,nrow=n)
	nm <- colnames(cm.supermat)
	colnames(vi.mat) <- nm
	rownames(vi.mat) <- nm
	# copy to other similarity matrices
	mi.mat <- vi.mat
	arand.mat <- vi.mat
	# make list of all index combinations to calculate
	pairs <- combn(n,2,simplify=F)
	# combn doesn't include pairs of identical indices -> add diagonal
	for(i in 1:n) pairs <- append(pairs,list(c(i,i)))
	# calculate Meila's VI, mutual information and adjusted Rand for all pairs of clusterings
	n.cores <- 1
	sim.meas <- mclapply(pairs,function(x) clustering.similarity(x,cm.supermat,dist.mat))#, mc.cores=n.cores)
	# put results into matrices
	for(item in sim.meas){
		if(typeof(item)=='character') {
			cat(item,'\n')
			next
		}
		vi.mat[item$idx.pair[1],item$idx.pair[2]] <- item$a.vi
		vi.mat[item$idx.pair[2],item$idx.pair[1]] <- item$a.vi
		arand.mat[item$idx.pair[1],item$idx.pair[2]] <- item$arand
		arand.mat[item$idx.pair[2],item$idx.pair[1]] <- item$arand
		# mutual information
		mi.mat[item$idx.pair[1],item$idx.pair[2]] <- item$mi
		mi.mat[item$idx.pair[2],item$idx.pair[1]] <- item$mi
	}
	return(list(a.vi.mat=vi.mat,mi.mat=mi.mat,arand.mat=arand.mat))
}

########################################################
# computes the similarity between two clusterings from the provided matrix of clusterings
# returns a list with mutual information, Meila's VI and the corrected Rand index
# TODO: use normalized mutual information, since MI depends on the number of clusters
clustering.similarity <- function(idx.pair, cl.mat, dist){
	# sanity check: warn if either clustering contains NAs
	if(any(is.na(cl.mat[,idx.pair[1]])) | any(is.na(cl.mat[,idx.pair[2]])))
		cat('one of the clusterings',paste(idx.pair),'contains NA','\n')
	comp <- cluster.stats(d=dist, clustering=cl.mat[,idx.pair[1]],
			alt.clustering=cl.mat[,idx.pair[2]], compareonly=T, silhouette=F)
	confusion <- table(cl.mat[,idx.pair[1]], cl.mat[,idx.pair[2]])
	mi <- mi.empirical(confusion)
	# normalize VI with the joint entropy
	je <- entropy(confusion)
	a.vi <- comp$vi / je
	return(list(idx.pair=idx.pair,mi=mi,a.vi=a.vi,arand=comp$corrected.rand))
}

# collect all clustering results in a list
# NOTE: flowclust.res, apclust.res and consensus.res are expected from clustering
# steps that are not part of this script
results <- unlist(list(mclust.res, kmeans.res, pam.res, hclust.res, som.res, sota.res,
				flowclust.res, apclust.res, consensus.res), recursive=F)
cl.res.similarities <- vector(mode='list')

# graphical parameters
marg <- c(35,35)
dims <- c(25,25)
cols <- colorpanel(256,low='white',high='black')
vi.cols <- colorpanel(256,low='black',high='white')
cex <- 2

# consider only clusterings with 5 clusters
k <- 5

# cluster.stats requires a distance matrix, even though only the external
# (data-independent) clustering comparisons are used
dist.mat <- dist(data.chr.genes$raw.log2mr, method="euclidean") # compute distance matrix

res <- comp.clustering.similarity(results, dist.mat, k=k, discard.degen=T)

# colour code indicating whether a clustering used raw (blue) or
# quantile normalized (red) data
norm.idx <- c()
for (i in 1:ncol(res$a.vi.mat)) {
	dat <- unlist(strsplit(colnames(res$a.vi.mat)[i],'\\.'))
	norm.idx <- append(norm.idx, length(which(dat=='quant')))
}
norm.idx[which(norm.idx==0)] <- 'blue'
norm.idx[which(norm.idx==1)] <- 'red'

# shorten names for the matrix - remove redundant information
nm <- gsub('\\.k\\.\\d+$','',colnames(res$a.vi.mat))	# cluster number
nm <- gsub('\\.quant\\.norm','',nm)	# normalization technique
nm <- gsub('\\.raw','',nm)		# normalization technique
nm <- gsub('\\.ward','',nm)		# hclust method
nm <- gsub('\\.',' ',nm)		# replace remaining dots with spaces
colnames(res$a.vi.mat) <- nm
rownames(res$a.vi.mat) <- nm
colnames(res$arand.mat) <- nm
rownames(res$arand.mat) <- nm
colnames(res$mi.mat) <- nm
rownames(res$mi.mat) <- nm

# plot heatmap of the pairwise mutual information between clusterings
pdf(file.path(output.dir,paste('clust_clust_MIheatmap_K',k,'.pdf',sep='')),
		width=dims[1],height=dims[2])
hcl.res <- try(as.dendrogram(hclust(dist(res$mi.mat),method='ward')))
# pam.res <- pam(dist(res$mi.mat),15)$clustering
mi.res <- try(heatmap.2(res$mi.mat,margins=marg,key=F,trace='none',dendrogram='row',
				col=cols, cexRow=cex, cexCol=cex,
				ColSideColors=norm.idx, RowSideColors=norm.idx,
				Rowv=hcl.res, Colv=hcl.res, labCol=NA) )
dev.off()
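
########################################################
# Hedged sketch addressing the TODO above clustering.similarity: a normalized
# mutual information that depends less on the number of clusters.
# Assumption: normalization by the geometric mean of the two marginal entropies
# (one of several common NMI variants), computed with the entropy package
# already loaded above; this helper is not part of the original pipeline.
nmi.empirical <- function(cl.a, cl.b) {
	confusion <- table(cl.a, cl.b)
	mi <- mi.empirical(confusion)
	h.a <- entropy(rowSums(confusion))
	h.b <- entropy(colSums(confusion))
	mi / sqrt(h.a * h.b)
}
# hypothetical usage: nmi.empirical(results[[1]]$cm[,1], results[[2]]$cm[,1])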