# R code demonstrating comparative clustering analysis
#
# Author: robert lehmann
###############################################################################

################################
# required packages
################################
require(limma)
require(ggplot2)
library(gplots)
require(cluster)
require(mclust)
require(stats)
require(kohonen)
require(amap)
require(multicore)
require(clValid)
require(entropy)
require(fpc)

input.dir <- './data/'
output.dir <- './out/'
exp.file <- 'raw_expression_coding_genes.txt'

################################
# load data
################################
raw.data <- read.table(file.path(input.dir, exp.file))
ts.raw <- as.matrix(raw.data)

# apply median polishing
ts.MP.norm <- medpolish(log2(raw.data))$residuals
ts.MP.norm <- 2^(ts.MP.norm - min(ts.MP.norm))

# apply quantile normalization
ts.raw.norm <- normalizeBetweenArrays(ts.raw, method='quantile')

################################
# apply different data transformations
################################
# applies every transformation function in tfs to every data matrix in tss and
# returns a flat list named '<dataset>.<transformation>'
apply.tfs <- function(tss, tfs) {
	res <- list()
	for (i in 1:length(tfs)) {
		res.int <- lapply(tss, tfs[[i]])
		names(res.int) <- lapply(names(tss), paste, names(tfs[i]), sep='.')
		res <- append(res, res.int)
	}
	return(res)
}

# does not change the provided data matrix
orig <- function(ts) {
	return(ts)
}

# log2-transforms each row of the provided data matrix relative to its row mean
log2mr <- function(ts) {
	res <- log2(ts/rowMeans(ts))
	return(res)
}

# performs row-wise z-score transformation of the provided data matrix
std <- function(ts) {
	res <- (ts-rowMeans(ts)) / apply(ts,1,sd)
	return(res)
}

# performs a discrete Fourier transformation of the provided data matrix
dft <- function(ts) {
	# apply Fourier transform row-wise
	n <- ncol(ts)
	f.ts <- t(apply(ts,1,fft))
	f.ts <- f.ts[,1:(n/2)]
	# remove trend (DC) component
	f.ts <- f.ts[,-1]
	f.ts.re <- matrix(NA,ncol=ncol(f.ts),nrow=nrow(f.ts))
	f.ts.im <- f.ts.re
	# scale each component by the mean modulus of the remaining components
	for ( i in 1:ncol(f.ts) )
		f.ts.re[,i] <- Re(f.ts)[,i] / rowMeans(Mod(f.ts)[,-i])
	for ( i in 1:ncol(f.ts) )
		f.ts.im[,i] <- Im(f.ts)[,i] / rowMeans(Mod(f.ts)[,-i])
	# combine real and imaginary parts and return
	f.ts <- cbind(f.ts.re,f.ts.im)
	return(f.ts)
}

# three data sets (quantile normalized, median polished, raw) ...
data.sets <- list(quant.norm=ts.raw.norm, mp=ts.MP.norm, raw=ts.raw)
# ... and a list of transformations
tfs <- list(log2mr=log2mr, std=std, dft=dft, orig=orig)
# apply every transformation to every data set
data.chr.genes <- apply.tfs(data.sets, tfs)

################################
# diagnostic plots for provided data
################################

###################
# creates a plot of the provided expression time series, with
# a boxplot per sample (time step)
plot.hist.samplewise <- function(dat, out.file,
		width=par("din")[1], height=par("din")[2]) {
	n.samp <- ncol(dat)
	t <- data.frame()
	for(i in 1:n.samp) {
		tcur <- as.data.frame(list(dat=dat[,i]))
		tcur$sample <- colnames(dat)[i]
		t <- rbind(t,tcur)
	}
	# determine order of samples
	xlims <- colnames(dat)[1:length(unique(colnames(dat)))]
	# plot
	p1 <- ggplot(t, aes(sample, dat, fill = sample)) +
			geom_boxplot(position="dodge") +
			xlab('Sample (CT)') + ylab('Expression [a.u.]') +
			theme(legend.position = "none") + xlim(xlims)
	ggsave(out.file, plot = p1, width=width, height=height)
}

# plot samplewise (chipwise) distributions
plot.hist.samplewise(dat=data.chr.genes$raw.std[,1:6],
		out.file=file.path(output.dir,'chip_hist_raw.pdf'), width=3.5, height=3.5)
plot.hist.samplewise(dat=data.chr.genes$quant.norm.std[,1:6],
		out.file=file.path(output.dir,'chip_hist_qn.pdf'), width=3.5, height=3.5)
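# The clustering steps further below parallelize over datasets with mclapply
# (loaded from the multicore package above). The following serial fallback is an
# assumption added here, not part of the original pipeline, for systems where
# multicore could not be loaded:
if (!exists('mclapply')) {
	mclapply <- function(X, FUN, ...) lapply(X, FUN)
}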
## plot correlation structure
#cor.raw <- cor(t(data.chr.genes$raw.std), method='spearman')
#cor.raw.hist <- hist(cor.raw, breaks=50)
#cor.raw.hist.df <- data.frame(mids=cor.raw.hist$mids, density=cor.raw.hist$density,
#		counts=cor.raw.hist$counts / sum(cor.raw.hist$counts), type='raw')
#
#cor.qn <- cor(t(data.chr.genes$quant.norm.std), method='spearman')
#cor.qn.hist <- hist(cor.qn, breaks=50)
#cor.qn.hist.df <- data.frame(mids=cor.qn.hist$mids, density=cor.qn.hist$density,
#		counts=cor.qn.hist$counts / sum(cor.qn.hist$counts), type='qn')
#
#p1 <- ggplot(cor.raw.hist.df, aes(mids, counts)) +
#		geom_bar(fill="darkgray", stat="identity") +
#		xlab(expression(paste('Spearman ',gamma))) +
#		ylab(expression(paste(x[1],' < ',P(gamma),' < ',x[2])))
#ggsave(file.path(output.dir,'raw_corr_hist.pdf'), plot = p1, width=5, height=4)
#p1 <- ggplot(cor.qn.hist.df, aes(mids, counts)) +
#		geom_bar(fill="darkgray", stat="identity") +
#		xlab(expression(paste('Spearman ',gamma))) +
#		ylab(expression(paste(x[1],' < ',P(gamma),' < ',x[2])))
#ggsave(file.path(output.dir,'clustering/qn_corr_hist.pdf'), plot = p1, width=5, height=4)
#rm(cor.raw, cor.qn, p1)

################################
# obtain clustering results
################################

###
# call kmeans in a loop for all provided cluster numbers
# available distance measures: "euclidean", "maximum", "manhattan",
# "canberra", "binary", "pearson", "correlation", "spearman" or "kendall"
kmeans_loop <- function(data.list, dataset.idx, num.clust,
		dist.method='euclidean', iter.max = 1000, nstart=100) {
	data <- data.list[[dataset.idx]]
	data.name <- names(data.list[dataset.idx])
	method.name <- paste('kmeans','.',dist.method,sep='')
	cat('clustering dataset',data.name,'with',method.name,'\n')
	# make matrix for clustering results
	cm <- matrix(0,nrow=dim(data)[1], ncol=length(num.clust))
	colnames(cm) <- as.character(num.clust)
	rownames(cm) <- rownames(data)
	# kmeans results
	res <- vector(mode="list")
	for(idx in c(1:length(num.clust))){
		i <- num.clust[idx]
		cat('kmeans clustering with G=',i,'\n')
		km_cl <- Kmeans(data, centers=i, iter.max = iter.max, nstart=nstart,
				method=dist.method)
		cm[,idx] <- km_cl$cluster
		res <- append(res,list(kmeans=km_cl))
	}
	return(list(cm=cm,resList=res,data.name=data.name,method.name=method.name))
}

###
# available clustering methods: ward, single, complete, average, mcquitty, median, centroid
# available distance measures: euclidean, maximum, manhattan, canberra, binary,
# pearson, correlation, spearman, kendall
hclust_loop <- function(data.list, dataset.idx, num.clust,
		method.cl='complete', dist.method='euclidean') {
	require(stats)
	data <- data.list[[dataset.idx]]
	data.name <- names(data.list[dataset.idx])
	method.name <- paste('hclust',dist.method,method.cl,sep='.')
	cat('clustering dataset',data.name,'with',method.name,dim(data),'\n')
	# make matrix for clustering results
	cm <- matrix(0,nrow=dim(data)[1], ncol=length(num.clust))
	colnames(cm) <- as.character(num.clust)
	rownames(cm) <- rownames(data)
	# cluster once, then cut the tree at every requested cluster number
	res <- hclust(amap::Dist(data,method=dist.method), method=method.cl)
	for(idx in c(1:length(num.clust))){
		i <- num.clust[idx]
		cat('hclust cutting tree with G=',i,'\n')
		cm[,idx] <- cutree(res,k=i)
	}
	return(list(cm=cm,resList=res,data.name=data.name,method.name=method.name))
}
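###
# Model-based clustering with mclust.
# NOTE: mclust.res is collected in the comparison step below but is not computed
# anywhere in this script; the following loop is a hedged sketch (assumption:
# Gaussian mixture models fitted with mclust::Mclust, one fit per requested
# cluster number), written to mirror the *_loop helpers above rather than to
# reproduce the original code.
mclust_loop <- function(data.list, dataset.idx, num.clust) {
	require(mclust)
	data <- data.list[[dataset.idx]]
	data.name <- names(data.list[dataset.idx])
	method.name <- 'mclust'
	cat('clustering dataset',data.name,'with',method.name,'\n')
	# make matrix for clustering results
	cm <- matrix(0,nrow=dim(data)[1], ncol=length(num.clust))
	colnames(cm) <- as.character(num.clust)
	rownames(cm) <- rownames(data)
	res <- vector(mode='list')
	for(idx in c(1:length(num.clust))){
		G <- num.clust[idx]
		cat('mclust clustering with G=',G,'\n')
		mc <- Mclust(data, G=G)
		cm[,idx] <- mc$classification
		res <- append(res, list(mc))
	}
	return(list(cm=cm,resList=res,data.name=data.name,method.name=method.name))
}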
###
# SOM
som_loop <- function(data.list, dataset.idx, num.clust) {
	require(kohonen)
	data <- data.list[[dataset.idx]]
	data.name <- names(data.list[dataset.idx])
	method.name <- 'som'
	cat('SOM clustering dataset',data.name,'with',method.name,dim(data),'\n')
	# make matrix for clustering results
	cm <- matrix(0,nrow=dim(data)[1], ncol=length(num.clust))
	colnames(cm) <- as.character(num.clust)
	rownames(cm) <- rownames(data)
	res <- vector(mode='list')
	for(idx in c(1:length(num.clust))){
		G <- num.clust[idx]
		kmap <- kohonen::som(data, grid = somgrid(xdim=1, ydim=G, topo="rectangular"),
				rlen = 3000, alpha = c(0.5, 0.05))
		cm[,idx] <- kmap$unit.classif
		res <- append(res, list(kmap))
	}
	return(list(cm=cm,resList=res,data.name=data.name,method.name=method.name))
}

###
# SOTA
# available distance measures: euclidean, correlation
sota_loop <- function(data.list, dataset.idx, num.clust, dist.method='euclidean') {
	require(clValid)
	source('sota.R')
	data <- data.list[[dataset.idx]]
	data.name <- names(data.list[dataset.idx])
	method.name <- paste('sota',dist.method,sep='.')
	cat('clustering dataset',data.name,'with sota \n')
	# make matrix for clustering results
	cm <- matrix(NA,nrow=dim(data)[1], ncol=length(num.clust))
	colnames(cm) <- as.character(num.clust)
	rownames(cm) <- rownames(data)
	s.res <- clValid(data,num.clust,clMethods=c("sota"),validation="stability",
			metric=dist.method,maxitems=nrow(data))
	res <- vector(mode='list')
	for(idx in c(1:length(num.clust))){
		G <- num.clust[idx]
		cm[,idx] <- s.res@clusterObjs$sota[[idx]]$clust
		res <- append(res, list(s.res@clusterObjs$sota[[idx]]))
	}
	remove(s.res)
	return(list(cm=cm,resList=res,data.name=data.name,method.name=method.name))
}

###
# apply pam on provided dataset or distance matrix
pam_loop <- function(data.list, dataset.idx, method, dist.mat, num.clust) {
	require(cluster)
	if(!missing(dist.mat)) {
		cat('using provided dist matrix \n')
		method.name <- paste('pam',method,sep='.')
		data.name <- 'NA'
	} else {
		if(missing(data.list) | missing(dataset.idx) | missing(method))
			stop('you have to provide data, dataset index and distance method')
		dist.mat <- amap::Dist(data.list[[dataset.idx]],method)
		data.name <- names(data.list[dataset.idx])
		method.name <- paste('pam',method,sep='.')
	}
	cl.pcls <- vector(mode='list')
	cm <- matrix(0,nrow=dim(data.list[[1]])[1], ncol=length(num.clust))
	colnames(cm) <- num.clust
	rownames(cm) <- rownames(data.list[[1]])
	for(i in 1:length(num.clust)) {
		pam.res <- pam(dist.mat,num.clust[i])
		cl.pcls <- append(cl.pcls,list(pam.res))
		cm[,i] <- pam.res$clustering
	}
	names(cl.pcls) <- num.clust
	return(list(cm=cm,resList=cl.pcls,data.name=data.name,method.name=method.name))
}

#######
# cluster numbers k to evaluate
num.clust <- c(5,11)

#######
# compute kmeans clusterings
# different similarity measures
method.list <- list('spearman','euclidean')
kmeans.res <- list()
for(method in method.list) {
	kmr <- mclapply(1:length(data.chr.genes),
			function(x) kmeans_loop(data.chr.genes,x,num.clust, dist.method=method))
	names(kmr) <- names(data.chr.genes)
	kmeans.res <- append(kmeans.res, kmr)
}

#######
# compute hclust clusterings
method.cl <- list('ward')
method.list <- list('spearman','euclidean')
hclust.res <- list()
for(mcl in method.cl)
	for(method in method.list) {
		hcr <- mclapply(1:length(data.chr.genes),
				function(x) hclust_loop(data.list=data.chr.genes,dataset.idx=x,
						num.clust=num.clust, dist.method=method, method.cl=mcl) )
		names(hcr) <- names(data.chr.genes)
		hclust.res <- append(hclust.res, hcr)
	}

#######
# compute som clusterings
som.res <- mclapply(1:length(data.chr.genes),
		function(x) som_loop(data.list=data.chr.genes,dataset.idx=x,num.clust=num.clust))
names(som.res) <- names(data.chr.genes)

#######
# compute sota clusterings
dist.method <- list('euclidean','correlation')
sota.res <- vector(mode='list')
for(dm in dist.method) {
	sr <- mclapply(1:length(data.chr.genes),
			function(x) sota_loop(data.list=data.chr.genes,dataset.idx=x,
					num.clust=num.clust, dist.method=dm) )
	names(sr) <- names(data.chr.genes)
	sota.res <- append(sota.res, sr)
}

#######
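# compute mclust clusterings
# (hedged sketch: this call assumes the mclust_loop helper sketched above; the
# original excerpt collects mclust.res in the comparison step without computing it here)
mclust.res <- mclapply(1:length(data.chr.genes),
		function(x) mclust_loop(data.list=data.chr.genes,dataset.idx=x,num.clust=num.clust))
names(mclust.res) <- names(data.chr.genes)

#######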
# compute pam clusterings
dist.method <- list('euclidean','spearman')
pam.res <- vector(mode='list')
for(dm in dist.method) {
	pr <- mclapply(1:length(data.chr.genes),
			function(x) pam_loop(data.list=data.chr.genes,dataset.idx=x,
					num.clust=num.clust, method=dm) )
	names(pr) <- names(data.chr.genes)
	pam.res <- append(pam.res, pr)
}

################################
# compare clusterings
################################

########################################################
# compute similarity measures between clustering results
# results       - list of clustering result objects
# dist.mat      - distance matrix for the data
# k             - consider only clusterings with this number of clusters
# discard.degen - don't include clusterings in which a cluster consists of only 1 gene
#                 (i.e. the clustering algorithm failed to achieve the requested k)
comp.clustering.similarity <- function(results, dist.mat, k, discard.degen=F) {
	cm.supermat <- results[[1]]$cm
	nms <- list()
	for(ks in colnames(results[[1]]$cm))
		nms <- append(nms,paste(results[[1]]$method.name,results[[1]]$data.name,'k',ks,sep='.'))
	for(i in 2:length(results)) {
		res <- results[[i]]
		if (any(is.na(res$cm)))
			cat('warning: clustering matrix of result',res$method.name,res$data.name,'contains NAs\n')
		# unique names for matrix rows/cols giving all clustering parameters
		for(ks in colnames(res$cm))
			nms <- append(nms,paste(res$method.name,res$data.name,'k',ks,sep='.'))
		cm.supermat <- cbind(cm.supermat,res$cm)
	}
	colnames(cm.supermat) <- unlist(nms)
	# if k is specified, use only clusterings that possess k clusters
	if(!missing(k)) {
		k.counts <- apply(cm.supermat,2,function(x) length(unique(x)))
		col.idx <- which(k.counts == k)
		cm.supermat <- cm.supermat[,col.idx]
		cat('k=',k,'specified, choosing only clusterings with this k...\n')
	}
	# remove degenerate clusterings
	if (discard.degen) {
		to.keep <- c()
		for(i in 1:ncol(cm.supermat)) { # clusterings
			keep <- T
			for(j in unique(cm.supermat[,i])) { # clusters
				if(length(which(cm.supermat[,i]==j))<2) keep <- F
			}
			if (keep) to.keep <- append(to.keep, i)
		}
		cat('discarding',(ncol(cm.supermat)-length(to.keep)),'degenerate clusterings\n')
		cm.supermat <- cm.supermat[,to.keep]
	}
	n <- dim(cm.supermat)[2]
	# prepare similarity matrix for clustering results
	vi.mat <- matrix(NA,ncol=n,nrow=n)
	nm <- colnames(cm.supermat)
	colnames(vi.mat) <- nm
	rownames(vi.mat) <- nm
	# copy to other similarity matrices
	mi.mat <- vi.mat
	arand.mat <- vi.mat
	# make list of all index combinations to calculate
	pairs <- combn(n,2,simplify=F)
	# combn doesn't include pairs of identical indices -> add diagonal
	for(i in 1:n) pairs <- append(pairs,list(c(i,i)))
	# calculate Meila's VI, mutual information and adjusted Rand for all pairs of clusterings
	n.cores <- 1
	sim.meas <- mclapply(pairs,function(x) clustering.similarity(x,cm.supermat,dist.mat))#, mc.cores=n.cores)
	# put results into matrices
	for(item in sim.meas){
		if(typeof(item)=='character') {
			cat(item,'\n')
			next
		}
		vi.mat[item$idx.pair[1],item$idx.pair[2]] <- item$a.vi
		vi.mat[item$idx.pair[2],item$idx.pair[1]] <- item$a.vi
		arand.mat[item$idx.pair[1],item$idx.pair[2]] <- item$arand
		arand.mat[item$idx.pair[2],item$idx.pair[1]] <- item$arand
		# mutual information
		mi.mat[item$idx.pair[1],item$idx.pair[2]] <- item$mi
		mi.mat[item$idx.pair[2],item$idx.pair[1]] <- item$mi
	}
	return(list(a.vi.mat=vi.mat,mi.mat=mi.mat,arand.mat=arand.mat))
}

########################################################
# computes the similarity between two clusterings from the provided matrix of clusterings
# returns a list with mutual information, Meila's VI and the corrected Rand index
# TODO: use normalized mutual information, since MI depends on the number of clusters
clustering.similarity <- function(idx.pair, cl.mat, dist){
	# sanity check: warn if either clustering contains NAs
	if(any(is.na(cl.mat[,idx.pair[1]])) | any(is.na(cl.mat[,idx.pair[2]])))
		cat('one of the clusterings',paste(idx.pair),'contains NA','\n')
	comp <- cluster.stats(d=dist, clustering=cl.mat[,idx.pair[1]],
			alt.clustering=cl.mat[,idx.pair[2]], compareonly=T, silhouette=F)
	confusion <- table(cl.mat[,idx.pair[1]], cl.mat[,idx.pair[2]])
	mi <- mi.empirical(confusion)
	# normalize VI with the joint entropy
	je <- entropy(confusion)
	a.vi <- comp$vi / je
	return(list(idx.pair=idx.pair,mi=mi,a.vi=a.vi,arand=comp$corrected.rand))
}

# collect all clustering results in a list
# NOTE: flowclust.res, apclust.res and consensus.res are expected from clustering
# steps that are not part of this script
results <- unlist(list(mclust.res, kmeans.res, pam.res, hclust.res, som.res, sota.res,
				flowclust.res, apclust.res, consensus.res), recursive=F)
cl.res.similarities <- vector(mode='list')

# graphical parameters
marg <- c(35,35)
dims <- c(25,25)
cols <- colorpanel(256,low='white',high='black')
vi.cols <- colorpanel(256,low='black',high='white')
cex <- 2

# consider only clusterings with 5 clusters
k <- 5

# cluster.stats requires a distance matrix, even though only the external
# (data-independent) clustering comparisons are used
dist.mat <- dist(data.chr.genes$raw.log2mr, method="euclidean") # compute distance matrix

res <- comp.clustering.similarity(results, dist.mat, k=k, discard.degen=T)

# colour code indicating whether a clustering used raw (blue) or
# quantile normalized (red) data
norm.idx <- c()
for (i in 1:ncol(res$a.vi.mat)) {
	dat <- unlist(strsplit(colnames(res$a.vi.mat)[i],'\\.'))
	norm.idx <- append(norm.idx, length(which(dat=='quant')))
}
norm.idx[which(norm.idx==0)] <- 'blue'
norm.idx[which(norm.idx==1)] <- 'red'

# shorten names for the matrix - remove redundant information
nm <- gsub('\\.k\\.\\d+$','',colnames(res$a.vi.mat))	# cluster number
nm <- gsub('\\.quant\\.norm','',nm)	# normalization technique
nm <- gsub('\\.raw','',nm)		# normalization technique
nm <- gsub('\\.ward','',nm)		# hclust method
nm <- gsub('\\.',' ',nm)		# replace remaining dots with spaces
colnames(res$a.vi.mat) <- nm
rownames(res$a.vi.mat) <- nm
colnames(res$arand.mat) <- nm
rownames(res$arand.mat) <- nm
colnames(res$mi.mat) <- nm
rownames(res$mi.mat) <- nm

# plot heatmap of the pairwise mutual information between clusterings
pdf(file.path(output.dir,paste('clust_clust_MIheatmap_K',k,'.pdf',sep='')),
		width=dims[1],height=dims[2])
hcl.res <- try(as.dendrogram(hclust(dist(res$mi.mat),method='ward')))
# pam.res <- pam(dist(res$mi.mat),15)$clustering
mi.res <- try(heatmap.2(res$mi.mat,margins=marg,key=F,trace='none',dendrogram='row',
				col=cols, cexRow=cex, cexCol=cex,
				ColSideColors=norm.idx, RowSideColors=norm.idx,
				Rowv=hcl.res, Colv=hcl.res, labCol=NA) )
dev.off()
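
########################################################
# Hedged sketch addressing the TODO above clustering.similarity: a normalized
# mutual information that depends less on the number of clusters.
# Assumption: normalization by the geometric mean of the two marginal entropies
# (one of several common NMI variants), computed with the entropy package
# already loaded above; this helper is not part of the original pipeline.
nmi.empirical <- function(cl.a, cl.b) {
	confusion <- table(cl.a, cl.b)
	mi <- mi.empirical(confusion)
	h.a <- entropy(rowSums(confusion))
	h.b <- entropy(colSums(confusion))
	mi / sqrt(h.a * h.b)
}
# hypothetical usage: nmi.empirical(results[[1]]$cm[,1], results[[2]]$cm[,1])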