rm(list=ls()) quad <- read.table("all_PQS_and_TSS_mappings.csv", sep=",",header=TRUE,stringsAsFactors=FALSE) colnames(quad)[11:19] <- c("stacks","fa","fc","ft","ng","length2","l1","l2","l3") dim <- 5 # split into plus and minus strands, keep only row entries with PQS present # keep only one copy of each unique PQS all <- quad[with(quad, is.na(delta) == FALSE & !duplicated(row_number)), ] # analysis of all PQS with 4 stacks and 3 loops in which the proximal and terminal loop base for each loop != G all1a <- all[with(all, is.na(l1) == FALSE),] all2 <- all1a[,c(5,12,13,14,17,18,19)] rownames(all2) <- all2$row_number # Load the kohonen package require(kohonen) # Create a training data set (rows are samples, columns are variables # Here I am selecting a subset of my variables available in "data" data_train <- all2 # Change the data frame with training data to a matrix # Also center and scale all variables to give them equal importance during # the SOM training process. data_train_matrix <- as.matrix(scale(data_train)) colnames(data_train_matrix) <- colnames(data_train) # Create the SOM Grid - you generally have to specify the size of the # training grid prior to training the SOM. Hexagonal and Circular # topologies are possible x <- dim y <- dim som_grid <- somgrid(xdim = x, ydim=y, topo="hexagonal") # Finally, train the SOM, options for the number of iterations, # the learning rates, and the neighbourhood are available som_model <- som(data_train_matrix, grid=som_grid, rlen=100, alpha=c(0.05,0.01), keep.data = TRUE, n.hood="circular") pdf("kohonen.pdf") # bottom left top right library("gplots") library("gtools") library("colorRamps") #plot(som_model, type="changes",main="Changes", font=2, font.lab=2, font.axis=2) par(mar=c(0,0,0,0)) plot(som_model, type="count", main="Count", font.main=2) plot(som_model, type="dist.neighbours", main="Distance to Neighbors", font.main=2) # unscaled property plots for(i in 1:7){ var <- i #define the variable to plot var_unscaled <- aggregate(as.numeric(data_train[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2] plot(som_model, type = "property", property=var_unscaled, main=names(data_train)[var], font.main=2) } plot(som_model, type="codes", main="Codes") mydata <- som_model$codes wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var)) for (i in 2:15) { wss[i] <- sum(kmeans(mydata, centers=i)$withinss) } #plot(wss) ## use hierarchical clustering to cluster the codebook vectors k=6 som_cluster <- cutree(hclust(dist(som_model$codes)), k=k, h= NULL) # plot these results: col3 <- colorRampPalette(c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"))(k) plot(som_model, type="mapping", bgcol = col3[som_cluster], main = "Clusters") add.cluster.boundaries(som_model, som_cluster) all3 <- cbind(all1a, as.data.frame(som_model$unit.classif)) colnames(all3)[51] <- "class" write.table(all3, "classification.csv",sep=",",row.names=FALSE, col.names=colnames(all3)) to <- x*y+0.5 bins <- seq(from=0.5, to=to, by=1) h_all <- hist(all3$class, breaks=bins, plot=FALSE) # import dataset for Bloom Syndrome Patients bdat <- read.table("differentially-expressed_genes_BS.csv", sep=",", header=TRUE) # split gene sets into up- and down-regulated blm.up1 <- as.data.frame(bdat[with(bdat, logFC > 0), ]$gene) blm.down1 <- as.data.frame(bdat[with(bdat, logFC < 0), ]$gene) colnames(blm.up1) <- colnames(blm.down1) <- "gene_name" # merge blm.up1 and blm.down1 datasets with quad --> full TSS and PQS lists blm.up <- merge(all3, blm.up1, by="gene_name") # isolate entries of all3 from differentially-expressed genes h_blm.up <- hist(blm.up$class, breaks=bins, plot=FALSE) blm.down <- merge(all3, blm.down1, by="gene_name") h_blm.down <- hist(blm.down$class, breaks=bins, plot=FALSE) # calculate numbers of genes in respective datasets genomic.tss <- length(unique(quad$unique.gene.number)) # calculate number of genes in blm up and down datasets (not all genes from blm.up1 and blm.down1 lerge with quad dataset). # Calcualte the number of genes that actually merge into dataset blm.up.tss1 <- merge(quad, blm.up1, by="gene_name") blm.up.tss <- length(unique(blm.up.tss1$gene_name)) blm.down.tss1 <- merge(quad, blm.down1, by="gene_name") blm.down.tss <- length(unique(blm.down.tss1$gene_name)) ## botstrap simulation to identify threshold for enrichment/depletion significance for each cluster # make random gene sets gene.symbols <- unique(quad$gene_name) out <- list() for(i in 1:100){ rand.up1 <- data.frame(sample(gene.symbols, blm.up.tss)) rand.down1 <- data.frame(sample(gene.symbols, blm.down.tss)) colnames(rand.up1) <- colnames(rand.down1) <- "gene_name" rand.up <- merge(all3, rand.up1, by="gene_name") h_rand.up <- hist(rand.up$class, breaks=bins, plot=FALSE) rand.down <- merge(all3, rand.down1, by="gene_name") h_rand.down <- hist(rand.down$class, breaks=bins, plot=FALSE) rand.up.tss <- nrow(rand.up1) rand.down.tss <- nrow(rand.down1) en_rand.up <- log2((h_rand.up$counts/rand.up.tss)/(h_all$counts/genomic.tss)) en_rand.down <- log2((h_rand.down$counts/rand.down.tss)/(h_all$counts/genomic.tss)) sav <- list() sav[[1]] <- en_rand.up sav[[2]] <- en_rand.down out[[i]] <- sav print(i) } # calculate per node distributions to determing threshold levels up.mean <- NULL up.sd <- NULL down.mean <- NULL down.sd <- NULL for(i in 1:length(out[[1]][1][[1]])){ up <- NULL down <- NULL for(j in 1:100){ up <- c(up, out[[j]][1][[1]][i]) up <- up[up != "-Inf"] down <- c(down, out[[j]][2][[1]][i]) down <- down[down != "-Inf"] print(j) } up.mean <- c(up.mean, mean(up)) up.sd <- c(up.sd,sd(up)) down.mean <- c(down.mean, mean(down)) down.sd <- c(down.sd, sd(down)) print("i==.....................") print(i) print("......................") } up.u.limit <- up.mean+2*up.sd up.l.limit <- up.mean-2*up.sd down.u.limit <- down.mean+2*down.sd down.l.limit <- down.mean-2*down.sd ############################ en_blm.up <- log2((h_blm.up$counts/blm.up.tss)/(h_all$counts/genomic.tss)) # raplace values with 0 if the probability of the observation is not outside of the 95% CI en_blm.up2 <- NULL for(i in 1:length(en_blm.up)){ v <- en_blm.up[i] if(v > up.l.limit[i] && v < up.u.limit[i]){ v <- 0 } en_blm.up2 <- c(en_blm.up2,v) print(i) } # en_blm.up <- replace(en_blm.up, en_blm.up == "-Inf",-3) # en_blm.up <- replace(en_blm.up, en_blm.up < -3,-3) en_blm.down <- log2((h_blm.down$counts/blm.down.tss)/(h_all$counts/genomic.tss)) en_blm.down2 <- NULL for(i in 1:length(en_blm.down)){ v <- en_blm.down[i] if(v > down.l.limit[i] && v < down.u.limit[i]){ v <- 0 } en_blm.down2 <- c(en_blm.down2,v) print(i) } # en_blm.down <- replace(en_blm.down, en_blm.down == "-Inf",-3) # en_blm.down <- replace(en_blm.down, en_blm.down < -3,-3) # set scale col3 <- colorRampPalette(c("red","gray","green"))(50) plot(som_model, type="mapping", bgcol = col3[round((round(en_blm.up2,2)+2.5)*10)], main = "BS: Up-Regulated") plot(som_model, type="mapping", bgcol = col3[round((round(en_blm.down2,2)+2.5)*10)], main = "BS: Down-Regulated") ################ ## Werner Syndrome # import dataset for Werner Syndrome Patients bdat <- read.table("differentially-expressed_genes_WS.csv", sep=",", header=TRUE) # split gene sets into up- and down-regulated blm.up1 <- as.data.frame(bdat[with(bdat, logFC > 0), ]$gene) blm.down1 <- as.data.frame(bdat[with(bdat, logFC < 0), ]$gene) colnames(blm.up1) <- colnames(blm.down1) <- "gene_name" # merge blm.up1 and blm.down1 datasets with quad --> full TSS and PQS lists blm.up <- merge(all3, blm.up1, by="gene_name") # isolate entries of all3 from differentially-expressed genes h_blm.up <- hist(blm.up$class, breaks=bins, plot=FALSE) blm.down <- merge(all3, blm.down1, by="gene_name") h_blm.down <- hist(blm.down$class, breaks=bins, plot=FALSE) # calculate numbers of genes in respective datasets genomic.tss <- length(unique(quad$unique.gene.number)) # calculate number of genes in blm up and down datasets (not all genes from blm.up1 and blm.down1 lerge with quad dataset). # Calcualte the number of genes that actually merge into dataset blm.up.tss1 <- merge(quad, blm.up1, by="gene_name") blm.up.tss <- length(unique(blm.up.tss1$gene_name)) blm.down.tss1 <- merge(quad, blm.down1, by="gene_name") blm.down.tss <- length(unique(blm.down.tss1$gene_name)) ## botstrap simulation to identify threshold for enrichment/depletion significance for each cluster # make random gene sets gene.symbols <- unique(quad$gene_name) out <- list() for(i in 1:100){ rand.up1 <- data.frame(sample(gene.symbols, blm.up.tss)) rand.down1 <- data.frame(sample(gene.symbols, blm.down.tss)) colnames(rand.up1) <- colnames(rand.down1) <- "gene_name" rand.up <- merge(all3, rand.up1, by="gene_name") h_rand.up <- hist(rand.up$class, breaks=bins, plot=FALSE) rand.down <- merge(all3, rand.down1, by="gene_name") h_rand.down <- hist(rand.down$class, breaks=bins, plot=FALSE) rand.up.tss <- nrow(rand.up1) rand.down.tss <- nrow(rand.down1) en_rand.up <- log2((h_rand.up$counts/rand.up.tss)/(h_all$counts/genomic.tss)) en_rand.down <- log2((h_rand.down$counts/rand.down.tss)/(h_all$counts/genomic.tss)) sav <- list() sav[[1]] <- en_rand.up sav[[2]] <- en_rand.down out[[i]] <- sav print(i) } # calculate per node distributions to determing threshold levels up.mean <- NULL up.sd <- NULL down.mean <- NULL down.sd <- NULL for(i in 1:length(out[[1]][1][[1]])){ up <- NULL down <- NULL for(j in 1:100){ up <- c(up, out[[j]][1][[1]][i]) up <- up[up != "-Inf"] down <- c(down, out[[j]][2][[1]][i]) down <- down[down != "-Inf"] print(j) } up.mean <- c(up.mean, mean(up)) up.sd <- c(up.sd,sd(up)) down.mean <- c(down.mean, mean(down)) down.sd <- c(down.sd, sd(down)) print("i==.....................") print(i) print("......................") } up.u.limit <- up.mean+2*up.sd up.l.limit <- up.mean-2*up.sd down.u.limit <- down.mean+2*down.sd down.l.limit <- down.mean-2*down.sd ############################ en_blm.up <- log2((h_blm.up$counts/blm.up.tss)/(h_all$counts/genomic.tss)) # raplace values with 0 if the probability of the observation is not outside of the 95% CI en_blm.up2 <- NULL for(i in 1:length(en_blm.up)){ v <- en_blm.up[i] if(v > up.l.limit[i] && v < up.u.limit[i]){ v <- 0 } en_blm.up2 <- c(en_blm.up2,v) print(i) } # en_blm.up <- replace(en_blm.up, en_blm.up == "-Inf",-3) # en_blm.up <- replace(en_blm.up, en_blm.up < -3,-3) en_blm.down <- log2((h_blm.down$counts/blm.down.tss)/(h_all$counts/genomic.tss)) en_blm.down2 <- NULL for(i in 1:length(en_blm.down)){ v <- en_blm.down[i] if(v > down.l.limit[i] && v < down.u.limit[i]){ v <- 0 } en_blm.down2 <- c(en_blm.down2,v) print(i) } # en_blm.down <- replace(en_blm.down, en_blm.down == "-Inf",-3) # en_blm.down <- replace(en_blm.down, en_blm.down < -3,-3) # set scale col3 <- colorRampPalette(c("red","gray","green"))(50) plot(som_model, type="mapping", bgcol = col3[round((round(en_blm.up2,2)+2.5)*10)], main = "WS: Up-Regulated") plot(som_model, type="mapping", bgcol = col3[round((round(en_blm.down2,2)+2.5)*10)], main = "WS: Down-Regulated") # plot color scale col3 <- colorRampPalette(c("red","gray","green"))(300) x <- 1:300 y <- rep(1,300) plot(x,y,col=col3[x],pch=16) dev.off()