# Anne de Jong, May 2015, T-REx pipeline # # # ----------------------------------- load the library-------------------------------------------------- library(statmod) library(ggplot2) library(reshape) library(cluster) library(plyr) library(grid) library(gplots) library(igraph) library(rjson) library(limma) library(edgeR) lib_RNAseq <- "T-REx_functions.R" # --------------------------------------- If you do not have the packages -------------------------------------------------------- # install.packages("ggplot2") # install.packages("statmod") # install.packages("reshape") # install.packages("cluster") # install.packages("plyr") # install.packages("grid") # install.packages("rjson") # install.packages("igraph") # install.packages("gplots") # source("http://bioconductor.org/biocLite.R") # biocLite() # biocLite("limma") # biocLite("edgeR") # -------------------------------------------------------------------------------- for local use------------------------------------------------------ # sessiondir="E:\\Google Drive\\WERK\\Article - RNAseq\\Example_PNAS_CodY" # sessiondir="E:\\tmp\\mirjam" # sessiondir="E:\\tmp\\Example_PNAS_CodY" # experiment="my_experiment" # rpkm_file="RPKM.txt" # factors_file="Factors.txt" # class_file="Class.txt" # contrast_file="Contrasts.txt" # lib_RNAseq <- "E:\\OwnCloud\\molgentools\\rnaseq\\g2d_RNAseq_multifactor_functions.R" # -------------------------------------------------------------------------------- for webserver use ------------------------------------------------- # load parameters from command line or call from the webserver # strange 'known!' thing with R is that is counts all the arguments, so inclusing the options :( # R command line: R --vanilla --slave --args . robyn -r bacillus_cereus_merged_normalized.rpkm -f Factors.txt -c Contrasts.txt < /usr/molgentools/rnaseq/RNAseq_multifactor.R sessiondir <- commandArgs()[5] experiment <- commandArgs()[6] rpkm_file <- commandArgs()[7] factors_file <- commandArgs()[8] contrast_file <- commandArgs()[9] class_file=commandArgs()[10] # ----------------------------------- custom settings / or parameters from webserver-------------------------------------------------- # make web folder from the local folder websessiondir <- gsub("/tmp/genome2d", "genome2d_results", sessiondir) source(lib_RNAseq) setwd(sessiondir) # comment SINK if you run this script locally sink("00.R.remarks.log", append=TRUE, split=FALSE) # container for arguments and data tables in jSON format jSON <- NULL jSON$arguments['sessiondir'] <- sessiondir jSON$arguments['experiment'] <- experiment jSON$arguments['rpkm_file'] <- rpkm_file jSON$arguments['factors_file'] <- factors_file jSON$arguments['class_file'] <- class_file jSON$arguments['contrast_file'] <- contrast_file # graphics settings pointsizes <- 48 # Image resolution # 720p x_720p <- 1280 y_720p <- 720 # FullHD x_1k <- 1920 y_1k <- 1080 # QHD x_2k <- 2560 y_2k <- 1440 # UHD 4k x_4k <- 3840 y_4k <- 2160 # FUHD x_8k <- 7680 y_8k <- 4320 logbook <- c() logbook <- write_log(logbook, "Pipeline started") #------------------------------------ Functions -------------------------------------------------- logbook <- write_log(logbook, "Functions loaded") #------------------------------------ EdgeR -------------------------------------------------- ## Multi Factorial design # 1. read the factorial design logbook <- write_log(logbook, "# 1. Read the factorial design") MF <- read.delim(factors_file,sep="\t") MF # generic import of undefined number of factors MF.dim <- dim(MF)[2] MF.factors <- MF[,2] # first factor # add more factors if there are any (MF.dim >2) if (MF.dim >2) { for (i in 3:MF.dim) { MF.factors <- paste(MF.factors,MF[,i],sep=".") } } MF.group <- factor(MF.factors) MF.group.all <- cbind(MF,Group=MF.group) MF.group.levels <- as.list(levels(MF.group.all$Group)) MF.group.all # 2. Read the RNAseq data logbook <- write_log(logbook, "# 2. Read the RNAseq data") x <- read.delim(rpkm_file,sep="\t", row.names="key") y <- DGEList(counts=x,group=MF.group) head(x) x.melt <- melt(x) colnames(x.melt) =c("Experiment","Value") x.melt$log2Value <- log(x.melt$Value,2) my_barplot_palette <- colorRampPalette(c("#B4AF91", "#C03000"))(n = length(x)) filename <- paste(experiment, "colored_boxplot","png", sep=".") png(file = filename, width = x_1k, height = y_1k, units = "px", pointsize = 12, bg = "white") p <- ggplot(x.melt, aes(Experiment,log2Value) ) + geom_boxplot( aes(fill=Experiment)) + ggtitle("Boxplot of Expression Values of Experiments") + theme_g2d_barplot + theme(legend.position="none") + scale_fill_manual(values = my_barplot_palette) print(p) dev.off() # 3. Read the class file logbook <- write_log(logbook, "# 3. Read the class file") my_class_genes <- read.delim(class_file,sep="\t") colnames(my_class_genes) =c("GeneID","color","Group") # 4. Create the design matrix logbook <- write_log(logbook, "# 4. Create the design matrix") designMF <- model.matrix(~0+MF.group, data=y$samples) designMF colnames(designMF) <- levels(MF.group) designMF # 5. Normalization logbook <- write_log(logbook, "# 5. Normalization ") # raw data filename <- paste(experiment, "Box_plot_raw","png", sep=".") png(file = filename, width = x_4k, height = y_4k, units = "px", pointsize = 12, bg = "white") boxplot(as.data.frame(log(y$counts,2)), main="RPKM raw data (log2)", ylab="log2(counts)", notch=FALSE) dev.off() y<-calcNormFactors(y) # 6. Calculate mean values of the replicates logbook <- write_log(logbook, "# 6. Calculate mean values of the replicates") # Take mean of replicates if are are replicates y_groupheader <- y$counts colnames(y_groupheader)<-MF.group mean_table <- c() for (i in MF.group.levels) { print(i) my_cols<-which(colnames(y_groupheader) %in% i) print(my_cols) if (length(my_cols)>1) { my_mean<-rowMeans(y_groupheader[,my_cols]) } else { my_mean<-(y_groupheader[,my_cols]) } mean_table<-cbind(mean_table,my_mean) } colnames(mean_table)<-MF.group.levels head(mean_table) filename <- paste(experiment, "Mean_signals_of_replicates.txt", sep=".") write.table(mean_table, file=filename , quote=F, sep="\t", col.names=NA) jSON$Mean_signals_of_replicates <- filename # Mean Signals Normalized per gene for heatmaps (x-min)/max mean_table_log <- log(mean_table,2) head(mean_table_log) filename <- paste(experiment, "Mean_signals_log2.txt", sep=".") write.table(mean_table_log, file=filename , quote=F, sep="\t", col.names=NA) jSON$Mean_signals_log2 <- filename mean_table_log_scaled <- round((100 * mean_table_log / max(mean_table_log)), digits=3) head(mean_table_log_scaled) filename <- paste(experiment, "Mean_signals_log2_scaled.txt", sep=".") write.table(mean_table_log_scaled, file=filename , quote=F, sep="\t", col.names=NA) jSON$Mean_signals_log2_scaled <- filename mean_table_log_scaled_per_gene <- round(t(apply(mean_table_log, 1, function(x) (x-min(x))/(max(x)-min(x)))), digits=4) filename <- paste(experiment, "Mean_signals_log2_scaled_per_gene.txt", sep=".") write.table(mean_table_log_scaled_per_gene, file=filename , quote=F, sep="\t", col.names=NA) jSON$Mean_signals_log2_scaled_per_gene <- filename # add class (color and groups) to the mean table mean_table_log_scaled_class <- mean_table_log_scaled mean_table_log_scaled_class <- cbind(mean_table_log_scaled_class, row.names(mean_table_log_scaled_class)) colnames(mean_table_log_scaled_class) <- c(colnames(mean_table_log_scaled), "GeneID") mean_table_log_scaled_class <- merge(mean_table_log_scaled_class, my_class_genes, by="GeneID",all.x=T) mean_table_log_scaled_class$color <- ifelse(is.na(mean_table_log_scaled_class$color), "grey", as.character(mean_table_log_scaled_class$color)) # add a default color if the color is NA mean_table_log_scaled_class$Group <- ifelse(is.na(mean_table_log_scaled_class$Group), "NA", as.character(mean_table_log_scaled_class$Group)) # add a default Group if the Group is NA head(mean_table_log_scaled_class) filename <- paste(experiment, "Mean_signals_log2_scaled_CLASSES.txt", sep=".") write.table(mean_table_log_scaled_class, file=filename , quote=F, sep="\t", col.names=NA) jSON$Mean_signals_log2_scaled_CLASSES <- filename # 7. Graphics for each subClass on the basis of the group column logbook <- write_log(logbook, "# 7. Signal plots of Class genes") mean_table_class <- subset(mean_table, row.names(mean_table) %in% my_class_genes$GeneID) mean_table_class_log <- subset(mean_table_log, row.names(mean_table_log) %in% my_class_genes$GeneID) head(mean_table_class) if (nrow(mean_table_class)<5) { logbook <- write_log(logbook, "# ERROR - 7. Skipping Class signal plots due to low numbers classes found") } else { #draw a graph for each class color logbook <- write_log(logbook, "- a) Create Graphics for mean signals of each subClass") Class_signal_plots(mean_table_class, "Mean_Signals_subClass") logbook <- write_log(logbook, "- b) Create Graphics for mean log2(signal) of each subClass") Class_signal_plots(mean_table_class_log, "Mean_Signals_subClass_log2") } # 8. Correlation matrix of All Class genes, using mean signal of replicates logbook <- write_log(logbook, "# 8. Correlation matrix of All Class genes, using mean signal of replicates") cor_mean_table_class <- cor(t(mean_table_class)) filename <- paste(experiment, "Correlation_matrix_Class_genes.txt", sep=".") write.table(cor_mean_table_class, file=filename , quote=F, sep="\t", row.names=F) jSON$Correlation_matrix_Class_genes <- filename filename <- paste(experiment, "Correlation_matrix_Class_genes.png", sep=".") png(file = filename, width = x_2k, height = y_2k, units = "px", pointsize = 12, bg = "white") m1 <- melt(cor_mean_table_class, id = row.names) qplot(x=X1, y=X2, data=as.data.frame(m1),main = "Correlation matrix of Class genes", xlab="Class genes", ylab="Class genes", fill=value, geom="tile", zlim= c(0,1)) + theme_g2d_plot_cor + scale_fill_gradient2(limits=c(-1, 1)) dev.off() # 9. Correlation matrix of subClass genes, using mean signal of replicates logbook <- write_log(logbook, "# 9. Correlation matrix of subClass genes, using mean signal of replicates") SubClass_correlation_plots(cor_mean_table_class, "SubClass_correlation_matrix" ) # 10. MDS plot logbook <- write_log(logbook, "# 10. MDS plot") filename <- paste(experiment, "MDS.plot","png", sep=".") png(file = filename, width = x_1k, height = y_1k, units = "px", pointsize = 12, bg = "white") par(mar=c(6,8,3,6)) # bottom, left, top, right plotMDS(y, col="blue", top=500, main="MDS Plot of Experiments", cex.main=2,cex.axis=1.8, cex.lab=2.5, cex=3 ) dev.off() # 11. Library size logbook <- write_log(logbook, "# 11. Library size") # old version par(mar=c(6,6,3,16)) # bottom, left, top, right # old version my_barplot_palette <- colorRampPalette(c("#B4AF91", "#787746", "#40411E", "#32331D", "#C03000"))(n = 10) # old version barplot(y$samples$lib.size*1e-6, legend =y$samples$group, names=row.names(y$samples), xlab="Samples", ylab="Depth (millions)", col=my_barplot_palette, cex.main=2,cex.axis=1.8,cex.lab=2.5, main="Library Size", beside = TRUE, ) # prepare data for barplot my_samples <- y$samples my_samples$experiment <- row.names(my_samples) my_samples$lib.size <- as.numeric(my_samples$lib.size) my_samples$experiment <-factor(my_samples$experiment, levels=my_samples[order(my_samples$experiment), "experiment"]) #my_barplot_palette <- colorRampPalette(c("#0057FF", "#001640"))(n = dim(my_samples)[1]) my_barplot_palette <- colorRampPalette(c("#B4AF91", "#787746", "#40411E", "#32331D", "#C03000"))(n = dim(my_samples)[1]) bar_chart <- ggplot(my_samples, aes(experiment, lib.size, fill=factor(lib.size))) + ggtitle("Library Sizes") + theme_g2d_barplot + geom_bar(stat="identity") + xlab("Experiments") + ylab("Number of reads") + scale_fill_manual(values = my_barplot_palette) + theme_g2d_barplot + theme(legend.position="none") + scale_y_continuous(expand = c(0,0)) filename <- paste(experiment, "Library_size","png", sep=".") png(file = filename, width = x_720p, height = y_720p, units = "px", pointsize = 12, bg = "white") print(bar_chart) dev.off() # 12. Calculation of Dispersion logbook <- write_log(logbook, "# 12. Calculation of Dispersion") y <- estimateGLMCommonDisp(y,designMF) y <- estimateGLMTrendedDisp(y,designMF, method="bin.loess") y <- estimateGLMTagwiseDisp(y,designMF) # 13. Load the Contrasts logbook <- write_log(logbook, "# 13. Load the Contrasts") # contrast from file my_contrasts <- readLines(contrast_file) my_contrasts contrast.matrix <- makeContrasts(contrasts=my_contrasts, levels=designMF) contrast.matrix contrast.matrix.dim <- dim(contrast.matrix)[2] # 14. Fit the data logbook <- write_log(logbook, "# 14. Fit the data") fit.design <- glmFit(y, designMF) lrt.design <- glmLRT(fit.design, designMF) filename <- paste(experiment, "Normalised_data_table.txt", sep=".") write.table(lrt.design$fitted.values, file=filename , quote=F, sep="\t", col.names=NA) jSON$Normalised_data_table <- filename # 15. Make correlation Matrix of experiments logbook <- write_log(logbook, "# 15. Make correlation Matrix of experiment") logFC <- predFC(y,designMF,prior.count=1,dispersion=0.05) corLogFC <- cor(logFC) filename <- paste(experiment, "Correlation_matrix_experiments.txt", sep=".") write.table(corLogFC, file=filename , quote=F, sep="\t", row.names=F) jSON$Correlation_matrix_experiments <- filename filename <- paste(experiment, "Correlation_matrix_experiments.png", sep=".") png(file = filename, width = x_1k, height = y_1k, units = "px", pointsize = 12, bg = "white") m1 <- melt(corLogFC, id = row.names) qplot(x=X1, y=X2, data=as.data.frame(m1),main = "Correlation matrix experiments", xlab="Experiments", ylab="Experiments", fill=value, geom="tile") + theme_g2d_plot_cor dev.off() # 16. Loop through the Contrasts and draw for each Contrast a: ## - DE (Differential Expression ) table ## - Significant Fold Change Plots ## - MA plot ## - Volcano plot ## - Fold change distribution plot ## - Count distribution plot logbook <- write_log(logbook, "# 16. Loop through the Contrasts") tophits <- c() tophits_highfold <-c() tophits_no_background <-c() filenames.MAplots <- list() filenames.VolcanoPlots <- list() filenames.SigChanged_plots <- list() filenames.Significant_Changed_Genes_tables <- list() my_overview <- c() my_GeneNetwork <- c() volcano_tables <- NULL for (i in 1:contrast.matrix.dim) { logbook <- write_log(logbook, my_contrasts[i] ) lrt <- glmLRT(fit.design , contrast=contrast.matrix[,i]) my.DGEList.glmFit = topTags(lrt, n=100000, adjust.method="BH") # A. DE (Differential Expression ) table output = cbind(rownames(my.DGEList.glmFit$table), my.DGEList.glmFit$table) # add fold change output <- cbind(output, sign(output$logFC)*(2^abs(output$logFC))) colnames(output) =c("GeneID","logFC", "logCPM", "LR", "pvalue", "adj_pvalue", "Fold") output$minFDR <- -log(output$adj_pvalue,2) filename <- paste(experiment, "Differential_Expression", i, my_contrasts[i] ,"txt", sep=".") write.table(output, file=filename , quote=F, sep="\t", row.names=F) jSON$Differential_Expression[my_contrasts[i]] <- filename # B. For the html table only export the up and down regulated genes tophits_ONLY <- output[( abs(output$Fold)>=2 & output$adj_pvalue<0.05),] tophits_ONLY <- tophits_ONLY[with(tophits_ONLY, order(-Fold)), ] tophits_ONLY$logFC <- sprintf("%.2f", tophits_ONLY$logFC) tophits_ONLY$logCPM <- sprintf("%.2f", tophits_ONLY$logCPM) tophits_ONLY$LR <- sprintf("%.1f", tophits_ONLY$LR) #tophits_ONLY$minFDR <- -log(tophits_ONLY$adj_pvalue,2) tophits_ONLY$minFDR <- sprintf("%.2f", tophits_ONLY$minFDR) tophits_ONLY$pvalue <- sprintf("%.1e", tophits_ONLY$pvalue) tophits_ONLY$adj_pvalue <- sprintf("%.1e", tophits_ONLY$adj_pvalue) tophits_ONLY$Fold <- sprintf("%.1f", tophits_ONLY$Fold) filename <- paste(experiment, "Significant_Changed_Genes", i, my_contrasts[i] ,"txt", sep=".") filenames.Significant_Changed_Genes_tables[i] <- filename write.table(tophits_ONLY, file=filename , quote=F, sep="\t", row.names=F) # C. GeneNetwork; Add tophits for each contrast to network compatible table my_GeneNetwork <- rbind(my_GeneNetwork, data.frame(Contrasts=my_contrasts[i], GeneID=tophits_ONLY$GeneID, logFC=tophits_ONLY$logFC, adj_pvalue=tophits_ONLY$adj_pvalue)) # E. Add data to overview table total<- length(output$GeneID) up <- length(output[( output$Fold >= 2 & output$adj_pvalue <= 0.05),]$GeneID) down <- length(output[( output$Fold <= -2 & output$adj_pvalue <= 0.05),]$GeneID) my_overview <- rbind(my_overview, data.frame(Contrasts=my_contrasts[i], Total=total, Up=up, Down=down, TableFilename=filename)) # D. Plot tophits_ONLY for all genes filename <- paste(experiment, "Significant_Changed_Genes", i, my_contrasts[i] ,"png", sep=".") filenames.SigChanged_plots[i] <- filename png(file = filename, width = x_2k, height = y_2k, units = "px", pointsize = 12, bg = "white") my_title=paste(experiment, "Significant changed genes", my_contrasts[i], sep=" ") tophits_ONLY$logFC <- as.numeric(tophits_ONLY$logFC) tophits_ONLY$GeneID <-factor(tophits_ONLY$GeneID, levels=tophits_ONLY[order(-tophits_ONLY$logFC), "GeneID"]) tophits_ONLY$Up_Down <- ifelse(tophits_ONLY$logFC>0, "UP", "DOWN") bar_chart <- ggplot(tophits_ONLY, aes(GeneID, logFC, fill=factor(Up_Down), colour = factor(Up_Down)), main = my_title ) + ggtitle(my_title) + theme_g2d_barplot + geom_bar(stat="identity") + xlab("Genes") + ylab("Fold Change (Log2FC)") + scale_colour_manual(breaks = tophits_ONLY$updown, values = c("#FFFAF0","#5494FF")) + scale_fill_manual(breaks = tophits_ONLY$updown, values = c("#FFD312","#0608B2")) print(bar_chart) dev.off() # E. Plot All genes of each class # to do # F. add the GeneIDs to the Tophits list for all experiments tophits <- c(tophits, as.character(output[( abs(output$Fold)>2 & output$adj_pvalue<0.05),]$GeneID)) tophits_highfold <- c(tophits_highfold, as.character(output[( abs(output$Fold)>5 & output$adj_pvalue<0.01),]$GeneID)) tophits_no_background <- c(tophits_no_background, as.character(output[( abs(output$Fold)>1.4 & output$adj_pvalue<0.20),]$GeneID)) # G. Add all individual files to one file for TMEV or other downstream programs my_columns <- cbind(rownames(output), output$logFC) colnames(my_columns) = c("GeneID","logFC") if (i == 1 ) { TMEV <- my_columns TMEV.colnames <- c("GeneID",my_contrasts[i]) ; } if (i > 1 ) { TMEV<-merge(TMEV, my_columns, by="GeneID",all.x=T) TMEV.colnames <- c(TMEV.colnames,my_contrasts[i]) ; } colnames(TMEV) = TMEV.colnames # H. Select data for colouring classes my_class <- output[which(row.names(output) %in% my_class_genes$GeneID),] my_class <- merge(my_class, my_class_genes, by="GeneID",all.x=T) colnames(my_class) =c("GeneID","logFC", "logCPM", "LR", "pvalue", "adj_pvalue", "Fold", "minFDR", "color") # Select data with 0 expression and mark with X it in the plots my_null_class <- output[which(row.names(output) %in% row.names(lrt$fitted.values)[lrt$fitted.values==0]),] my_null_class # I. - MA plots filename <- paste(experiment, "MAplot", i, my_contrasts[i] ,"png", sep=".") filenames.MAplots[i] <- filename plottitle <- paste("MAplot", experiment, "contrast=", my_contrasts[i] , sep=" ") png(file = filename, width = x_1k, height = y_1k, units = "px", pointsize = 12, bg = "white") par(mar=c(6,6,3,2)) # bottom, left, top, right plot(my.DGEList.glmFit$table$logCPM, my.DGEList.glmFit$table$logFC, col="grey39", cex.main=2,cex.axis=1.8,cex.lab=2.5, main=plottitle, pch=20, ylab="M - log2 ratio", xlab="A - mean expression level") points(my_class$logCPM, my_class$logFC, pch=19,lwd=1.2, col=as.character(my_class$color)) points(my_null_class$logCPM, my_null_class$logFC, pch=4,lwd=1, col="blue") rect(-100,-1, 100,1, col = "blue", density = 10, border = "transparent") text(output$logCPM[1:20], output$logFC[1:20], labels = row.names(output[1:20,]), pos = 4, cex=0.8, col="blue") abline(h=0) dev.off() # J - Volcano Data # Export Volcano data including class data and mean signal volcano_table <- cbind(row.names(output), output$logFC, output$minFDR, output$logCPM) colnames(volcano_table) <- c("GeneID", "logFC", "minFDR", "logCPM") volcano_table <- merge(volcano_table, my_class_genes, by="GeneID",all.x=T) volcano_table$color <- ifelse(is.na(volcano_table$color), "grey", as.character(volcano_table$color)) # add a default color if the color is NA volcano_table$Group <- ifelse(is.na(volcano_table$Group), "NA", as.character(volcano_table$Group)) # add a default Group if the Group is NA volcano_table$minFDR <- as.numeric(as.character(volcano_table$minFDR)) volcano_table <- volcano_table[ order(volcano_table$minFDR,decreasing = TRUE), ] # add signal data and scale it to color all above the median volcano_table$logCPM <- as.numeric(as.character(volcano_table$logCPM)) volcano_table$sfere <- volcano_table$logCPM - median(volcano_table$logCPM) volcano_table$sfere <- ifelse(volcano_table$sfere<0, 0, volcano_table$sfere) head(volcano_table) filename <- paste(experiment, "Volcano_plot", i, my_contrasts[i] ,"txt", sep=".") write.table(volcano_table, file=filename , quote=F, sep="\t", col.names=NA) jSON$VolcanoPlots[my_contrasts[i]] <- filename # outside the loop we want to filter on tophits_no_background, so we store this table volcano_tables[i] <- list(volcano_table) # Volcano Plot filename <- paste(experiment, "Volcano_plot", i, my_contrasts[i] ,"png", sep=".") filenames.VolcanoPlots[i] <- filename plottitle <- paste("Volcano plot of", experiment, "contrast=", my_contrasts[i] , sep=" ") png(file = filename, width = x_1k, height = y_1k, units = "px", pointsize = 12, bg = "white") par(mar=c(6,6,3,2)) # bottom, left, top, right plot(my.DGEList.glmFit$table$logFC, -log(my.DGEList.glmFit$table$FDR,2), col="grey39", cex.main=2,cex.axis=1.8,cex.lab=2.5, main=plottitle, pch=20, xlab="log2 ratio", ylab="-log2(FDR p-value)") points(my_class$logFC, -log(my_class$adj_pvalue,2), pch=19,lwd=1.2, col=as.character(my_class$color)) points(my_null_class$logFC, -log(my_null_class$adj_pvalue,2), pch=4,lwd=1, col="blue") text(output$logFC[1:10], -log(output$adj_pvalue[1:10],2), labels = row.names(output[1:10,]), pos = 3, cex=0.9, col="blue") lines(c(-1,-1),c(-100,1000)) lines(c(1,1),c(-100,1000)) lines(c(-3.3,-3.3),c(-100,1000), lty=2) lines(c(3.3,3.3),c(-100,1000), lty=2) lines(c(-100,100),c(4.4,4.4)) lines(c(-100,100),c(6.6,6.6), lty=2) rect(-100,-5,100,4.1, col = "red", density = 10, border = "transparent") rect(-0.95,-1,0.95,1000, col = "red", density = 10, border = "transparent") dev.off() # K. - Fold change distribution plots filename <- paste(experiment, "Fold_distribution", i, my_contrasts[i] ,"png", sep=".") plottitle <- paste("Fold change distribution", experiment, "contrast=", my_contrasts[i] , sep=" ") png(file = filename, width = 1280, height = 1024, units = "px", pointsize = 12, bg = "white") par(mar=c(6,6,3,2)) # bottom, left, top, right plot(rev(output$logFC), cex.main=2,cex.axis=1.8,cex.lab=2.5, main=plottitle, pch=20, xlab="genes (sorted on p-value)", ylab="log2(Fold Change)") rect(0,-1, nrow(output),1, col = "red", density = 30, border = "transparent") dev.off() # L - Count distribution plots filename <- paste(experiment, "Count_distribution", i, my_contrasts[i] ,"png", sep=".") plottitle <- paste("Count distribution", experiment, "contrast=", my_contrasts[i] , sep=" ") png(file = filename, width = 1280, height = 1024, units = "px", pointsize = 12, bg = "white") par(mar=c(6,6,3,2)) # bottom, left, top, right plot(sort(output$logCPM), cex.main=2,cex.axis=1.8,cex.lab=2.5, main=plottitle, pch=20, xlab="genes used for normalization", ylab="log2(Counts)") rect(0,-100, 0.2*nrow(output),100, col = "red", density = 30, border = "transparent") rect(0.8*nrow(output),-100, nrow(output),100, col = "red", density = 30, border = "transparent") dev.off() } # save filenames filename <- paste(experiment, "MAplots.filenames.txt", sep=".") write.csv(filenames.MAplots, file=filename) filename <- paste(experiment, "VolcanoPlots.filenames.txt", sep=".") write.csv(filenames.VolcanoPlots, file=filename) filename <- paste(experiment, "SigChanged_plots.filenames.txt", sep=".") write.csv(filenames.SigChanged_plots, file=filename) filename <- paste(experiment, "Significant_Changed_Genes_tables.filenames.txt", sep=".") write.csv(filenames.Significant_Changed_Genes_tables, file=filename) # save my_overview table filename <- paste(experiment, "Contrasts_overview.txt", sep=".") write.table(my_overview, file=filename , quote=F, sep="\t", row.names=FALSE) jSON$Contrasts_overview <- filename # Store data for volcano plots where at least one gene is above the background logbook <- write_log(logbook, "# 16.5 Write filtered Volcano plots data" ) # Create a items list containing the plots items <- list() head(as.data.frame(volcano_tables)) for (i in 1:contrast.matrix.dim) { volcano_filtered <- as.data.frame(volcano_tables[i]) volcano_filtered <- volcano_filtered[which(volcano_filtered$GeneID %in% tophits_no_background),] filename <- paste(experiment, "Volcano_plot_filtered", i, my_contrasts[i] ,"txt", sep=".") write.table(volcano_filtered, file=filename , quote=F, sep="\t", col.names=NA) #jSON$VolcanoPlotsFiltered[my_contrasts[i]] <- filename # JSON: Description of each plot item <- NULL item$name <- my_contrasts[i] item$type <- "scatterplot" item$settingsFile <- 'item-settings/molgen-volcanoplot.json' settings <- NULL settings$filename <- paste(websessiondir, filename, sep="/") item$settings <- toJSON(settings) # add the description to the items list items <- c(items, toJSON(item)) } # 17. Draw the network using iGraph logbook <- write_log(logbook, "# 17. Draw Networks" ) # The complete network filename <- paste(experiment, "GeneNetwork_EdgeList.txt", sep=".") write.table(my_GeneNetwork, file=filename , quote=F, sep="\t", row.names=FALSE) jSON$GeneNetwork_EdgeList <- filename my_Error <- tryCatch( Draw_GeneNetwork(my_GeneNetwork, "GeneNetwork_of_Contrasts", layout.reingold.tilford ), error=function(e) e ) # The high fold change network my_Error <- tryCatch( { my_GeneNetwork_high <- my_GeneNetwork[( abs(as.numeric(as.character(my_GeneNetwork$logFC)))>=4 & as.numeric(as.character(my_GeneNetwork$adj_pvalue))<0.01 ),] # frequency freq_table <- table(as.character(my_GeneNetwork_high$GeneID)) filename <- paste(experiment, "GeneNetwork_EdgeList_high.txt", sep=".") write.table(my_GeneNetwork_high, file=filename , quote=F, sep="\t", row.names=FALSE) jSON$GeneNetwork_EdgeList_high <- filename Draw_GeneNetwork(my_GeneNetwork_high, "GeneNetwork_of_Contrasts_high", layout.reingold.tilford ) }, error=function(e) e ) # 18. TMEV: Combine the output of all the Contrasts to one file that is compatible with TMEV (TIGR Multi Experiment Viewer) logbook <- write_log(logbook, "# 18. Combine the output of all the Contrasts to one file that is compatible with TMEV" ) colnames(TMEV) = TMEV.colnames filename <- paste(experiment, "TMEV", "txt", sep=".") write.table(TMEV, file=filename , quote=F, sep="\t", row.names=F) row.names(TMEV) <- TMEV$GeneID TMEV[1] <- NULL TMEV_TOPHITS <- TMEV[which(row.names(TMEV) %in% tophits),] TMEV_TOPHITS <- TMEV_TOPHITS[ order(TMEV_TOPHITS[,1],decreasing = TRUE), ] TMEV_TOPHITS_highfold <- TMEV[which(row.names(TMEV) %in% tophits_highfold),] TMEV_TOPHITS_highfold <- TMEV_TOPHITS_highfold[ order(TMEV_TOPHITS_highfold[,1],decreasing = TRUE), ] # To do: Expression profile plots of class genes and tophit genes #CLASS_expression <- y[which(row.names(y) %in% tophits),] #CLASS_expression <- y[ order(TOPHITS_expression[,1],decreasing = TRUE), ] #TOPHITS_expression <- y[which(row.names(y) %in% tophits),] #TOPHITS_expression <- y[ order(TOPHITS_expression[,1],decreasing = TRUE), ] #TOPHITS_expression_highfold <- y[which(row.names(y) %in% tophits_highfold),] #TOPHITS_expression_highfold <- y[ order(TOPHITS_expression_highfold[,1],decreasing = TRUE), ] filename <- paste(experiment, "TMEV_TOPHITS_logFC", "txt", sep=".") write.table(TMEV_TOPHITS, file=filename , quote=F, sep="\t", row.names=T, col.names=NA) filename <- paste(experiment, "TMEV_TOPHITS_logFC_highfold", "txt", sep=".") write.table(TMEV_TOPHITS_highfold, file=filename , quote=F, sep="\t", row.names=T, col.names=NA) # ------------------------------------------------------------------------------------------------------------ heatmaps ------------------------------------------------------------------------------------------- # Heatmap settings my_heatmap_palette_signal <- colorRampPalette(c("#FCFFF5", "#D1DBBD", "#91AA9D", "#3E606F", "#193441"))(n = 1000) my_heatmap_palette_ratio <- colorRampPalette(c("#4C3200", "#CC8400", "white","#00006B", "#00004C"))(n = 1000) # orange - blue my_heatmap_margins <- c(14,10) my_heatmap_pointsize <- 24 # 19. Heatmap of experiments to see how replicates perform logbook <- write_log(logbook, "# 19. Heatmaps" ) n <- length(row.names(lrt.design$fitted.values)) normalized_signals <- as.data.frame((lrt.design$fitted.values)) normalized_signals <- normalized_signals[order(normalized_signals[,1]),] normalized_signals <- normalized_signals[round(n*.1):round(n-n*.3),] # remove high and low 20% counts # save the heatmap data to file filename <- paste(experiment, "Heatmap_EXPERIMENTS", "txt", sep=".") write.table(normalized_signals, file=filename , quote=F, sep="\t", row.names=T, col.names=NA) filename <- paste(experiment, "Heatmap_EXPERIMENTS", "png", sep=".") png(file = filename, width = x_2k, height = y_2k, units = "px", pointsize = my_heatmap_pointsize, bg = "white") par(mar=c(15,6,13,2)) # bottom, left, top, right heatmap.2(as.matrix(log2(normalized_signals+0.5)),dendrogram="both",Rowv=TRUE, col=my_heatmap_palette_signal, scale="none", key=T, keysize=1, main="Normalized log2(Signals), Experiments vs Genes",margins=my_heatmap_margins, density.info="none", trace="none",cexCol=0.9, labRow=NA) dev.off() # 20. Heatmaps of TopHits logbook <- write_log(logbook, "# 20. Draw heatmaps of TopHits" ) heatmap_temp <-(t(apply(TMEV_TOPHITS,1,as.numeric))) colnames(heatmap_temp) <- colnames(TMEV_TOPHITS) Anne_heatmap(heatmap_temp, experiment, "Heatmap of TopHits Ratio (logFC)", "Heatmap_TopHits", "ratio") # Heatmap High fold change heatmap_temp <-(t(apply(TMEV_TOPHITS_highfold,1,as.numeric))) colnames(heatmap_temp) <- colnames(TMEV_TOPHITS_highfold) Anne_heatmap(heatmap_temp, experiment, "Heatmap of High Fold Changed TopHits", "Heatmap_TopHits_HighFold", "ratio") # 21. Heatmap: Combine Class data with TopHits data and draw plots of sub classes logbook <- write_log(logbook, "# 21. Combine Class data with TopHits data and draw plots and heatmaps of sub classes" ) CLASS_TOPHITS <- subset(TMEV_TOPHITS, row.names(TMEV_TOPHITS) %in% my_class_genes$GeneID) heatmap_temp <-(t(apply(CLASS_TOPHITS,1,as.numeric))) colnames(heatmap_temp) <- colnames(CLASS_TOPHITS) Anne_heatmap(heatmap_temp, experiment, "Heatmap of TopHits of all Class genes (logFC)", "Heatmap_Class_TopHits", "ratio") # 22. Heatmap of each Class Ratio logbook <- write_log(logbook, "# 22. Make Ratio Heatmap of each Class" ) Class_GroupNames <- levels(my_class_genes$Group) filenames.ClassHeatmaps <- list() filenames.Clusters <- list() for (i in 1:length(Class_GroupNames)) { Class_Genes <- subset(my_class_genes$GeneID, my_class_genes$Group %in% Class_GroupNames[i]) Class_Group <- subset(TMEV, row.names(TMEV) %in% Class_Genes) if (dim(Class_Group)[1] > 1) { filename_prefix <- paste("Heatmap_Class", Class_GroupNames[i], sep=".") filenames.ClassHeatmaps <- c(filenames.ClassHeatmaps, paste(experiment, filename_prefix ,"png", sep=".")) filenames.Clusters <- c(filenames.Clusters, paste(experiment, filename_prefix ,"clusters", "txt", sep=".")) main_title <- paste(experiment, "Heatmap of Contrasts of Class", Class_GroupNames[i] , sep=" ") heatmap_temp <-(t(apply(Class_Group,1,as.numeric))) colnames(heatmap_temp) <- colnames(Class_Group) Anne_heatmap(heatmap_temp, experiment, main_title, filename_prefix, "ratio") } } # save filenames filename <- paste(experiment, "ClassHeatmaps.filenames.txt", sep=".") write.csv(filenames.ClassHeatmaps, file=filename) filename <- paste(experiment, "ClassHeatmaps.clusters.filenames.txt", sep=".") #write.table(unlist(filenames.ClassHeatmaps), file=filename, row.names = FALSE, col.names = FALSE, quote=FALSE) temp <- NULL temp$files <- sapply(filenames.Clusters, paste0, collapse=",") temp$names <- sapply(Class_GroupNames, paste0, collapse=",") write.table(as.data.frame(temp), file=filename, row.names = FALSE, col.names = FALSE, quote=FALSE) # 23. High Fold change logbook <- write_log(logbook, "# 23. High Fold changed genes" ) CLASS_TOPHITS_highfold <- subset(TMEV_TOPHITS_highfold, row.names(TMEV_TOPHITS) %in% my_class_genes$GeneID) filename <- paste(experiment, "CLASS_TOPHITS_highfold_logFC", "txt", sep=".") write.table(CLASS_TOPHITS_highfold, file=filename , quote=F, sep="\t", row.names=T, col.names=NA) # 24. Heatmaps of Class Signal data logbook <- write_log(logbook, "# 24. Signal data Heat Maps" ) CLASS_TOPHITS_signal <- subset(mean_table_class, row.names(mean_table_class) %in% tophits) heatmap_temp <-(t(apply(CLASS_TOPHITS_signal,1,as.numeric))) colnames(heatmap_temp) <- colnames(CLASS_TOPHITS_signal) Anne_heatmap(log(heatmap_temp+1,2), experiment, "Heatmap of all Class genes log2(Signal)", "Heatmap_Class_Signals", "signal") # 24b. Heatmap of Each Class Signal data logbook <- write_log(logbook, "# 24b. Heatmap of Each Class Signal data" ) Class_GroupNames <- levels(my_class_genes$Group) filenames.ClassHeatmapsSignal <- list() filenames.Clusters <- list() for (i in 1:length(Class_GroupNames)) { Class_Genes <- subset(my_class_genes$GeneID, my_class_genes$Group %in% Class_GroupNames[i]) Class_Group <- subset(mean_table_class, row.names(mean_table_class) %in% Class_Genes) if (dim(Class_Group)[1] > 1) { filename_prefix <- paste("Heatmap_Class_Signal", Class_GroupNames[i], sep=".") filenames.ClassHeatmapsSignal <- c(filenames.ClassHeatmapsSignal, paste(experiment, "Heatmap_Class_Signal", Class_GroupNames[i] ,"png", sep=".")) filenames.Clusters <- c(filenames.Clusters, paste(experiment, filename_prefix ,"clusters", "txt", sep=".")) main_title <- paste(experiment, "Heatmap of Class", Class_GroupNames[i], "log2(Signal)" , sep=" ") heatmap_temp <-(t(apply(Class_Group,1,as.numeric))) colnames(heatmap_temp) <- colnames(Class_Group) Anne_heatmap(log(heatmap_temp+1,2), experiment, main_title, filename_prefix, "signal") } } # save filenames filenames.ClassHeatmapsSignal filename <- paste(experiment, "ClassHeatmapsSignal.filenames.txt", sep=".") write.csv(filenames.ClassHeatmapsSignal, file=filename) #write.table(unlist(filenames.ClassHeatmapsSignal), file=filename, row.names = FALSE, col.names = FALSE, quote=FALSE) filename <- paste(experiment, "ClassHeatmapsSignal.clusters.filenames.txt", sep=".") temp <- NULL temp$files <- sapply(filenames.Clusters, paste0, collapse=",") temp$names <- sapply(Class_GroupNames, paste0, collapse=",") write.table(as.data.frame(temp), file=filename, row.names = FALSE, col.names = FALSE, quote=FALSE) #--------------------------------------------------------------------- end heatmaps ------------------------------------------------------------------------- # 25. Signal data Expression plots logbook <- write_log(logbook, "# 25. Signal data expression plots" ) Class_signal_plots(log(CLASS_TOPHITS_signal,2),"Mean_signals_subClass_TOPHITS") # 26-28. K-Means Clustering logbook <- write_log(logbook, "# 26. K-Means Clustering MedianFold" ) if (length(row.names(TMEV_TOPHITS)) > 10) { KMeans_Clustering(TMEV_TOPHITS, "MedianFold", "log2Ratio") } logbook <- write_log(logbook, "# 27. K-Means Clustering Class Tophits" ) if (length(row.names(CLASS_TOPHITS)) > 10) { KMeans_Clustering(CLASS_TOPHITS, "ClassTophits", "log2Ratio") } logbook <- write_log(logbook, "# 28. K-Means Clustering High Fold" ) if (length(row.names(TMEV_TOPHITS_highfold)) > 10) { KMeans_Clustering(TMEV_TOPHITS_highfold, "HighFold", "log2Ratio") } if (length(row.names(CLASS_TOPHITS_signal)) > 10) { KMeans_Clustering(log(CLASS_TOPHITS_signal+0.1,2), "ClassTophitsSignal", "log2Signal") } # 29 Signal K-means CLustering logbook <- write_log(logbook, "# 29. K-Means Clustering of Mean Signals" ) # Clustering all genes is overkill, we will use only genes that changed at least weakly in one of the contrasts my_trimmed_mean_table <- subset(mean_table, row.names(mean_table) %in% tophits_no_background) # take the mean table and remove rows with 1 zero or more my_trimmed_mean_table <- my_trimmed_mean_table[apply(my_trimmed_mean_table, 1, function(x) !any(x==0)),] head(my_trimmed_mean_table) if (length(row.names(mean_table)) > 10) { KMeans_Clustering(log(my_trimmed_mean_table,2), "MeanSignal", "log2Signal") } # 30. Finally save the file container for downstream processing # Create menu menu1 <- NULL menu1$name <- 'Volcanoplots' menu1$items <- toJSON(items) # Combine the lists in one object jSON$menus <- list() jSON$menus <- c(jSON$menus, toJSON(menu1)) # Finally convert the object to JSON and remove unwanted chars result <- toJSON(jSON) gsub("\\\\", "", result,fixed=TRUE) filename <- paste(experiment, "Container.json", sep=".") write.table(result, file=filename, quote = FALSE, col.names = FALSE, , row.names = FALSE)