# R code modules to be used with code generated by iDEP for stand-alone analysis # Steven Xijin Ge South Dakota State University # gexijin@gmail.com # 3/12/2018 # latest version on github: # https://raw.githubusercontent.com/iDEP-SDSU/idep/master/shinyapps/idep/iDEP_core_functions.R ################################################################ # Install packages ################################################################ # dplyr complains this required libraries: libudunits2-dev, libmariadb-client-lgpl-dev # install.packages("plotly", repos="http://cran.rstudio.com/", dependencies=TRUE) # sometimes need to remove all installed packages: https://www.r-bloggers.com/how-to-remove-all-user-installed-packages-in-r/ #cat("\n Installing lots of R packages, this may take several hours ... \n # Each of these packages took years to develop.\n So be a patient thief.\n # \n Note 1: Sometimes dependencies needs to be installed manually. # \n Note 2: If you are using an older version of R, and having trouble with package installation, # \n sometimes, it is easier to install the new version of R and delete all old packages, and start fresh. # ") list.of.packages <- c("dendextend", "htmlwidgets","RSQLite", "gplots", "ggplot2", "dplyr", "tidyverse","plotly","e1071", "reshape2", "Rcpp","flashClust","statmod", "biclust","igraph","Rtsne") list.of.bio.packages <- c("STRINGdb", "limma", "DESeq2", "edgeR", "gage", "PGSEA", "fgsea", "ReactomePA", "pathview", "PREDA", "impute", "runibic", "QUBIC","rhdf5", "PREDAsampledata", "sfsmisc", "lokern", "multtest", "GSEABase","hgu133plus2.db") #Install Require packages new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] if(length(new.packages)) install.packages(new.packages, repos="http://cran.rstudio.com/", dependencies=TRUE) new.bio.packages <- list.of.bio.packages[!(list.of.bio.packages %in% installed.packages()[,"Package"])] if(length(new.bio.packages)){ source("https://bioconductor.org/biocLite.R") biocLite(new.bio.packages, suppressUpdates = T) } # WGCNA must be installed after AnnotationDbi, impute, GO.db, preprocessCore # install.packages(c("WGCNA")) #Test Load Packages This cause an error when too many packages are loaded. # suc = unlist ( lapply(list.of.packages, require, character.only = TRUE) ) # if(sum(suc) < length(list.of.packages) ) # cat ("\n\nWarnning!!!!!! These R packages cannot be loaded:", list.of.packages[!suc] ) # suc = unlist ( lapply(list.of.bio.packages, require, character.only = TRUE) ) # if(sum(suc) < length(list.of.bio.packages) ) # cat ("\n\nWarnning!!!!!! These Bioconductor packages cannot be loaded:", list.of.bio.packages[!suc] ) ################################################################ # Global variables ################################################################ library(RSQLite,verbose=FALSE) # for database connection library(gplots,verbose=FALSE) # for hierarchical clustering library(ggplot2,verbose=FALSE) # graphics library(e1071,verbose=FALSE) # computing kurtosis library(DT,verbose=FALSE) # for renderDataTable library(plotly,verbose=FALSE) # for interactive heatmap library(reshape2,verbose=FALSE) # for melt correlation matrix in heatmap input_goButton = 0 # if 1, use demo data file Min_overlap <- 2 minSetSize = 3; mappingCoverage = 0.60 # 60% percent genes has to be mapped for confident mapping mappingEdge = 0.5 # Top species has 50% more genes mapped PvalGeneInfo = 0.05; minGenes = 10 # min number of genes for ploting kurtosis.log = 50 # log transform is enforced when kurtosis is big kurtosis.warning = 10 # log transformation recommnded minGenesEnrichment = 2 # perform GO or promoter analysis only if more than this many genes PREDA_Permutations =1000 maxGeneClustering = 12000 # max genes for hierarchical clustering and k-Means clustering. Slow if larger maxGeneWGCNA = 2000 # max genes for co-expression network maxFactors =6 # max number of factors in DESeq2 models redudantGeneSetsRatio = 0.9 # remove redundant genesets in enrichment analysis set.seed(2) # seed for random number generator mycolors = sort(rainbow(20))[c(1,20,10,11,2,19,3,12,4,13,5,14,6,15,7,16,8,17,9,18)] # 20 colors for kNN clusters #Each row of this matrix represents a color scheme; hmcols <- colorRampPalette(rev(c("#D73027", "#FC8D59", "#FEE090", "#FFFFBF", "#E0F3F8", "#91BFDB", "#4575B4")))(75) heatColors = rbind( greenred(75), bluered(75), colorpanel(75,"green","black","magenta"),colorpanel(75,"blue","yellow","red"),hmcols ) rownames(heatColors) = c("Green-Black-Red","Blue-White-Red","Green-Black-Magenta","Blue-Yellow-Red","Blue-white-brown") colorChoices = setNames(1:dim(heatColors)[1],rownames(heatColors)) # for pull down menu # Create a list for Select Input options speciesChoice = list() speciesChoice <- append( setNames( "NEW","**NEW SPECIES**"), speciesChoice ) speciesChoice <- append( setNames( "BestMatch","Best matching species"), speciesChoice ) ################################################################ # Read data and pre-process ################################################################ readData <- function(inFile ) { # these packages moved here to reduce loading time library(edgeR,verbose=FALSE) # count data D.E. library(DESeq2,verbose=FALSE) # count data analysis dataTypeWarning =0 dataType =c(TRUE) #---------------Read file x <- read.csv(inFile) # try CSV if(dim(x)[2] <= 2 ) # if less than 3 columns, try tab-deliminated x <- read.table(inFile, sep="\t",header=TRUE) #-------Remove non-numeric columns, except the first column for(i in 2:dim(x)[2]) dataType = c( dataType, is.numeric(x[,i]) ) if(sum(dataType) <=2) return (NULL) # only less than 2 columns are numbers x <- x[,dataType] # only keep numeric columns # rows with all missing values ix = which( apply(x[,-1],1, function(y) sum( is.na(y) ) ) != dim(x)[2]-1 ) x <- x[ix,] dataSizeOriginal = dim(x); dataSizeOriginal[2] = dataSizeOriginal[2] -1 x[,1] <- toupper(x[,1]) x[,1] <- gsub(" ","",x[,1]) # remove spaces in gene ids x = x[order(- apply(x[,2:dim(x)[2]],1,sd) ),] # sort by SD x <- x[!duplicated(x[,1]) ,] # remove duplicated genes rownames(x) <- x[,1] x <- as.matrix(x[,c(-1)]) # remove "-" or "." from sample names colnames(x) = gsub("-","",colnames(x)) colnames(x) = gsub("\\.","",colnames(x)) #cat("\nhere",dim(x)) # missng value for median value if(sum(is.na(x))>0) {# if there is missing values if(input_missingValue =="geneMedian") { rowMedians <- apply(x,1, function (y) median(y,na.rm=T)) for( i in 1:dim(x)[2] ) { ix = which(is.na(x[,i]) ) x[ix,i] <- rowMedians[ix] } } else if(input_missingValue =="treatAsZero") { x[is.na(x) ] <- 0 } else if (input_missingValue =="geneMedianInGroup") { sampleGroups = detectGroups( colnames(x)) for (group in unique( sampleGroups) ){ samples = which( sampleGroups == group ) rowMedians <- apply(x[,samples, drop=F],1, function (y) median(y,na.rm=T)) for( i in samples ) { ix = which(is.na(x[ ,i] ) ) if(length(ix) >0 ) x[ix, i ] <- rowMedians[ix] } } # missing for entire sample group, use median for all samples if(sum(is.na(x) )>0 ) { rowMedians <- apply(x,1, function (y) median(y,na.rm=T)) for( i in 1:dim(x)[2] ) { ix = which(is.na(x[,i]) ) x[ix,i] <- rowMedians[ix] } } } } # Compute kurtosis mean.kurtosis = mean(apply(x,2, kurtosis),na.rm=T) rawCounts = NULL pvals= NULL if (input_dataFileFormat == 2 ) { # if FPKM, microarray if ( is.integer(x) ) dataTypeWarning = 1; # Data appears to be read counts #-------------filtering #tem <- apply(x,1,max) #x <- x[which(tem > input_lowFilter),] # max by row is at least x <- x[ which( apply( x, 1, function(y) sum(y >= input_lowFilter)) >= input_NminSamples2 ) , ] x <- x[which(apply(x,1, function(y) max(y)- min(y) ) > 0 ),] # remove rows with all the same levels #--------------Log transform # Takes log if log is selected OR kurtosis is big than 100 if ( (input_transform == TRUE) | (mean.kurtosis > kurtosis.log ) ) x = log(x+abs( input_logStart),2) tem <- apply(x,1,sd) x <- x[order(-tem),] # sort by SD } else if( input_dataFileFormat == 1) { # counts data # data not seems to be read counts if(!is.integer(x) & mean.kurtosis < kurtosis.log ) { dataTypeWarning = -1 } # not used as some counts data like those from CRISPR screen #validate( # if Kurtosis is less than a threshold, it is not read-count # need(mean.kurtosis > kurtosis.log, "Data does not seem to be read count based on distribution. Please double check.") # ) x <- round(x,0) # enforce the read counts to be integers. Sailfish outputs has decimal points. #x <- x[ which( apply(x,1,max) >= input_minCounts ) , ] # remove all zero counts # remove genes if it does not at least have minCounts in at least NminSamples #x <- x[ which( apply(x,1,function(y) sum(y>=input_minCounts)) >= input_NminSamples ) , ] # filtering on raw counts # using counts per million (CPM) for filtering out genes. # CPM matrix #N samples > minCounts x <- x[ which( apply( cpm(DGEList(counts = x)), 1, function(y) sum(y>=input_minCounts)) >= input_NminSamples ) , ] rawCounts = x; # ??? # construct DESeqExpression Object # colData = cbind(colnames(x), as.character(detectGroups( colnames(x) )) ) tem = rep("A",dim(x)[2]); tem[1] <- "B" # making a fake design matrix to allow process, even when there is no replicates colData = cbind(colnames(x), tem ) colnames(colData) = c("sample", "groups") dds <- DESeqDataSetFromMatrix(countData = x, colData = colData, design = ~ groups) dds <- estimateSizeFactors(dds) # estimate size factor for use in normalization later for started log method # regularized log or VST transformation if( input_CountsTransform == 3 ) { # rlog is slow, only do it with 10 samples if(dim(x)[2]<=20 ) { x <- rlog(dds, blind=TRUE); x <- assay(x) } else x <- log2( counts(dds, normalized=TRUE) + input_countsLogStart ) } else { if ( input_CountsTransform == 2 ) { # vst is fast but aggressive x <- vst(dds, blind=TRUE) x <- assay(x) } else{ # normalized by library sizes and add a constant. x <- log2( counts(dds, normalized=TRUE) + input_countsLogStart ) # log(x+c) # This is equivalent to below. But the prior counts is more important #x <- cpm(DGEList(counts = x),log=TRUE, prior.count=input_countsLogStart ) #log CPM from edgeR #x <- x-min(x) # shift values to avoid negative numbers } } } else if( input_dataFileFormat == 3) { # other data type #neg_lfc neg_fdr pos_lfc pos_fdr #11 1 11 1 n2 = ( dim(x)[2] %/% 2) # 5 --> 2 # It looks like it contains P values # ranges of columns add 0.2 and round to whole. For P value columns this should be 1 tem = round( apply(x, 2, function( y) max(y)- min(y)) + .2) if( sum(tem[(1:n2)*2 ] == 1 ) == n2 | sum(tem[(1:n2)*2-1 ] == 1 ) == n2 ) { x = x[,1:(2*n2) ,drop=FALSE ] # if 5, change it to 4 if(tem[2] == 1) { # FDR follows Fold-change pvals = x [,2*(1:n2 ),drop=FALSE ] # 2, 4, 6 x = x[, 2*(1:n2 )-1,drop=FALSE] # 1, 3, 5 } else { # FDR follows Fold-change pvals = x [,2*(1:n2 )-1,drop=FALSE ] # 2, 4, 6 x = x[, 2*(1: n2 ),drop=FALSE] # 1, 3, 5 } } ix = which(apply(x,1, function(y) max(y)- min(y) ) > 0 ) x <- x[ix,] # remove rows with all the same levels if(!is.null(pvals) ) pvals = pvals[ix,] } dataSize = dim(x); if(!(dim(x)[1]>5 & dim(x)[2]>1)) stop ( "Data file not recognized. Please double check.") sampleInfoDemo=NULL if( input_goButton >0) sampleInfoDemo <- t( read.csv(demoDataFile2,row.names=1,header=T,colClasses="character") ) finalResult <- list(data = as.matrix(x), mean.kurtosis = mean.kurtosis, rawCounts = rawCounts, dataTypeWarning=dataTypeWarning, dataSize=c(dataSizeOriginal,dataSize),sampleInfoDemo=sampleInfoDemo, pvals =pvals ) return(finalResult) } readSampleInfo <- function(inFile){ dataTypeWarning =0 dataType =c(TRUE) #---------------Read file x <- read.csv(inFile,row.names=1,header=T,colClasses="character") # try CSV if(dim(x)[2] <= 2 ) # if less than 3 columns, try tab-deliminated x <- read.table(inFile, row.names=1,sep="\t",header=TRUE,colClasses="character") # remove "-" or "." from sample names colnames(x) = gsub("-","",colnames(x)) colnames(x) = gsub("\\.","",colnames(x)) #----------------Matching with column names of expression file ix = match(toupper(colnames(readData.out$data)), toupper(colnames(x)) ) ix = ix[which(!is.na(ix))] # remove NA if(! ( length(unique(ix) ) == dim(readData.out$data)[2] & dim(x)[1]>=1 & dim(x)[1] <500 ) ) # at least one row, it can not be more than 500 rows stop( "Error!!! Sample information file not recognized. Sample names must be exactly the same. Each row is a factor. Each column represent a sample. Please see documentation on format.") #-----------Double check factor levels, change if needed # remove "-" or "." from factor levels for( i in 1:dim(x)[1]) { x[i,] = gsub("-","",x[i,]) x[i,] = gsub("\\.","",x[i,]) } # if levels from different factors match if( length(unique(ix) ) == dim(readData.out$data)[2]) { # matches exactly x = x[,ix] # if the levels of different factors are the same, it may cause problems if( sum( apply(x, 1, function(y) length(unique(y)))) > length(unique(unlist(x) ) ) ) { tem2 =apply(x,2, function(y) paste0( names(y),y)) # factor names are added to levels rownames(tem2) = rownames(x) x <- tem2 } return(t( x ) ) } else retrun(NULL) } textTransform <- function () { k.value = readData.out$mean.kurtosis tem = paste( "Mean Kurtosis = ", round(k.value,2), ".\n",sep = "") if( k.value > kurtosis.log) tem = paste(tem, " Detected extremely large values. \n When kurtosis >", kurtosis.log, ", log transformation is automatically applied.\n", sep="") else if (k.value>kurtosis.warning) {tem = paste(tem, " Detected large numbers with kurtosis >", kurtosis.warning,". \n Log transformation is recommended.", sep="") } if(readData.out$dataTypeWarning == 1 ) { tem = paste (tem, " ------Warning!!! \nData matrix contains all integers. \n It seems to be read counts!!! \n Please select appropriate data type in the previous page and reload file.")} if(readData.out$dataTypeWarning == -1 ) { tem = paste (tem, " ------Warning!!! \nData does not look like read counts data. \n Please select appropriate data type in the previous page and reload file.")} tem } # Clean up gene sets. Remove spaces and other control characters from gene names cleanGeneSet <- function (x){ # remove duplicate; upper case; remove special characters x <- unique( toupper( gsub("\n| ","",x) ) ) x <- x[which( nchar(x)>1) ] # genes should have at least two characters return(x) } findSpeciesById <- function (speciesID){ # find species name use id return( orgInfo[which(orgInfo$id == speciesID),] ) } # just return name findSpeciesByIdName <- function (speciesID){ # find species name use id return( orgInfo[which(orgInfo$id == speciesID),3] ) } # convert gene IDs to ensembl gene ids and find species convertID <- function (query,selectOrg, selectGO) { querySet <- cleanGeneSet( unlist( strsplit( toupper(query),'\t| |\n|\\,'))) # querySet is ensgene data for example, ENSG00000198888, ENSG00000198763, ENSG00000198804 querySTMT <- paste( "select distinct id,ens,species from mapping where id IN ('", paste(querySet,collapse="', '"),"')",sep="") result <- dbGetQuery(convert, querySTMT) if( dim(result)[1] == 0 ) return(NULL) if(selectOrg == speciesChoice[[1]]) { comb = paste( result$species,result$idType) sortedCounts = sort(table(comb),decreasing=T) recognized =names(sortedCounts[1]) result <- result[which(comb == recognized),] speciesMatched=sortedCounts names(speciesMatched )= sapply(as.numeric(gsub(" .*","",names(sortedCounts) ) ), findSpeciesByIdName ) speciesMatched <- as.data.frame( speciesMatched ) if(length(sortedCounts) == 1) { # if only one species matched speciesMatched[1,1] <-paste( rownames(speciesMatched), "(",speciesMatched[1,1],")",sep="") } else {# if more than one species matched speciesMatched[,1] <- as.character(speciesMatched[,1]) speciesMatched[,1] <- paste( speciesMatched[,1]," (",speciesMatched[,2], ")", sep="") speciesMatched[1,1] <- paste( speciesMatched[1,1]," ***Used in mapping*** To change, select from above and resubmit query.") speciesMatched <- as.data.frame(speciesMatched[,1]) } } else { # if species is selected result <- result[which(result$species == selectOrg ) ,] if( dim(result)[1] == 0 ) return(NULL) #stop("ID not recognized!") speciesMatched <- as.data.frame(paste("Using selected species ", findSpeciesByIdName(selectOrg) ) ) } result <- result[which(!duplicated(result[,2]) ),] # remove duplicates in ensembl_gene_id result <- result[which(!duplicated(result[,1]) ),] # remove duplicates in user ID colnames(speciesMatched) = c("Matched Species (genes)" ) conversionTable <- result[,1:2]; colnames(conversionTable) = c("User_input","ensembl_gene_id") conversionTable$Species = sapply(result[,3], findSpeciesByIdName ) return(list(originalIDs = querySet,IDs=unique( result[,2]), species = findSpeciesById(result$species[1]), #idType = findIDtypeById(result$idType[1] ), speciesMatched = speciesMatched, conversionTable = conversionTable ) ) } # retrieve detailed info on genes geneInfo <- function (fileName = NULL){ if(!is.null(fileName)) # if only one file #WBGene0000001 some ensembl gene ids in lower case { x = read.csv(as.character(fileName) ); x[,1]= toupper(x[,1]) } else # read in the chosen file { return(as.data.frame("No file.") ) } Set=rep("List",nrow(x)) return( cbind(x,Set) ) } convertedData <- function() { if( is.null(converted.out ) ) return( readData.out$data) # if id or species is not recognized use original data. if(input_noIDConversion) return( readData.out$data ) mapping <- converted.out$conversionTable # cat (paste( "\nData:",input_selectOrg) ) x =readData.out$data rownames(x) = toupper(rownames(x)) # any gene not recognized by the database is disregarded # x1 = merge(mapping[,1:2],x, by.y = 'row.names', by.x = 'User_input') # the 3 lines keeps the unrecogized genes using original IDs x1 = merge(mapping[,1:2],x, by.y = 'row.names', by.x = 'User_input', all.y=TRUE) # original IDs used if ID is not matched in database ix = which(is.na(x1[,2]) ) x1[ix,2] = x1[ix,1] #multiple matched IDs, use the one with highest SD tem = apply(x1[,3:(dim(x1)[2])],1,sd) x1 = x1[order(x1[,2],-tem),] x1 = x1[!duplicated(x1[,2]) ,] rownames(x1) = x1[,2] x1 = as.matrix(x1[,c(-1,-2)]) tem = apply(x1,1,sd) x1 = x1[order(-tem),] # sort again by SD return(x1) } convertedCounts <- function() { if( is.null(converted.out ) ) return( readData.out$rawCounts) mapping <- converted.out$conversionTable x =readData.out$rawCounts rownames(x) = toupper(rownames(x)) # any gene not recognized by the database is disregarded # x1 = merge(mapping[,1:2],x, by.y = 'row.names', by.x = 'User_input') # the 3 lines keeps the unrecogized genes using original IDs x1 = merge(mapping[,1:2],x, by.y = 'row.names', by.x = 'User_input', all.y=TRUE) ix = which(is.na(x1[,2]) ) x1[ix,2] = x1[ix,1] # original IDs used #multiple matched IDs, use the one with highest SD tem = apply(x1[,3:(dim(x1)[2])],1,sd) x1 = x1[order(x1[,2],-tem),] x1 = x1[!duplicated(x1[,2]) ,] rownames(x1) = x1[,2] x1 = as.matrix(x1[,c(-1,-2)]) tem = apply(x1,1,sd) x1 = x1[order(-tem),] # sort again by SD return(x1) } # here nGenesFilter <- function() { tem = readData.out$dataSize ix = match( toupper( rownames(convertedData.out)), toupper(converted.out$conversionTable$ensembl_gene_id ) ) nMatched = sum( !is.na(ix) ) if( input_noIDConversion) return( paste(tem[1], "genes in", tem[4], "samples.", tem[3], " genes passed filter.\n Original gene IDs used." ) ) else return( paste(tem[1], "genes in", tem[4], "samples. Of the", tem[3], " genes passed filter, \n", nMatched, " were converted to Ensembl gene IDs in our database. \n The remaining ", tem[3]-nMatched, "genes were kept in the data using original IDs." ) ) } # Define sample groups based on column names detectGroups <- function (x){ # x are col names tem <- gsub("[0-9]*$","",x) # Remove all numbers from end #tem = gsub("_Rep|_rep|_REP","",tem) tem <- gsub("_$","",tem); # remove "_" from end tem <- gsub("_Rep$","",tem); # remove "_Rep" from end tem <- gsub("_rep$","",tem); # remove "_rep" from end tem <- gsub("_REP$","",tem) # remove "_REP" from end return( tem ) } # EDA plots ------------------------------------- densityPlot <- function() { set.seed(2) # seed for random number generator mycolors = sort(rainbow(20))[c(1,20,10,11,2,19,3,12,4,13,5,14,6,15,7,16,8,17,9,18)] # 20 colors for kNN clusters #Each row of this matrix represents a color scheme; x <- readData.out$data myColors = mycolors plot(density(x[,1]),col = myColors[1], lwd=2, xlab="Expresson values", ylab="Density", main= "Distribution of transformed data", ylim=c(0, max(density(x[,1])$y)+.02 ) ) for( i in 2:dim(x)[2] ) lines(density(x[,i]),col=myColors[i], lwd=2 ) if(dim(x)[2]< 31 ) # if too many samples do not show legends legend("topright", cex=1,colnames(x), lty=rep(1,dim(x)[2]), col=myColors ) } genePlot <- function() { x <- convertedData.out Symbols <- rownames(x) if( input_selectOrg != "NEW") { ix = match( rownames(x), allGeneInfo.out[,1]) if( sum( is.na(allGeneInfo.out$symbol )) != dim(allGeneInfo.out )[1] ) { # symbol really exists? Symbols = as.character( allGeneInfo.out$symbol[ix] ) Symbols[which( nchar(Symbols) <= 2 ) ] <- rownames(x) [which( nchar(Symbols) <= 2 ) ] } } x = as.data.frame(x) x$Genes = Symbols # matching from the beginning of symbol ix = which(regexpr( paste("^" , toupper(input_geneSearch),sep="") ,toupper(x$Genes)) > 0) if(grepl(" ", input_geneSearch) ) # if there is space character, do exact match ix = match(gsub(" ","", toupper(input_geneSearch)),x$Genes) # too few or too many genes found if(length(ix) == 0 | length(ix) > 50 ) return(NULL) # no genes found mdf = melt(x[ix,],id.vars="Genes", value.name="value", variable.name="samples") p1 <- ggplot(data=mdf, aes(x=samples, y=value, group = Genes, shape=Genes, colour = Genes)) + geom_line() + geom_point( size=5, fill="white")+ #shape=21 circle #theme(axis.text.x = element_text(size=16,angle = 45, hjust = 1)) + labs(y="Transformed expression level") + coord_cartesian(ylim = c(0, max(mdf$value))) p1 <- p1 + theme(plot.title = element_text(size = 16,hjust = 0.5)) + # theme(aspect.ratio=1) + theme(axis.text.x = element_text(angle=45, size = 16, hjust=1), axis.text.y = element_text( size = 16), axis.title.x = element_blank(), axis.title.y = element_text( size = 16) ) + theme(legend.text=element_text(size=12)) p1 } geneBarPlotError <- function() { x <- convertedData.out Symbols <- rownames(x) if( input_selectOrg != "NEW") { ix = match( rownames(x), allGeneInfo.out[,1]) if( sum( is.na(allGeneInfo.out$symbol )) != dim(allGeneInfo.out )[1] ) { # symbol really exists? Symbols = as.character( allGeneInfo.out$symbol[ix] ) Symbols[which( nchar(Symbols) <= 2 ) ] <- rownames(x) [which( nchar(Symbols) <= 2 ) ] } } x = as.data.frame(x) x$Genes = Symbols #write.csv(x,"tem.csv") # Search for genes #ix = grep("HOXA",toupper(x$Genes) ) # ix = grep(toupper(input_geneSearch),toupper(x$Genes)) # sox --> Tsox # matching from the beginning of symbol ix = which(regexpr( paste("^" , toupper(input_geneSearch),sep="") ,toupper(x$Genes)) > 0) if(grepl(" ", input_geneSearch) ) # if there is space character, do exact match ix = match(gsub(" ","", toupper(input_geneSearch)),x$Genes) # too few or too many genes found if(length(ix) == 0 | length(ix) > 50 ) return(NULL) # no genes found mdf = melt(x[ix,],id.vars="Genes", value.name="value", variable.name="samples") mdf$count = 1 g = detectGroups(mdf$samples) Means = aggregate(mdf$value,by=list( g, mdf$Genes ), FUN = mean, na.rm=TRUE ) SDs = aggregate(mdf$value,by=list( g, mdf$Genes ), FUN = sd, na.rm=TRUE ) Ns = aggregate(mdf$count, by= list(g, mdf$Genes) , FUN = sum ) summarized = cbind(Means,SDs[,3],Ns[,3]) colnames(summarized)= c("Samples","Genes","Mean","SD","N") summarized$SE = summarized$SD / sqrt(summarized$N) #http://www.sthda.com/english/wiki/ggplot2-barplots-quick-start-guide-r-software-and-data-visualization p2 <- ggplot(summarized, aes(x=Genes, y=Mean,fill=Samples) ) + # data & aesthetic mapping geom_bar(stat="identity", position=position_dodge()) + # bars represent average geom_errorbar(aes(ymin=Mean-SE, ymax=Mean+SE), width=0.2,position=position_dodge(.9)) + labs(y="Expression Level") if(input_useSD == 1) { p2 <- ggplot(summarized, aes(x=Genes, y=Mean,fill=Samples) ) + # data & aesthetic mapping geom_bar(stat="identity", position=position_dodge()) + # bars represent average geom_errorbar(aes(ymin=Mean-SD, ymax=Mean+SD), width=0.2,position=position_dodge(.9)) + labs(y="Expression Level") } p2 <- p2 + theme(plot.title = element_text(size = 16,hjust = 0.5)) + # theme(aspect.ratio=1) + theme(axis.text.x = element_text(angle=45, size = 16, hjust=1), axis.text.y = element_text( size = 16), axis.title.x = element_blank(), axis.title.y = element_text( size = 16) ) + theme(legend.text=element_text(size=16)) p2 } ################################################################ # Hierarchical clustering ################################################################ # Functions for hierarchical clustering hclust2 <- function(x, method="average", ...) # average linkage hclust(x, method=method, ...) hclust.ward.D <- function(x, method="ward.D", ...) # average linkage hclust(x, method=method, ...) hclust.ward.D2 <- function(x, method="ward.D2", ...) # average linkage hclust(x, method=method, ...) hclust.single <- function(x, method="single", ...) # average linkage hclust(x, method=method, ...) hclust.mcquitty <- function(x, method="mcquitty", ...) # average linkage hclust(x, method=method, ...) hclust.median <- function(x, method="median", ...) # average linkage hclust(x, method=method, ...) hclust.centroid <- function(x, method="centroid", ...) # average linkage hclust(x, method=method, ...) hclustFuns <- list( averge = hclust2, complete=hclust, single=hclust.single, median=hclust.median, centroid=hclust.centroid, mcquitty=hclust.mcquitty) hclustChoices = setNames(1:length(hclustFuns),names(hclustFuns)) # for pull down menu dist2 <- function(x, ...) # distance function = 1-PCC (Pearson's correlation coefficient) as.dist(1-cor(t(x), method="pearson")) dist3 <- function(x, ...) # distance function = 1-abs(PCC) (Pearson's correlation coefficient) as.dist(1-abs(cor(t(x), method="pearson"))) distFuns <- list(Correlation=dist2, Euclidean=dist,AbsolutePCC=dist3) # hierarchical clustering tree ---------------------------------------- hmcols <- colorRampPalette(rev(c("#D73027", "#FC8D59", "#FEE090", "#FFFFBF", "#E0F3F8", "#91BFDB", "#4575B4")))(75) heatColors = rbind( greenred(75), bluered(75), colorpanel(75,"green","black","magenta"),colorpanel(75,"blue","yellow","red"),hmcols ) rownames(heatColors) = c("Green-Black-Red","Blue-White-Red","Green-Black-Magenta","Blue-Yellow-Red","Blue-white-brown") colorChoices = setNames(1:dim(heatColors)[1],rownames(heatColors)) # for pull down menu # adding sample legends to heatmap; this is for the main heatmap # https://stackoverflow.com/questions/3932038/plot-a-legend-outside-of-the-plotting-area-in-base-graphics add_legend <- function(...) { opar <- par(fig=c(0, 1, 0, 1), oma=c(0, 0, 0, 0), mar=c(0, 0, 0, 6), new=TRUE) on.exit(par(opar)) plot(0, 0, type='n', bty='n', xaxt='n', yaxt='n') legend(...) } correlationMatrix <- function( ) { # heatmap of correlation matrix x <- readData.out$data maxGene <- apply(x,1,max) x <- x[which(maxGene > quantile(maxGene)[1] ) ,] # remove bottom 25% lowly expressed genes, which inflate the PPC melted_cormat <- melt(round(cor(x),2), na.rm = TRUE) # melted_cormat <- melted_cormat[which(melted_cormat[,1] != melted_cormat[,2] ) , ] # Create a ggheatmap ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+ geom_tile(color = "white")+ scale_fill_gradient2(low = "green", high = "red", mid = "white", space = "Lab", limit = c(min(melted_cormat[,3]) ,max(melted_cormat[,3])), midpoint = median(melted_cormat[,3]), name="Pearson\nCorrelation" ) + theme_minimal()+ # minimal theme theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 15, hjust = 1))+ theme(axis.text.y = element_text( size = 15))+ coord_fixed()+ theme( axis.title.x = element_blank(), axis.title.y = element_blank(), panel.grid.major = element_blank(), panel.border = element_blank(), panel.background = element_blank(), axis.ticks = element_blank(), legend.justification = c(1, 0), legend.position = c(0.6, 0.7), legend.direction = "horizontal")+ guides(fill = FALSE) # + ggtitle("Pearson's Correlation Coefficient (all genes)") if(input_labelPCC && ncol(x)<30) ggheatmap <- ggheatmap + geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) ggheatmap } staticHeatmap <- function () { x <- readData.out$data # x = read.csv("expression1.csv") n=input_nGenes #if(n>6000) n = 6000 # max if(n>dim(x)[1]) n = dim(x)[1] # max as data # this will cutoff very large values, which could skew the color if(input_geneCentering) x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean) # standardize by gene if(input_geneNormalize) x <- x / apply(x,1,sd) # row centering and normalize x <- scale(x, center = input_sampleCentering, scale = input_sampleNormalize) cutoff = median(unlist(x)) + input_heatmapCutoff * sd (unlist(x)) x[x>cutoff] <- cutoff cutoff = median(unlist(x)) - input_heatmapCutoff *sd (unlist(x)) x[x< cutoff] <- cutoff groups = detectGroups(colnames(x) ) # if sample info file is uploaded us that info: if(!is.null(readSampleInfo.out) && !is.null(input_selectFactorsHeatmap) ) { if(input_selectFactorsHeatmap == "Sample_Name" ) groups = detectGroups(colnames(x) ) else { ix = match(input_selectFactorsHeatmap, colnames(readSampleInfo.out ) ) groups = readSampleInfo.out[,ix] } } groups.colors = rainbow(length(unique(groups) ) ) #http://stackoverflow.com/questions/15351575/moving-color-key-in-r-heatmap-2-function-of-gplots-package lmat = rbind(c(0,4),c(0,1),c(3,2),c(5,0)) lwid = c(2,6) # width of gene tree; width of heatmap lhei = c(1.5,.2,8,1.1) #layout matrix # [,1] [,2] # [1,] 0 4 # [2,] 0 1 # [3,] 3 2 # [4,] 5 0 # 4--> column tree; 1--> column color bar; 2--> heatmap; 3-> row tree; 5--> color key. # height of 4 rows is specified by lhei; width of columns is given by lwid par(mar = c(12, 4, 1.4, 0.2)) if( n>110) heatmap.2(x, distfun = distFuns[[as.integer(input_distFunctions)]] ,hclustfun=hclustFuns[[as.integer(input_hclustFunctions)]] ,Colv=!input_noSampleClustering #col=colorpanel(75,"green","black","magenta") , #col=bluered(75), #col=greenred(75), ,col= heatColors[as.integer(input_heatColors1),] ,density.info="none", trace="none", scale="none", keysize=.5 ,key=T, symkey=F ,ColSideColors=groups.colors[ as.factor(groups)] ,labRow="" ,margins=c(10,2) ,srtCol=45 ,cexCol=2 # size of font for sample names ,lmat = lmat, lwid = lwid, lhei = lhei ) if( n<=110) heatmap.2(x, distfun = distFuns[[as.integer(input_distFunctions)]] ,hclustfun=hclustFuns[[as.integer(input_hclustFunctions)]] ,Colv=!input_noSampleClustering ,col= heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.5 ,key=T, symkey=F, #,labRow=labRow ,ColSideColors=groups.colors[ as.factor(groups)] ,margins=c(18,12) ,cexRow=1 ,srtCol=45 ,cexCol=1.8 # size of font for sample names ,lmat = lmat, lwid = lwid, lhei = lhei ) par(lend = 1) # square line ends for the color legend add_legend("topleft", legend = unique(groups), # category labels col = groups.colors[ unique(as.factor(groups))], # color key lty= 1, # line style lwd = 10 # line width ) } #Interactive heatmap via Plotly plot ---------------------------------------------- # interactive heatmap with plotly heatmapPlotly <- function () { input_nGenesPlotly= 50 x <- convertedData.out n=input_nGenesPlotly #if(n>6000) n = 6000 # max if(n>dim(x)[1]) n = dim(x)[1] # max as data if(input_geneCentering) x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean) # standardize by gene if(input_geneNormalize) x <- x / apply(x,1,sd) # row centering and normalize x <- scale(x, center = input_sampleCentering, scale = input_sampleNormalize) cutoff = median(unlist(x)) + 3*sd (unlist(x)) x[x>cutoff] <- cutoff cutoff = median(unlist(x)) - 3*sd (unlist(x)) x[x< cutoff] <- cutoff # adding gene symbol ix <- match( rownames(x), allGeneInfo.out[,1]) geneSymbols <- as.character( allGeneInfo.out$symbol)[ix] # if missing or duplicated, use Ensembl ID ix <- which(nchar( geneSymbols) <=1 | duplicated(geneSymbols ) | is.null(geneSymbols) | is.na(geneSymbols) ); geneSymbols[ ix ] <- rownames(x)[ix] rownames( x) = geneSymbols; clust <- x %>% dist2() %>% hclust2() # Get order ord <- clust$order # Re-arrange based on order df <- t( x[ord,] )%>% melt() colnames(df)[1:2] <- c("X","Y") colorNames = unlist(strsplit(tolower(rownames(heatColors)[ as.integer(input_heatColors1) ]),"-" ) ) p <- df %>% ggplot(aes(X, Y, fill = value)) + geom_tile()+ scale_fill_gradient2(low = colorNames[1], mid = colorNames[2],high = colorNames[3]) + theme(axis.title.y=element_blank(), # remove y labels # axis.text.y=element_blank.out, # keep gene names for zooming axis.ticks.y=element_blank(), axis.title.x=element_blank()) + theme(axis.text.x = element_text(size=10,angle = 45, hjust = 1)) ggplotly(p) %>% layout(margin = list(b = 150,l=200)) # prevent cutoff of sample names } ################################################################ # PCA ################################################################ PCAplot <- function() { #PCA x <- convertedData.out; pca.object <- prcomp(t(x)) pcaData = as.data.frame(pca.object$x[,1:2]); pcaData = cbind(pcaData,detectGroups(colnames(x)) ) colnames(pcaData) = c("PC1", "PC2", "Sample_Name") percentVar=round(100*summary(pca.object)$importance[2,1:2],0) if(is.null(readSampleInfo.out)) { p=ggplot(pcaData, aes(PC1, PC2, color=Sample_Name, shape = Sample_Name)) + geom_point(size=5) } else { pcaData = cbind(pcaData,readSampleInfo.out ) p=ggplot(pcaData, aes_string("PC1", "PC2", color=input_selectFactors,shape=input_selectFactors2)) + geom_point(size=5) } p=p+xlab(paste0("PC1: ",percentVar[1],"% variance")) p=p+ylab(paste0("PC2: ",percentVar[2],"% variance")) p=p+ggtitle("Principal component analysis (PCA)")+coord_fixed(ratio=1.0)+ theme(plot.title = element_text(size = 16,hjust = 0.5)) + theme(aspect.ratio=1) + theme(axis.text.x = element_text( size = 16), axis.text.y = element_text( size = 16), axis.title.x = element_text( size = 16), axis.title.y = element_text( size = 16) ) + theme(legend.text=element_text(size=16)) print(p) } # variance chart # plot(pca.object,type="bar", xlab="Principal Components", main ="Variances explained") MDSplot <- function() { # MDS x <- convertedData.out; fit = cmdscale( dist2(t(x) ), eig=T, k=2) # par(pin=c(5,5)) pcaData = as.data.frame(fit$points[,1:2]); pcaData = cbind(pcaData,detectGroups(colnames(x)) ) colnames(pcaData) = c("x1", "x2", "Sample_Name") if(is.null(readSampleInfo.out)) { p=ggplot(pcaData, aes(x1, x2, color=Sample_Name, shape = Sample_Name)) + geom_point(size=5) } else { pcaData = cbind(pcaData,readSampleInfo.out ) p=ggplot(pcaData, aes_string("x1", "x2", color=input_selectFactors,shape=input_selectFactors2)) + geom_point(size=5) } p=p+xlab("Dimension 1") p=p+ylab("Dimension 2") p=p+ggtitle("Multidimensional scaling (MDS)")+ coord_fixed(ratio=1.)+ theme(plot.title = element_text(hjust = 0.5)) + theme(aspect.ratio=1) + theme(axis.text.x = element_text( size = 16), axis.text.y = element_text( size = 16), axis.title.x = element_text( size = 16), axis.title.y = element_text( size = 16) ) + theme(legend.text=element_text(size=16)) print(p) } tSNEplot <- function() { # t-SNE x <- convertedData.out; library(Rtsne,verbose=FALSE) set.seed(input_tsneSeed2) tsne <- Rtsne(t(x), dims = 2, perplexity=1, verbose=FALSE, max_iter = 400) pcaData = as.data.frame(tsne$Y); pcaData = cbind(pcaData,detectGroups(colnames(x)) ) colnames(pcaData) = c("x1", "x2", "Sample_Name") if(is.null(readSampleInfo.out)) { p=ggplot(pcaData, aes(x1, x2, color=Sample_Name, shape = Sample_Name)) + geom_point(size=5) } else { pcaData = cbind(pcaData,readSampleInfo.out ) p=ggplot(pcaData, aes_string("x1", "x2", color=input_selectFactors,shape=input_selectFactors2)) + geom_point(size=5) } p=p+xlab("Dimension 1") p=p+ylab("Dimension 2") p=p+ggtitle("t-SNE plot")+ coord_fixed(ratio=1.)+ theme(plot.title = element_text(hjust = 0.5)) + theme(aspect.ratio=1) + theme(axis.text.x = element_text( size = 16), axis.text.y = element_text( size = 16), axis.title.x = element_text( size = 16), axis.title.y = element_text( size = 16) ) + theme(legend.text=element_text(size=16)) print(p) } # read pathway data from local file, not used if using database # Read gene sets GMT file # This functions cleans and converts to upper case readGMTRobust <- function (file1) { # size restriction # Read in the first file x <- scan(file1, what="", sep="\n") # x <- gsub("\t\t.","",x) # GMT files saved by Excel has a lot of empty cells "\t\t\t\t" "\t." means one or more tab x <- gsub(" ","",x) # remove white space x <- toupper(x) # convert to upper case #----Process the first file # Separate elements by one or more whitespace y <- strsplit(x, "\t") # Extract the first vector element and set it as the list element name names(y) <- sapply(y, `[[`, 1) #names(y) <- sapply(y, function(x) x[[1]]) # same as above # Remove the first vector element from each list element y <- lapply(y, `[`, -c(1,2)) #y <- lapply(y, function(x) x[-1]) # same as above # remove duplicated elements for ( i in 1:length(y) ) y[[i]] <- cleanGeneSet(y[[i]]) # check the distribution of the size of gene lists sapply(y, length) hold a vector of sizes if( max( sapply(y,length) ) <5) cat("Warning! Gene sets have very small number of genes!\n Please double check format.") y <- y[which(sapply(y,length) > 1)] # gene sets smaller than 1 is ignored!!! return(y) } readGeneSets <- function (fileName, convertedData, GO,selectOrg, myrange) { pathway <- dbConnect(sqlite,fileName) if(is.null(GO) ) GO <- "GOBP" # initial value not properly set; enforcing # get Gene sets querySet = rownames(convertedData) sqlQuery = paste( " select distinct gene,pathwayID from pathway where gene IN ('", paste(querySet,collapse="', '"),"')" ,sep="") # cat(paste0("\n\nhere:",GO,"There")) if( GO != "All") sqlQuery = paste0(sqlQuery, " AND category ='",GO,"'") result <- dbGetQuery( pathway, sqlQuery ) if( dim(result)[1] ==0) {return(list( x=as.data.frame("No matching species or gene ID file!" )) )} # list pathways and frequency of genes pathwayIDs = aggregate( result$pathwayID, by = list(unique.values = result$pathwayID), FUN = length) pathwayIDs = pathwayIDs[which(pathwayIDs[,2]>= myrange[1] ),] pathwayIDs = pathwayIDs[which( pathwayIDs[,2] <= myrange[2] ),] if(dim(pathwayIDs)[1] ==0 ) geneSets = NULL; # convert pathways into lists like those generated by readGMT geneSets = lapply(pathwayIDs[,1], function(x) result[which(result$pathwayID == x ),1] ) pathwayInfo <- dbGetQuery( pathway, paste( " select distinct id,Description from pathwayInfo where id IN ('", paste(pathwayIDs[,1],collapse="', '"), "') ",sep="") ) ix = match( pathwayIDs[,1], pathwayInfo[,1]) names(geneSets) <- pathwayInfo[ix,2] #geneSets <- geneSets[ -which(duplicated(names(geneSets) ))] # remove geneSets with the same name dbDisconnect(pathway) return( geneSets ) } # Runs pathway analysis using PGSEA; this is copied and revised from PGSEA package myPGSEA <- function (exprs, cl, range = c(25, 500), ref = NULL, center = TRUE, p.value = 0.005, weighted = TRUE, nPermutation=100, enforceRange = TRUE, ...) { if (is(exprs, "ExpressionSet")) exprs <- exprs(exprs) if (!is.list(cl)) stop("cl need to be a list") if (!is.null(ref)) { if (!is.numeric(ref)) stop("column index's required") } if (!is.null(ref)) { if (options.out$verbose) cat("Creating ratios...", "\n") ref_mean <- apply(exprs[, ref], 1, mean, na.rm = TRUE) exprs <- sweep(exprs, 1, ref_mean, "-") } if (center) exprs <- scale(exprs, scale = FALSE) # column centering is done results <- matrix(NA, length(cl), ncol(exprs)) rownames(results) <- names(cl) colnames(results) <- colnames(exprs) mode(results) <- "numeric" Setsize = c(rep(0,length(cl))) # gene set size vector mean2 = c(rep(0,length(cl))) # mean of the range of means meanSD = c(rep(0,length(cl))) # SD of the range of means if (is.logical(p.value)) { p.results <- results; mean.results <- results;} for (i in 1:length(cl)) { # for each gene list #cat("\nProcessing gene set",i); if (class(cl[[i]]) == "smc") { clids <- cl[[i]]@ids } else if (class(cl[[i]]) %in% c("GeneColorSet", "GeneSet")) { clids <- cl[[i]]@geneIds } else { clids <- cl[[i]] } if (options()$verbose) cat("Testing region ", i, "\n") ix <- match(clids, rownames(exprs)) ix <- unique(ix[!is.na(ix)]) present <- sum(!is.na(ix)) Setsize[i] <- present if (present < range[1]) { if (options()$verbose) cat("Skipping region ", i, " because too small-", present, ",\n") next } if (present > range[2]) { if (options()$verbose) cat("Skipping region ", i, " because too large-", present, "\n") next } texprs <- exprs[ix, ] # expression matrix for genes in gene set if (any(is.na(texprs))) cat("Warning - 'NA' values within expression data, enrichment scores are estimates only.\n") if (!is.matrix(texprs)) texprs <- as.matrix(texprs) stat <- try(apply(texprs, 2, t.test, ...)) means <- try(apply(texprs, 2, mean,trim=0.1)) # trim mean ps <- unlist(lapply(stat, function(x) x$p.value)) stat <- unlist(lapply(stat, function(x) x$statistic)) p.results[i, ] <- ps mean.results[i,] <- means results[i, ] <- as.numeric(stat) # permutation of gene sets of the same size if(nPermutation > 2 ) { # no permutation if <=2 meansRanges = c(0, rep(nPermutation)) for( k in 1:nPermutation ) { ix <- sample.int( dim(exprs)[1], length(ix) ) texprs <- exprs[ix, ] means <- try(apply(texprs, 2, mean,trim=0.1)) # trim mean meansRanges[k] = dynamicRange(means) } mean2[i] = mean(meansRanges) meanSD[i]= sd(meansRanges,na.rm=TRUE) # NA are removed before calculating standard deviation } } return(list(results = results, p.results = p.results, means = mean.results, size=Setsize, mean2=mean2, meanSD=meanSD)) } PCApathway<- function () { # pathway x <- convertedData.out; library(PGSEA,verbose=FALSE) pca.object <- prcomp(t(x)) pca = 100*pca.object$rotation Npca = 5 if (Npca > dim(pca)[2]) { Npca = dim(pca)[2] } else pca <- pca[,1:Npca] #pca = pca[,1:5] if(is.null(GeneSets.out ) ) return(NULL) # no species recognized if(length(GeneSets.out ) <= 1 ) return(NULL) #cat("\n\nGene Sets:",length( GeneSets.out)) pg = myPGSEA (pca,cl=GeneSets.out,range=c(15,2000),p.value=TRUE, weighted=FALSE,nPermutation=1) # correcting for multiple testing p.matrix = pg$p.result tem = p.adjust(as.numeric(p.matrix),"fdr") p.matrix = matrix(tem, nrow=dim(p.matrix)[1], ncol = dim(p.matrix)[2] ) rownames(p.matrix) = rownames(pg$p.result); colnames(p.matrix) = colnames(pg$p.result) selected =c() for( i in 1:dim(p.matrix)[2]) { tem = which( rank(p.matrix[,i],ties.method='first') <= 5) # rank by P value #tem = which( rank(pg$result[,i],ties.method='first') >= dim(p.matrix)[1]-3.1) # rank by mean names(tem) = paste("PC",i," ", rownames(p.matrix)[tem], sep="" ) selected = c(selected, tem) } rowids = gsub(" .*","",names(selected)) rowids = as.numeric( gsub("PC","",rowids) ) pvals = p.matrix[ cbind(selected,rowids) ] a=sprintf("%-1.0e",pvals) tem = pg$result[selected,] rownames(tem) = paste(a,names(selected)); #colnames(tem)= paste("PC",colnames(tem),sep="") tem = tem[!duplicated(selected),] #tem = t(tem); tem = t( (tem - apply(tem,1,mean)) ) #/apply(tem,1,sd) ) smcPlot(tem,scale = c(-max(tem), max(tem)), show.grid = T, margins = c(3,1, 6, 23), col = .rwb,cex.lab=0.5, main="Pathways analysis on PCA") } # correlation PCA with factors PCA2factor <- function( ){ if(is.null(readSampleInfo.out)) return(NULL) npc = 5 # number of Principal components x <- readData.out$data y <- readSampleInfo.out pca.object <- prcomp(t(x)) pcaData = as.data.frame(pca.object$x[,1:npc]); pvals = matrix(1,nrow=npc,ncol=ncol(y)) for (i in 1:npc ){ for (j in 1:ncol(y) ) pvals[i,j] =summary( aov(pcaData[,i] ~ as.factor(y[,j])))[[1]][["Pr(>F)"]][1] } pvals = pvals * npc* ncol(y) # correcting for multiple testing pvals[pvals>1] = 1 colnames(pvals) = colnames(y) rownames(pvals) = paste0("PC",1:npc) a="\n Correlation between Principal Components (PCs) with factors\n" nchar0 = nchar(a) for ( i in 1:npc){ j = which.min(pvals[i,]) if(pvals[i,j]< 0.05) a=paste0(a,rownames(pvals)[i], " is correlated with ", colnames(pvals)[j], " (p=", sprintf("%-3.2e",pvals[i,j]),").\n") } if(nchar(a) == nchar0 ) return(NULL) else return( a ) } # detecting sequencing depth bias readCountsBias <- function( ){ totalCounts = colSums(readData.out$rawCounts) groups = as.factor( detectGroups(colnames(readData.out$rawCounts ) ) ) tem = NULL # ANOVA of total read counts vs sample groups parsed by sample name pval = summary( aov(totalCounts ~ groups ))[[1]][["Pr(>F)"]][1] if(pval <0.05) tem = paste("Warning! Sequencing depth bias detected. Total read counts are significantly different among sample groups (p=", sprintf("%-3.2e",pval),") based on ANOVA.") # ANOVA of total read counts vs factors in experiment design if(!is.null(readSampleInfo.out ) ) { y <- readSampleInfo.out for (j in 1:ncol(y) ) { pval = summary( aov(totalCounts ~ as.factor(y[,j])))[[1]][["Pr(>F)"]][1] if(pval <0.05) tem = paste(tem, " Total read counts seem to be correlated with factor",colnames(y)[j], "(p=", sprintf("%-3.2e",pval),"). ") } } if(is.null(tem)) return( "No bias detected") else return( tem ) } ################################################################ # K-means clustering ################################################################ #Distribution of SDs distributionSD <- function() { SDs=apply(convertedData.out,1,sd) maxSD = mean(SDs)+ 4*sd(SDs) SDs[ SDs > maxSD] = maxSD top = input_nGenesKNN if(top > length(SDs)) top = length(SDs) Cutoff=sort(SDs,decreasing=TRUE)[top] SDs = as.data.frame(SDs) p <- ggplot(SDs, aes(x=SDs)) + geom_density(color="darkblue", fill="lightblue") + labs(x = "Standard deviations of all genes", y="Density")+ geom_vline(aes(xintercept=Cutoff), color="red", linetype="dashed", size=1) + annotate("text", x = Cutoff + 0.4*sd(SDs[,1]), y = 1,colour = "red", label = paste0("Top ", top)) p } # Decide number of clusters KmeansNclusters <- function() { # Kmeans clustering x <- convertedData.out #x <- readData.out #par(mfrow=c(1,2)) n=input_nGenesKNN #if(n>6000) n = 6000 # max if(n>dim(x)[1]) n = dim(x)[1] # max as data #x1 <- x; #x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean) x = 100* x[1:n,] / apply(x[1:n,],1,sum) # this is causing problem?????? #x = x - apply(x,1,mean) # this is causing problem?????? #colnames(x) = gsub("_.*","",colnames(x)) set.seed(2) k = 30 wss <- (nrow(x)-1)*sum(apply(x,2,var)) for (i in 2:k) wss[i] <- sum(kmeans(x,centers=i,iter.max = 30)$withinss) par(mar=c(4,5,4,4)) plot(1:k, wss, type="b", xlab="Number of Clusters (k)", ylab="Within groups sum of squares", cex=2,cex.lab=2, cex.axis=2, cex.main=2, cex.sub=2 ,xaxt="n" ) axis(1, at = seq(1, 30, by = 2),cex.axis=1.5,cex=1.5) } Kmeans <- function() { # Kmeans clustering x <- convertedData.out #x <- readData.out #par(mfrow=c(1,2)) n=input_nGenesKNN if(n>maxGeneClustering) n = maxGeneClustering # max if(n>dim(x)[1]) n = dim(x)[1] # max as data #x1 <- x; #x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean) #x = 100* x[1:n,] / apply(x[1:n,],1,sum) x = x[1:n,] if( input_kmeansNormalization == 'L1Norm') x = 100* x / apply(x,1,function(y) sum(abs(y))) else # L1 norm if( input_kmeansNormalization == 'geneMean') x = x - apply(x,1,mean) else # this is causing problem?????? if( input_kmeansNormalization == 'geneStandardization') x = (x - apply(x,1,mean) ) / apply(x,1,sd) #colnames(x) = gsub("_.*","",colnames(x)) set.seed(input_KmeansReRun) k=input_nClusters cl = kmeans(x,k,iter.max = 50) #myheatmap(cl$centers) hc <- hclust2(dist2(cl$centers-apply(cl$centers,1,mean) ) )# perform cluster for the reordering of samples tem = match(cl$cluster,hc$order) # new order x = x[order(tem),] ; bar = sort(tem) #myheatmap2(x-apply(x,1,mean), bar,1000) return(list( x = x, bar = bar)) } myheatmap2 <- function (x,bar=NULL,n=-1,mycolor=1,clusterNames=NULL,sideColors=NULL ) { # number of genes to show ngenes = as.character( table(bar)) if(length(bar) >n && n != -1) {ix = sort( sample(1:length(bar),n) ); bar = bar[ix]; x = x[ix,] } if(! is.null(bar) ) if(is.null(sideColors) ) sideColors = mycolors # this will cutoff very large values, which could skew the color x=as.matrix(x)-apply(x,1,mean) cutoff = median(unlist(x)) + 3*sd (unlist(x)) x[x>cutoff] <- cutoff cutoff = median(unlist(x)) - 3*sd (unlist(x)) x[x< cutoff] <- cutoff #colnames(x)= detectGroups(colnames(x)) if(is.null(bar)) # no side colors heatmap.2(x, Rowv =F,Colv=F, dendrogram ="none", col=heatColors[as.integer(mycolor),], density.info="none", trace="none", scale="none", keysize=.3 ,key=F, labRow = F, #,RowSideColors = mycolors[bar] ,margins = c(8, 24) ,srtCol=45 ) else heatmap.2(x, Rowv =F,Colv=F, dendrogram ="none", col=heatColors[as.integer(mycolor),], density.info="none", trace="none", scale="none", keysize=.3 ,key=F, labRow = F, ,RowSideColors = sideColors[bar] ,margins = c(8, 24) ,srtCol=45 ) if(!is.null(bar)) { legend.text = paste("Cluster ", toupper(letters)[unique(bar)], " (N=", ngenes,")", sep="") if( !is.null( clusterNames ) && length(clusterNames)>= length( unique(bar) ) ) legend.text = paste(clusterNames[ 1:length( unique(bar) ) ], " (N=", ngenes,")", sep="") par(lend = 1) # square line ends for the color legend legend("topright", # location of the legend on the heatmap plot legend = legend.text, # category labels col = sideColors, # color key lty= 1, # line style lwd = 10 ) # line width } } KmeansHeatmap <- function() { # Kmeans clustering myheatmap2(Kmeans.out$x-apply(Kmeans.out$x,1,mean), Kmeans.out$bar,1000,mycolor=input_heatColors1) } tSNEgenePlot <- function() { Cluster <- Kmeans.out$bar train <- as.data.frame( cbind(Cluster,Kmeans.out$x) ) library(Rtsne,verbose=FALSE) train = unique(train) Cluster = train$Cluster set.seed(input_seedTSNE) ## Executing the algorithm on curated data tsne <- Rtsne(train[,-1], dims = 2, perplexity=30, verbose=FALSE, max_iter = 400) nClusters = length(unique(Cluster) ) if(input_colorGenes) { plot(tsne$Y[,1], tsne$Y[,2], pch = (0:(nClusters-1))[Cluster], cex = 1.,col = mycolors[Cluster], xlab="X",ylab="Y") legend("topright",toupper(letters)[1:nClusters], pch = 0:(nClusters-1), col=mycolors, title="Cluster" ) } else plot(tsne$Y[,1], tsne$Y[,2], cex = 1., xlab="X",ylab="Y") } # Main function. Find a query set of genes enriched with functional category FindOverlap <- function (converted,gInfo, GO,selectOrg,minFDR, reduced = FALSE) { maxTerms =15 # max number of enriched terms idNotRecognized = as.data.frame("ID not recognized!") if(is.null(converted) ) return(idNotRecognized) # no ID # only coding gInfo <- gInfo[which( gInfo$gene_biotype == "protein_coding"),] querySet <- intersect( converted$IDs, gInfo[,1]); if(length(querySet) == 0) return(idNotRecognized ) ix = grep(converted$species[1,1],gmtFiles) totalGenes <- converted$species[1,7] if (length(ix) == 0 ) {return(idNotRecognized )} # If selected species is not the default "bestMatch", use that species directly if(selectOrg != speciesChoice[[1]]) { ix = grep(findSpeciesById(selectOrg)[1,1], gmtFiles ) if (length(ix) == 0 ) {return(idNotRecognized )} totalGenes <- orgInfo[which(orgInfo$id == as.numeric(selectOrg)),7] } pathway <- dbConnect(sqlite,gmtFiles[ix],flags=SQLITE_RO) sqlQuery = paste( " select distinct gene,pathwayID from pathway where gene IN ('", paste(querySet,collapse="', '"),"')" ,sep="") #cat(paste0("HH",GO,"HH") ) if( GO != "All") sqlQuery = paste0(sqlQuery, " AND category ='",GO,"'") result <- dbGetQuery( pathway, sqlQuery ) if( dim(result)[1] ==0) {return(as.data.frame("No matching species or gene ID file!" )) } # given a pathway id, it finds the overlapped genes, symbol preferred sharedGenesPrefered <- function(pathwayID) { tem <- result[which(result[,2]== pathwayID ),1] ix = match(tem, converted$conversionTable$ensembl_gene_id) # convert back to original tem2 <- unique( converted$conversionTable$User_input[ix] ) if(length(unique(gInfo$symbol) )/dim(gInfo)[1] >.7 ) # if 70% genes has symbol in geneInfo { ix = match(tem, gInfo$ensembl_gene_id); tem2 <- unique( gInfo$symbol[ix] ) } return( paste( tem2 ,collapse=" ",sep="") )} x0 = table(result$pathwayID) x0 = as.data.frame( x0[which(x0>=Min_overlap)] )# remove low overlaps if(dim(x0)[1] <= 5 ) return(idNotRecognized) # no data colnames(x0)=c("pathwayID","overlap") pathwayInfo <- dbGetQuery( pathway, paste( " select distinct id,n,Description from pathwayInfo where id IN ('", paste(x0$pathwayID,collapse="', '"), "') ",sep="") ) x = merge(x0,pathwayInfo, by.x='pathwayID', by.y='id') x$Pval=phyper(x$overlap-1,length(querySet),totalGenes - length(querySet),as.numeric(x$n), lower.tail=FALSE ); x$FDR = p.adjust(x$Pval,method="fdr") x <- x[ order( x$FDR) ,] # sort according to FDR if(dim(x)[1] > maxTerms ) x = x[1:maxTerms,] if(min(x$FDR) > minFDR) x=as.data.frame("No significant enrichment found!") else { x <- x[which(x$FDR < minFDR),] x= cbind(x,sapply( x$pathwayID, sharedGenesPrefered ) ) colnames(x)[7]= "Genes" x <- subset(x,select = c(FDR,overlap,n,description,Genes) ) colnames(x) = c("Corrected P value (FDR)", "Genes in list", "Total genes in category","Functional Category","Genes" ) # remove redudant gene sets if(reduced != FALSE ){ # reduced=FALSE no filtering, reduced = 0.9 filter sets overlap with 90% n= nrow(x) tem=rep(TRUE,n ) geneLists = lapply(x$Genes, function(y) unlist( strsplit(as.character(y)," " ) ) ) for( i in 2:n) for( j in 1:(i-1) ) { if(tem[j]) { # skip if this one is already removed commonGenes = length(intersect(geneLists[i] ,geneLists[j] ) ) if( commonGenes/ length(geneLists[j] ) > reduced ) tem[i] = FALSE } } x <- x[which(tem),] } } dbDisconnect(pathway) return(x ) } # Given a gene set, finds significant overlaps with a gene set database object findOverlapGMT <- function ( query, geneSet, minFDR=.2 ,minSize=2,maxSize=10000 ){ #geneSets <- readGMT("exampleData/MousePath_TF_gmt.gmt") #query <- geneSets[['TF_MM_FRIARD_C-REL']] #query <- query[1:60] total_elements = 30000 Min_overlap <- 1 maxTerms =10 # max number of enriched terms noSig <- as.data.frame("No significant enrichment found!") query <- cleanGeneSet(query) # convert to upper case, unique() if(length(query) <=2) return(noSig) if(length(geneSet) <1) return(noSig) geneSet <- geneSet[which(sapply(geneSet,length) > minSize)] # gene sets smaller than 1 is ignored!!! geneSet <- geneSet[which(sapply(geneSet,length) < maxSize)] # gene sets smaller than 1 is ignored!!! result <- unlist( lapply(geneSet, function(x) length( intersect (query, x) ) ) ) result <- cbind(unlist( lapply(geneSet, length) ), result ) result <- result[ which(result[,2]>Min_overlap), ,drop=F] if(dim(result)[1] == 0) return( noSig) xx <- result[,2] mm <- length(query) nn <- total_elements - mm kk <- result[,1] Pval_enrich=phyper(xx-1,mm,nn,kk, lower.tail=FALSE ); FDR <- p.adjust(Pval_enrich,method="fdr",n=length(geneSet) ) result <- as.data.frame(cbind(FDR,result)) result <- result[,c(1,3,2)] result$pathway = rownames(result) result$Genes = "" # place holder just colnames(result)= c("Corrected P value (FDR)", "Genes in list", "Total genes in category","Functional Category","Genes" ) result <- result[ which( result[,1] < minFDR),,drop=F] if( dim( result)[1] == 0) return(noSig) if(min(FDR) > minFDR) return(noSig) result <- result[order(result[,1] ),] if(dim(result)[1] > maxTerms ) result <- result[1:maxTerms,] return( result) } KmeansGO <- function() { pp=0 minFDR = 0.01 for( i in 1:input_nClusters) { query = rownames(Kmeans.out$x)[which(Kmeans.out$bar == i)] result <- findOverlapGMT( query, GeneSets.out,1) if( dim(result)[2] ==1) next; # result could be NULL result$Genes = toupper(letters)[i] if (pp==0 ) { results <- result; pp <- 1; } else { results <- rbind(results,result) } } if(pp == 0) return( as.data.frame("No enrichment found.")) results= results[,c(5,1,2,4)] colnames(results)= c("Cluster","FDR","Genes","Pathways") if(min(results$FDR) > minFDR ) results = as.data.frame("No signficant enrichment found.") else results = results[which(results$FDR < minFDR),] if( is.null(results) ) return ( as.matrix("No significant enrichment.") ) if( class(results) != "data.frame") return ( as.matrix("No significant enrichment.") ) if( dim(results)[2] ==1) return ( as.matrix("No significant enrichment.") ) colnames(results)[2] = "adj.Pval" results$Genes <- as.character(results$Genes) results$Cluster[which( duplicated(results$Cluster) ) ] <- "" results } ################################################################ # Differential expression ################################################################ # Differential expression using LIMMA DEG.limma <- function (x, maxP_limma=.1, minFC_limma=2, rawCounts,countsDEGMethods,priorCounts, dataFormat, selectedComparisons=NULL, sampleInfo = NULL,modelFactors=NULL, blockFactor = NULL){ library(limma,verbose=FALSE) # Differential expression library(statmod,verbose=FALSE) # many different situations: 1. just use sample names 2. just one factor 3. two factors no interaction # 4. two factors with interaction 5. block factor topGenes = list(); limmaTrend = FALSE if( dataFormat == 2) { # if normalized data eset = new("ExpressionSet", exprs=as.matrix(x)) } else { # counts data if (countsDEGMethods == 1 ) { # limma-trend method selected for counts data #dge <- DGEList(counts=rawCounts); #dge <- calcNormFactors(dge, method = "TMM") #eset <- cpm(dge, log=TRUE, prior.count=priorCounts) eset = new("ExpressionSet", exprs=as.matrix(x)) # use transformed data for limma limmaTrend = TRUE } } groups = colnames(x) groups = detectGroups( groups) g = unique(groups) # check for replicates, removes samples without replicates reps = as.matrix(table(groups)) # number of replicates per biological sample if ( sum( reps[,1] >= 2) <2 ) # if less than 2 samples with replicates return( list(results= NULL, comparisons = NULL, Exp.type="Failed to parse sample names to define groups. Cannot perform DEGs and pathway analysis. Please double check column names! Use WT_Rep1, WT_Rep2 etc. ", topGenes=NULL)) # remove samples without replicates g <- rownames(reps)[which(reps[,1] >1)] ix <- which( groups %in% g) groups <- groups[ix] x<- x[,ix]; rawCounts <- rawCounts[,ix] if(length(g) ==2 ) { g= unique(groups) comparisons <- paste(g[2],"-",g[1],sep="") # "Mutant-WT" # no sample file, but user selected comparisons using column names if( is.null(modelFactors) & length( selectedComparisons) >0 ) comparisons = selectedComparisons design <- model.matrix(~0+groups) colnames(design) <- g if( !is.null(rawCounts) && countsDEGMethods == 2) { # voom dge <- DGEList(counts=rawCounts); dge <- calcNormFactors(dge, method = "TMM") # normalization v <- voom(dge, design); fit <- lmFit(v, design) } else fit <- lmFit(eset, design) # regular limma cont.matrix <- makeContrasts(contrasts=comparisons, levels=design) fit2 <- contrasts.fit(fit, cont.matrix) fit2 <- eBayes(fit2, trend=limmaTrend) # calls differential gene expression 1 for up, -1 for down results <- decideTests(fit2, p.value=maxP_limma, lfc=log2(minFC_limma) ) #vennDiagram(results,circle.col=rainbow(5)) topGenes1 =topTable(fit2, number = 1e12,sort.by="M" ) if (dim(topGenes1)[1] != 0) { topGenes1 = topGenes1[,c('logFC','adj.P.Val')] # topGenes1[,1] <- -1* topGenes1[,1] # reverse direction topGenes[[1]] <- topGenes1 } # log fold change is actually substract of means. So if the data is natral log transformed, it shoudl be natral log. Exp.type = "2 sample groups." } if(length(g) > 2 ) { # more than two sample groups design <- model.matrix(~ 0+factor(groups)) colnames(design) <- gsub(".*)","",colnames(design)) if( !is.null(rawCounts) && countsDEGMethods == 2) { # voom v <- voom(rawCounts, design); fit <- lmFit(v, design) } else fit <- lmFit(eset, design) fit <- eBayes(fit, trend=limmaTrend) comparisons = "" for( i in 1:(length(g)-1) ) for (j in (i+1):length(g)) comparisons = c(comparisons,paste(g[j],"-",g[i],sep="" ) ) comparisons <- comparisons[-1] # no sample file, but user selected comparisons using column names if( is.null(modelFactors) & length( selectedComparisons) >0 ) comparisons = selectedComparisons contrast1 <- makeContrasts(contrasts=comparisons[1], levels=design) if(length(comparisons)>1 ) for( kk in 2:length(comparisons) ) contrast1<- cbind(contrast1,makeContrasts(contrasts=comparisons[kk], levels=design) ) Exp.type = paste(length(g)," sample groups detected.") # if factorial design 2x2, 2x3, 3x5 etc. # all samples must be something like WT_control_rep1 if ( sum(sapply(strsplit(g,"_"),length) == 2 ) == length(g) ) { #comparisons comparisons = "" for( i in 1:(length(g)-1) ) for (j in (i+1):length(g)) if( strsplit(g[i],"_")[[1]][1] == strsplit(g[j],"_")[[1]][1]| strsplit(g[i],"_")[[1]][2] == strsplit(g[j],"_")[[1]][2]) # only compare WT_control vs. WT_treatment comparisons = c(comparisons,paste(g[j],"-",g[i],sep="" ) ) comparisons <- comparisons[-1] #factors genotype treatment levels extract_treatment <- function (x) paste( gsub( ".*_","",unlist( strsplit(x,"-")) ), collapse="-") extract_genotype <- function (x) gsub( "_.*","",unlist( strsplit(x,"-")) )[1] extract_treatment_counting <- unique( gsub( ".*_","",unlist( strsplit(g,"-")) )) treatments = sapply(comparisons, extract_treatment) genotypes = sapply(comparisons, extract_genotype) Exp.type = paste( Exp.type, "\nFactorial design:",length(unique(genotypes)),"X", length( extract_treatment_counting ), sep="" ) # pairwise contrasts contrast1 <- makeContrasts(contrasts=comparisons[1], levels=design) for( kk in 2:length(comparisons) ) contrast1<- cbind(contrast1,makeContrasts(contrasts=comparisons[kk], levels=design) ) contrast.names = colnames(contrast1) # interaction contrasts for ( kk in 1:(length(comparisons)-1) ) { for( kp in (kk+1):length(comparisons)) if( treatments[kp]== treatments[kk] ) { contrast1 = cbind(contrast1, contrast1[,kp]- contrast1[,kk] ) contrast.names = c(contrast.names, paste("I:", genotypes[kp], "-", genotypes[kk],"(",gsub("-",".vs.",treatments[kp]),")",sep="" ) ) } } colnames(contrast1)=contrast.names comparisons = contrast.names } # if interaction terms # if sample information is uploaded and user selected factors and comparisons if( !is.null(modelFactors) & length( selectedComparisons) >0 ) { Exp.type = paste("Model: ~", paste(modelFactors,collapse=" + ") ) interactionTerm = FALSE # default value to be re-write if needed # model factors that does not contain interaction terms # modelFactors "genotype", "condition", "genotype:condition" keyModelFactors = modelFactors[ !grepl(":",modelFactors) ] # "genotype: control vs. mutant" factorsVector= gsub(":.*","",selectedComparisons) # corresponding factors for each comparison # remove factors not used in comparison, these are batch effects/pairs/blocks, # keyModelFactors = keyModelFactors[ !is.na(match(keyModelFactors, factorsVector)) ] # if a factor is selected both in block and main factors, then use it as block factor keyModelFactors = keyModelFactors[ is.na(match(keyModelFactors, blockFactor)) ] #------Design matrix sampleInfo2 = sampleInfo[,keyModelFactors,drop=F] # remove factors not used. groups = apply(sampleInfo2,1, function(x) paste(x,collapse="_") ) g = unique(groups) design <- model.matrix(~ 0+factor(groups)) colnames(design) <- gsub(".*)","",colnames(design)) if( !is.null(rawCounts) && countsDEGMethods == 2) { # voom v <- voom(rawCounts, design); fit <- lmFit(v, design) } else fit <- lmFit(eset, design) fit <- eBayes(fit, trend=limmaTrend) #-----------Making comaprisons if( length(keyModelFactors) != 2 | length(blockFactor) >1 ) { # if only one factor, or more than two then use all pairwise comparisons comparisons = gsub(".*: ","",selectedComparisons) comparisons = gsub(" vs\\. ","-",comparisons) } else if( length(keyModelFactors) == 2 ){ # two key factors if( sum(grepl(":",modelFactors) >0) ) { # interaction? interactionTerm =TRUE # all pairwise comparisons comparisons = "" for( i in 1:(length(g)-1) ) for (j in (i+1):length(g)) if( strsplit(g[i],"_")[[1]][1] == strsplit(g[j],"_")[[1]][1]| strsplit(g[i],"_")[[1]][2] == strsplit(g[j],"_")[[1]][2]) # only compare WT_control vs. WT_treatment comparisons = c(comparisons,paste(g[j],"-",g[i],sep="" ) ) comparisons <- comparisons[-1] # pairwise contrasts contrast1 <- makeContrasts(contrasts=comparisons[1], levels=design) if(length(comparisons)>1 ) for( kk in 2:length(comparisons) ) contrast1<- cbind(contrast1,makeContrasts(contrasts=comparisons[kk], levels=design) ) contrast.names = colnames(contrast1) # all possible interactions # interaction contrasts contrast2 = NULL contrast.names ="" for ( kk in 1:(dim(contrast1)[2]-1) ) { for( kp in (kk+1):dim(contrast1)[2]) #if( treatments[kp]== treatments[kk] ) { if(is.null(contrast2)) contrast2 = contrast1[,kp]- contrast1[,kk] else contrast2 = cbind(contrast2, contrast1[,kp]- contrast1[,kk] ) contrast.names = c(contrast.names, paste0("I:", colnames(contrast1)[kp], ".vs.", colnames(contrast1)[kk] ) ) } } colnames(contrast2)=contrast.names[-1] # remove nonsense contrasts from interactions contrast2 = contrast2[,which(apply(abs(contrast2),2,max)==1),drop=F] contrast2 = contrast2[,which(apply(abs(contrast2),2,sum)==4),drop=F] contrast2 = t( unique(t(contrast2)) ) # remove duplicate columns # remove unwanted contrasts involving more than three levels in either factor keep= c() for( i in 1:dim(contrast2)[2]) { tem = rownames(contrast2)[ contrast2[ ,i ] != 0 ] tem1 = unique ( unlist(gsub("_.*","", tem) ) ) tem2 = unique ( unlist(gsub(".*_","", tem) ) ) if( length(tem1) == 2 & length(tem2) ==2 ) keep = c(keep, colnames(contrast2) [i] ) } contrast2 = contrast2[,keep,drop=F] comparisons2 = colnames(contrast2) } # "stage: MN vs. EN" --> c("MN_AB-EN_AB", "EN_Nodule-EN_AB") # comparisons in all levels of the other factor transformComparisons <- function (comparison1){ tem = gsub(".*: ","",comparison1) tem = unlist(strsplit(tem, " vs\\. ") ) # control mutant factor1= gsub(":.*","",comparison1) ix = match(factor1, keyModelFactors) # 1: first factor, 2: 2nd factor otherFactor = keyModelFactors[3-ix] # 3-1 = 2; 3-1=1 otherFactorLevels = unique( sampleInfo2[,otherFactor] ) comparisons = c( ) for (factorLevels in otherFactorLevels) { if( ix == 1){ comparisons = c( comparisons, paste(paste0( tem, "_",factorLevels),collapse="-") ) } else { comparisons = c( comparisons, paste(paste0(factorLevels, "_", tem),collapse="-") ) } } return(comparisons) } comparisons = unlist( sapply(selectedComparisons, transformComparisons )) comparisons = as.vector(comparisons) } # two factors # make contrasts contrast1 <- makeContrasts(contrasts=comparisons[1], levels=design) if(length(comparisons) >1 ) for( kk in 2:length(comparisons) ) contrast1<- cbind(contrast1,makeContrasts(contrasts=comparisons[kk], levels=design) ) if( interactionTerm ) { # if interaction terms contrast1 = cbind(contrast1,contrast2) contrast.names = c(colnames(contrast1), colnames(contrast2) ) comparisons = c(comparisons,comparisons2) } # block design to remove batch effect or paired samples # corfit <- duplicateCorrelation(eset,design,block=targets$Subject) # corfit$consensus #Then this inter-subject correlation is input into the linear model fit: # fit <- lmFit(eset,design,block=targets$Subject,correlation=corfit$consensus) if(length(blockFactor) >= 1 ) { # if a factor is selected as block if(length(blockFactor) >= 1 ) blockFactor = blockFactor[1] # if multiple use the first one block = sampleInfo[, blockFactor] # the column not used if( !is.null(rawCounts) && countsDEGMethods == 2) { # voom v <- voom(rawCounts, design); corfit <- duplicateCorrelation(v,design,block= block) fit <- lmFit(v, design,block=block,correlation=corfit$consensus) } else { corfit <- duplicateCorrelation(eset,design,block= block) fit <- lmFit(eset, design,block=block,correlation=corfit$consensus) } fit <- eBayes(fit, trend=limmaTrend) } # block factors } # use selected factors fit2 <- contrasts.fit(fit, contrast1) fit2 <- eBayes(fit2, trend=limmaTrend) #topTable(fit2, coef=1, adjust="BH") results <- decideTests(fit2, p.value=maxP_limma, lfc= log2(minFC_limma )) #vennDiagram(results[,1:5],circle.col=rainbow(5)) #colnames(results) = unlist(sapply( colnames(results), changeNames ) ) #comparisons3 <- unlist(sapply( comparisons, changeNames ) ) # extract fold change for each comparison # there is issues with direction of foldchange. Sometimes opposite top <- function (comp) { tem <- topTable(fit2, number = 1e12,coef=comp,sort.by="M" ) if(dim(tem)[1] == 0) { return (1) } else { # compute fold change for the first gene (ranked by absolute value) tem2 = as.numeric( x[ which(rownames(x)== rownames(tem)[1]) , ] ) names(tem2) = colnames(x) return( tem[,c(1,5)]) } } # no significant gene returns 1, otherwise a data frame topGenes <- lapply(comparisons, top) topGenes <- setNames(topGenes, comparisons ) ix <- which( unlist( lapply(topGenes, class) ) == "numeric") if( length(ix)>0) topGenes <- topGenes[ - ix ] # if (length(topGenes) == 0) topGenes = NULL; } #cat("\n", names(topGenes) ) #cat("\n", colnames(results)) #cat("\n", comparisons3) #comparisons <- comparisons3 # it does not make any sense! comparisons can not be changed! #comparisons =comparisons3 #cat("\n", comparisons) return( list(results= results, comparisons=comparisons, Exp.type=Exp.type, topGenes=topGenes)) } # Differential expression using DESeq2 DEG.DESeq2 <- function ( rawCounts,maxP_limma=.05, minFC_limma=2, selectedComparisons=NULL, sampleInfo = NULL,modelFactors=NULL, blockFactor = NULL, referenceLevels=NULL){ library(DESeq2,verbose=FALSE) # count data analysis groups = as.character ( detectGroups( colnames( rawCounts ) ) ) g = unique(groups)# order is reversed # check for replicates, removes samples without replicates reps = as.matrix(table(groups)) # number of replicates per biological sample if ( sum( reps[,1] >= 2) <2 ) # if less than 2 samples with replicates return( list(results= NULL, comparisons = NULL, Exp.type="Failed to parse sample names to define groups. Cannot perform DEGs and pathway analysis. Please double check column names! Use WT_Rep1, WT_Rep2 etc. ", topGenes=NULL)) # remove samples without replicates g <- rownames(reps)[which(reps[,1] >1)] ix <- which( groups %in% g) groups <- groups[ix] rawCounts <- rawCounts[,ix] Exp.type = paste(length(g)," sample groups detected.") comparisons = "" for( i in 1:(length(g)-1) ) for (j in (i+1):length(g)) comparisons = c(comparisons,paste(g[j],"-",g[i],sep="" ) ) comparisons <- comparisons[-1] colData = cbind(colnames(rawCounts), groups ) # no sample file, but user selected comparisons using column names if( is.null(modelFactors) & length( selectedComparisons) >0 ) comparisons = selectedComparisons comparisons2 = comparisons # this is for showing comparison names, which might be different from internally # Set up the DESeqDataSet Object and run the DESeq pipeline dds = DESeqDataSetFromMatrix(countData=rawCounts, colData=colData, design=~groups) if( is.null(modelFactors) ) dds = DESeq(dds) else { # using selected factors and comparisons # build model modelFactors = c(modelFactors,blockFactor) # block factor is just added in. factors = modelFactors # selected factors and interactions: c( "strain", "treatment", "strain:treatment") factors = factors[ !grepl(":",factors )] # non-interaction terms # interaction terms like strain:treatment Interactions = modelFactors[ grepl(":",modelFactors )] colData = sampleInfo factorsCoded = toupper(letters )[1: dim(colData)[2] ] # Factors are encoded as "A", "B", "C"; this avoid illigal letters names(factorsCoded) = colnames(colData) # this is for look up; each column of sampleInfo colnames(colData) = factorsCoded # all columns named A B C D colData = as.data.frame(colData) # set reference levels for factors if(! is.null( referenceLevels) ) { # c("genotype:wt", "treatment:control" ) # first factor for ( refs in referenceLevels) if(! is.null( refs) ) { ix = match(gsub(":.*","",refs), colnames(sampleInfo) ) # corresponding column id for factor colData[,ix] = as.factor( colData[,ix] ) colData[,ix] = relevel(colData[,ix],gsub(".*:","",refs) ) } } # base model DESeq2.Object= paste("dds = DESeqDataSetFromMatrix(countData=rawCounts, colData=colData, design=~ ", paste( factorsCoded[factors],collapse="+")) # only use selected factors Exp.type = paste("Model: ~", paste(modelFactors,collapse=" + ") ) # create model if( length(Interactions)>0 ) { # if there is interaction for( interactionTerms in Interactions) { interactingFactors = unlist(strsplit(interactionTerms,":" ) ) # split strain:treatment as "strain" and "mutant" tem = paste(factorsCoded [ interactingFactors ],collapse=":") # convert "strain:mutant" to "A:B" DESeq2.Object = paste(DESeq2.Object, " + ",tem) } } DESeq2.Object= paste( DESeq2.Object, ")") # ends the model eval(parse(text = DESeq2.Object) ) dds = DESeq(dds) # main function # comparisons # "group: control vs. mutant" comparisons = gsub(".*: ","",selectedComparisons) comparisons = gsub(" vs\\. ","-",comparisons) factorsVector= gsub(":.*","",selectedComparisons) # corresponding factors for each comparison # comparison2 holds names for display with real factor names # comparison is used in calculation it is A, B, C for factors comparisons2 = comparisons #comparisons2 = gsub(" vs\\. ","-",selectedComparisons) #comparisons2 = gsub(":","_",comparisons2) # Note that with interaction terms, not all meaningful comparisons is listed for selection. # this is complex. Only under reference level. # comparisons due to interaction terms if( length(Interactions)>0 ) { # if there is interaction interactionComparisons = resultsNames(dds) interactionComparisons = interactionComparisons[ grepl("\\.",interactionComparisons ) ] comparisons = c(comparisons,interactionComparisons ) # translate comparisons generated in interaction terms back to real factor names interactionComparisons2 = interactionComparisons for ( i in 1:length(interactionComparisons2 ) ) { tem = unlist(strsplit(interactionComparisons2[i],"\\." ) ) tem_factors = substr(tem,1,1) tem_factors[1] = names(factorsCoded)[factorsCoded == tem_factors[1]] # get the first letter and translate into real factor names tem_factors[2] = names(factorsCoded)[factorsCoded == tem_factors[2]] # get the 2nd letters and translate into real factor names interactionComparisons2[i] <- paste0( "I:",tem_factors[1], "_",substr(tem[1],2,nchar(tem[1]) ),".", tem_factors[2], "_",substr(tem[2],2,nchar(tem[2]) ) ) } comparisons2 = c(comparisons2,interactionComparisons2 ) } } # if selected factors # extract contrasts according to comprisons defined above result1 = NULL; allCalls = NULL; topGenes = list(); pk = 1 # counter pp=0 # first results? for( kk in 1:length(comparisons) ) { tem = unlist( strsplit(comparisons[kk],"-") ) if(is.null(modelFactors)) # if just group comparison using sample names selected = results(dds, contrast=c("groups", tem[1], tem[2]) ) else { if(!grepl("\\.", comparisons[kk] ) ) # if not interaction term: they contain . interaction term selected = results(dds, contrast=c( factorsCoded[ factorsVector[kk] ],tem[1], tem[2]) ) else # either A, B, C ... selected = results(dds, name=comparisons[kk] ) # interaction term } selected$calls =0 selected$calls [which( selected$log2FoldChange > log2(minFC_limma) & selected$padj < maxP_limma ) ] <- 1 selected$calls [ which( selected$log2FoldChange < -log2(minFC_limma) & selected$padj < maxP_limma ) ] <- -1 colnames(selected)= paste( as.character(comparisons2[kk]), "___",colnames(selected),sep="" ) selected = as.data.frame(selected) if (pp==0){ # if first one with significant genes, collect gene list and Pval+ fold result1 = selected; pp = 1; # selected[,2] <- -1 * selected[,2] # reverse fold change direction topGenes[[1]] = selected[,c(2,6)]; names(topGenes)[1] = comparisons2[kk]; } else { result1 = merge(result1,selected,by="row.names"); rownames(result1) = result1[,1]; result1 <- result1[,-1] pk= pk+1; # selected[,2] <- -1 * selected[,2] # reverse fold change direction topGenes[[pk]] = selected[,c(2,6)]; names(topGenes)[pk] = comparisons2[kk]; # assign name to comprison } } Interactions = c() if( !is.null(modelFactors) ) Interactions = modelFactors[ grepl(":",modelFactors )] #--- add comprisons for non-reference levels. It adds to the results1 object. if( length(Interactions)>0 ) { # if there is interaction factorLookup=c() # a factor whose values are factors and names are factor and level combination conditionTreated, genotypeWT levelLookup = c() for( i in 1:dim(sampleInfo)[2]) { sampleInfo2 = unique(sampleInfo) tem = rep(toupper(letters)[i],dim(sampleInfo2)[1] ) names(tem) = paste0(toupper(letters)[i],sampleInfo2[,i]) factorLookup = c(factorLookup,tem) tem = as.character( sampleInfo2[,i] ) names(tem) = paste0(toupper(letters)[i],sampleInfo2[,i]) levelLookup = c(levelLookup, tem) } # split genotypeI.conditionTrt --> c("genotype","I","conditoin","Trt") splitInteractionTerms <- function (term) { if(!grepl("\\.",term) ) return(NULL) terms2 = unlist(strsplit(term,"\\.") ) # factor1, level1, factor2, level2 return(c(factorLookup[terms2[1]], levelLookup[terms2[1]],factorLookup[terms2[2]], levelLookup[terms2[2]] ) ) } # none interaction terms NoneInterTerms = resultsNames(dds)[ !grepl( "\\.", resultsNames(dds)) ] NoneInterTerms=NoneInterTerms[-1] allInteractionTerms = resultsNames(dds)[ grepl( "\\.", resultsNames(dds)) ] for( kk in 1:length(NoneInterTerms) ) { # for each none interaction term if(!is.null(modelFactors) ) {# if not just group comparison using sample names #current factor cFactor = gsub("_.*","",NoneInterTerms[kk] ) for(interactionTerm in allInteractionTerms ) { splited = splitInteractionTerms (interactionTerm) # 4 components if (cFactor != splited[1] & cFactor != splited[3] ) next; selected = results(dds, list(c( NoneInterTerms[kk],interactionTerm ) ) ) comparisonName = paste0( NoneInterTerms[kk],"__", gsub("\\.","",interactionTerm) ) if( cFactor == splited[1] ) otherLevel = splited[4] else otherLevel = splited[2] comparisonName = paste0(#names(factorsCoded)[which(factorsCoded==cFactor)], # real factor name gsub("_vs_","-", substr(NoneInterTerms[kk], 3, nchar(NoneInterTerms[kk] ) )), # the comparison "_for_",otherLevel) comparisons2 = c(comparisons2, comparisonName) selected$calls =0 selected$calls [which( selected$log2FoldChange > log2(minFC_limma) & selected$padj < maxP_limma ) ] <- 1 selected$calls [ which( selected$log2FoldChange < -log2(minFC_limma) & selected$padj < maxP_limma ) ] <- -1 colnames(selected)= paste( comparisonName, "___",colnames(selected),sep="" ) selected = as.data.frame(selected) if (pp==0){ # if first one with significant genes, collect gene list and Pval+ fold result1 = selected; pp = 1; # selected[,2] <- -1 * selected[,2] # reverse fold change direction topGenes[[1]] = selected[,c(2,6)]; names(topGenes)[1] = comparisonName; } else { result1 = merge(result1,selected,by="row.names"); rownames(result1) = result1[,1]; result1 <- result1[,-1] pk= pk+1; # selected[,2] <- -1 * selected[,2] # reverse fold change direction topGenes[[pk]] = selected[,c(2,6)]; names(topGenes)[pk] = comparisonName; # assign name to comprison } } #for } #if } #for } #if #--- #if( length(comparisons) == 1) topGenes <- topGenes[[1]] # if only one comparison, topGenes is not a list, just a data frame itself. if(! is.null(result1)) { # note that when you only select 1 column from a data frame it automatically converts to a vector. drop =FALSE prevents that. allCalls = as.matrix( result1[,grep("calls",colnames(result1)), drop = FALSE ] ) colnames(allCalls)= gsub("___.*","", colnames(allCalls)) colnames(allCalls)= gsub("\\.","-", colnames(allCalls)) # note that samples names should have no "." colnames(allCalls)= gsub("^I-","I:", colnames(allCalls)) } return( list(results= allCalls, comparisons = comparisons2, Exp.type=Exp.type, topGenes=topGenes)) } # main function limma <- function() { if(input_dataFileFormat == 1 ) { # if count data if(input_CountsDEGMethod == 3 ) { # if DESeq2 method # rawCounts = read.csv("exampleData/airway_GSE52778.csv", row.names=1) # res =DEG.DESeq2(rawCounts, .05, 2) # res1 =DEG.limma(rawCounts, .1, 1.5,rawCounts, 2,3) return( DEG.DESeq2( convertedCounts.out,input_limmaPval, input_limmaFC, input_selectModelComprions, readSampleInfo.out, c(input_selectFactorsModel,input_selectInteractions), input_selectBlockFactorsModel, factorReferenceLevels.out ) ) } if(input_CountsDEGMethod < 3 ) # voom or limma-trend return( DEG.limma(convertedData.out, input_limmaPval, input_limmaFC, convertedCounts.out, input_CountsDEGMethod, priorCounts=input_countsLogStart,input_dataFileFormat, input_selectModelComprions, readSampleInfo.out, c(input_selectFactorsModel,input_selectInteractions), input_selectBlockFactorsModel) ) } else if (input_dataFileFormat == 2 ){ # normalized data return( DEG.limma(convertedData.out, input_limmaPval, input_limmaFC, convertedCounts.out, input_CountsDEGMethod, priorCounts=input_countsLogStart,input_dataFileFormat, input_selectModelComprions, readSampleInfo.out, c(input_selectFactorsModel,input_selectInteractions), input_selectBlockFactorsModel) ) } else { # dataFileFormat == 3 user just uploaded fold change matrix x = convertedData.out pvals = convertedPvals.out if(!is.null(pvals) ) { ix = match(rownames(x), rownames(pvals)) pvals = pvals[ix,] } # looks like ratio data, take log2 if( sum(round(apply(x,2, median) + .2) == 1 ) == dim(x)[2] & min(x) > 0) x = log2(x) Exp.type = "None standard data without replicates." all.Calls = x # fake calls for( i in 1: dim(all.Calls)[2]) { tem <- all.Calls[,i] all.Calls[which( tem <= log2(input_limmaFC) & tem >= -log2(input_limmaFC) ) ,i] = 0 all.Calls[which( tem > log2(input_limmaFC) ) ,i] = 1 all.Calls[which( tem < -log2(input_limmaFC) ) ,i] = -1 if(!is.null(pvals) ) all.Calls[ which( pvals[,i] > input_limmaPval),i] = 0 } comparisons = colnames(all.Calls) extractColumn <- function (i) { topGenes = as.data.frame( convertedData.out[,i,drop=FALSE]) if(is.null(pvals) ) topGenes$FDR = 0 else topGenes$FDR = pvals[,i]# fake fdr colnames(topGenes) = c("Fold","FDR") return(topGenes) } topGenes = lapply( 1:dim( x )[2], extractColumn ) topGenes <- setNames(topGenes, colnames(x ) ) return( list(results= all.Calls, comparisons = colnames(x ), Exp.type=Exp.type, topGenes=topGenes) ) } } DEG.data <- function() { genes = limma.out$results genes = as.data.frame( genes[which( rowSums(genes) != 0 ),] ) colnames(genes) = colnames( limma.out$results ) genes = merge(genes,convertedData.out, by='row.names') colnames(genes)[1] = "1: upregulation, -1: downregulation" # add gene symbol ix = match( genes[,1], allGeneInfo.out[,1]) genes <- cbind(as.character( allGeneInfo.out$symbol)[ix],genes) colnames(genes)[1] = "Symbol" genes <- genes[,c(2,1,3:dim(genes)[2]) ] return(genes) } vennPlot <- function() { results = limma.out$results # split by up or down regulation if(input_UpDownRegulated) { resultUp = results; resultUp[resultUp < 0 ] <- 0; colnames(resultUp) = paste0("Up_", colnames(resultUp)) resultDown = results; resultDown[resultDown > 0] <- 0; colnames(resultDown) = paste0("Down_", colnames(resultDown)) results <- cbind(resultUp, resultDown) } ixa = c() for (comps in input_selectComparisonsVenn) { if(!grepl("^I:|^I-|^Up_I:|^Up_I-|^Down_I:|^Down_I-", comps) ) { # if not interaction term ix = match(comps, colnames(results)) } else { #mismatch in comparison names for interaction terms for DESeq2 #I:water_Wet.genetic_Hy in the selected Contrast #Diff-water_Wet-genetic_Hy in column names tem = gsub("^I-","I:" ,colnames(results)) tem = gsub("-","\\.",tem) ix = match(comps, tem) if(is.na(ix) ) # this is for limma package ix = match(comps, colnames(results)) } ixa = c(ixa,ix) } results = results[,ixa,drop=FALSE] # only use selected comparisons if(dim(results)[2] >5) results <- results[,1:5] colnames(results) = gsub("^I-","I:" ,colnames(results)) vennDiagram(results,circle.col=rainbow(5), cex=c(1.,1, 0.7) ) # part of limma package } sigGeneStats <- function( ){ results = limma.out$results library(reshape2) Up = apply(results, 2, function(x) sum(x == 1) ) Down = apply(results, 2, function(x) sum(x == -1) ) stats = rbind(Up, Down) gg <- melt(stats) colnames(gg) = c("Regulation","Comparisons","Genes") p= ggplot(gg, aes(x=Comparisons, y=Genes, fill= Regulation ) )+ geom_bar(position="dodge", stat="identity") + coord_flip() + theme(legend.position = "top") + scale_fill_manual(values=c("red", "blue")) + ylab("Number of differntially expressed genes") + theme(axis.title.y=element_blank(), axis.text=element_text(size=14)) p } sigGeneStatsTable <- function( ) { results = limma.out$results Up = apply(results, 2, function(x) sum(x == 1) ) Down = apply(results, 2, function(x) sum(x == -1) ) stats = rbind(Up, Down) stats = t(stats) stats=cbind(rownames(stats), stats) colnames(stats)[1]="Comparisons" return(as.data.frame(stats)) } selectedHeatmap.data <- function() { genes <- limma.out$results if( is.null(genes) ) return(NULL) if(!grepl("I:", input_selectContrast) ) { # if not interaction term ix = match(input_selectContrast, colnames(genes)) } else { #mismatch in comparison names for interaction terms for DESeq2 #I:water_Wet.genetic_Hy in the selected Contrast #Diff-water_Wet-genetic_Hy in column names tem = gsub("I-","I:" ,colnames(genes)) tem = gsub("-","\\.",tem) ix = match(input_selectContrast, tem) if(is.na(ix) ) # this is for limma package ix = match(input_selectContrast, colnames(genes)) } if(is.null(ix)) return(NULL) if(is.na(ix)) return(NULL) if( sum(abs(genes[,ix] ) ) <= 1 ) return(NULL) # no significant genes for this comparison if(dim(genes)[2] < ix ) return(NULL) query = rownames(genes)[which(genes[,ix] != 0)] if(length(query) == 0) return(NULL) iy = match(query, rownames(convertedData.out ) ) iz = findContrastSamples( input_selectContrast, colnames(convertedData.out), readSampleInfo.out, input_selectFactorsModel, input_selectModelComprions, factorReferenceLevels.out, input_CountsDEGMethod, input_dataFileFormat ) # color bar bar = genes[,ix] bar = bar[bar!=0] # retreive related data genes = convertedData.out[iy,iz,drop=FALSE] genes = genes[order(bar),,drop=FALSE] # needs to be sorted because myheatmap2 does not reorder genes bar = sort(bar) return(list(genes=genes, bar=bar )) } # find sample index for selected comparisons findContrastSamples <- function(selectContrast, allSampleNames,sampleInfo=NULL, selectFactorsModel=NULL,selectModelComprions =NULL , referenceLevels=NULL, countsDEGMethod=NULL, dataFileFormat=NULL ){ iz= match( detectGroups(allSampleNames), unlist(strsplit( selectContrast, "-")) ) iz = which(!is.na(iz)) if ( !is.null(sampleInfo) & !is.null(selectFactorsModel) & length(selectModelComprions)>0 ) { comparisons = gsub(".*: ","",selectModelComprions) # strings like: "groups: mutant vs. control" comparisons = gsub(" vs\\. ","-",comparisons) factorsVector= gsub(":.*","",selectModelComprions) # corresponding factors # if read counts data and DESeq2 if(dataFileFormat==1 & countsDEGMethod == 3) { # if DESeq2 contrast = gsub("_for_.*","",selectContrast) # could be "wt-mu" or "wt-mu_for_conditionB" ik = match( contrast, comparisons ) # selected contrast lookes like: "mutant-control" otherFactorLevel = gsub(".*_for_","",selectContrast) # find the corresponding factor for the other factor otherFactor=" " if(nchar( otherFactorLevel ) >0){ for( eachFactor in colnames(sampleInfo) ) if ( otherFactorLevel %in% sampleInfo[,eachFactor ] ) otherFactor = eachFactor } if (is.na(ik)) iz=1:(length(allSampleNames)) else { # interaction term, use all samples selectedfactor= factorsVector[ ik ] # corresponding factors iz = which(sampleInfo[,selectedfactor] %in% unlist(strsplit( contrast, "-")) ) #filter by other factors: reference level if(! is.null( referenceLevels) ) { # c("genotype:wt", "treatment:control" ) for ( refs in referenceLevels) if(! is.null( refs) & gsub(":.*","",refs) != selectedfactor ) { currentFactor = gsub(":.*","",refs) if(nchar( otherFactorLevel ) >0 & currentFactor == otherFactor ) { # if not reference level iz = intersect( iz, which(sampleInfo[,currentFactor] == otherFactorLevel ) ) } else iz = intersect( iz, which(sampleInfo[,currentFactor] == gsub(".*:","",refs) ) ) } } iz = iz[which(!is.na(iz))] # switching from limma to DESeq2 causes problem, as reference level is not defined. } } else { # not DESeq2 # given level find corresponding sample ids findIDsFromLevel <- function (aLevel){ # find factor currentFactor="" for( eachFactor in colnames(sampleInfo) ) if ( aLevel %in% sampleInfo[,eachFactor ] ) currentFactor = eachFactor if(nchar(currentFactor) >0 ) return( which(sampleInfo[,currentFactor ] %in% aLevel ) ) else return(NULL) } if( !grepl(".*_.*-.*_.*",selectContrast )) iz = c.out levels4 = unlist( strsplit( unlist( strsplit(selectContrast,"-") ), "_") ) #double split! if(length(levels4)!=4) { iz = c.out } else { iz = intersect( findIDsFromLevel(levels4[1]), findIDsFromLevel(levels4[2]) ) # first sample iz = c(iz, intersect( findIDsFromLevel(levels4[3]), findIDsFromLevel(levels4[4]) ) ) # 2nd sample } } #else } if (grepl("I:",selectContrast)) iz=1:length(allSampleNames) # if it is factor design use all samples if( is.na(iz)[1] | length(iz)<=1 ) iz=1:length(allSampleNames) return(iz) } selectedHeatmap <- function() { bar = selectedHeatmap.data.out$bar +2; bar[bar==3] =2 myheatmap2( selectedHeatmap.data.out$genes,bar,200,mycolor=input_heatColors1,c("Down","Up") ) } selectedHeatmap.data <- function(){ genes <- limma.out$results if( is.null(genes) ) return(NULL) if(!grepl("I:", input_selectContrast) ) { # if not interaction term ix = match(input_selectContrast, colnames(genes)) } else { #mismatch in comparison names for interaction terms for DESeq2 #I:water_Wet.genetic_Hy in the selected Contrast #Diff-water_Wet-genetic_Hy in column names tem = gsub("I-","I:" ,colnames(genes)) tem = gsub("-","\\.",tem) ix = match(input_selectContrast, tem) if(is.na(ix) ) # this is for limma package ix = match(input_selectContrast, colnames(genes)) } if(is.null(ix)) return(NULL) if(is.na(ix)) return(NULL) if( sum(abs(genes[,ix] ) ) <= 1 ) return(NULL) # no significant genes for this comparison if(dim(genes)[2] < ix ) return(NULL) query = rownames(genes)[which(genes[,ix] != 0)] if(length(query) == 0) return(NULL) iy = match(query, rownames(convertedData.out ) ) iz = findContrastSamples( input_selectContrast, colnames(convertedData.out), readSampleInfo.out, input_selectFactorsModel, input_selectModelComprions, factorReferenceLevels.out, input_CountsDEGMethod, input_dataFileFormat ) # color bar bar = genes[,ix] bar = bar[bar!=0] # retreive related data genes = convertedData()[iy,iz,drop=FALSE] genes = genes[order(bar),,drop=FALSE] # needs to be sorted because myheatmap2 does not reorder genes bar = sort(bar) return(list(genes=genes, bar=bar )) } DEG.data <- function(){ genes = limma.out$results genes = as.data.frame( genes[which( rowSums(genes) != 0 ),] ) colnames(genes) = colnames( limma.out$results ) genes = merge(genes,convertedData.out, by='row.names') colnames(genes)[1] = "1: upregulation, -1: downregulation" # add gene symbol ix = match( genes[,1], allGeneInfo.out[,1]) genes <- cbind(as.character( allGeneInfo.out$symbol)[ix],genes) colnames(genes)[1] = "Symbol" genes <- genes[,c(2,1,3:dim(genes)[2]) ] return(genes) } AllGeneListsGMT <- function() { results = limma.out$results results2 = cbind(results, results) colnames(results2)= c( paste0("UP_", colnames(results)),paste0("Down_", colnames(results)) ) for(i in 1:dim(results)[2] ) { results2[,i*2-1] = results[,i] results2[ which(results2[,i*2-1] < 0 ) , i*2-1] = 0 results2[,i*2] = results[,i] results2[ which(results2[,i*2] > 0 ) , i*2] = 0 } geneList1 <- function (i) { ix = which(results2[,i] !=0 ) return(paste0(colnames(results2)[i],"\t", length(ix),"\t", paste(rownames(results2 )[ix], collapse="\t" ) ) ) } tem = sapply(1:dim(results2)[2],geneList1 ) return( paste(tem, collapse="\n") ) } geneListData <- function( ) { noSig = as.data.frame("No significant genes find!") if( is.null(input_selectContrast) ) return(NULL) if( is.null( limma.out$comparisons ) ) return(NULL) # if no significant genes found if( length(limma.out$topGenes) == 0 ) return(noSig) if(length( limma.out$comparisons) ==1 ) { top1=limma.out$topGenes[[1]] } else { top = limma.out$topGenes ix = match(input_selectContrast, names(top)) if( is.na(ix)) return (noSig) top1 <- top[[ix]]; } if(dim(top1)[1] == 0 ) return (noSig) colnames(top1)= c("Fold","FDR") #top1 = merge(top1,convertedData(), by='row.names') #colnames(top1)[1] = "Genes" top1 = top1[order(-abs(top1$Fold)) ,] if ( length( which( top1$FDR <= input_limmaPval & abs(top1$Fold) >= log2(input_limmaFC) ) ) == 0 ) return( noSig) top1 <- top1[which(top1$FDR <= input_limmaPval ) ,] top1 <- top1[which(abs(top1$Fold) >= log2( input_limmaFC)) ,] top1$Top_Genes <- rownames(top1) top1 <- top1[,c(3,1,2)] # if new species if( input_selectGO2 == "ID not recognized!" | input_selectOrg == "NEW") return (top1); #convertedID = convertID(top1[,1],input_selectOrg, "GOBP" );#"gmax_eg_gene" # tem <- geneInfo(convertedID,input_selectOrg) #input_selectOrg ) ; # tem <- geneInfo(converted(),input_selectOrg) top1 <- merge(top1, allGeneInfo.out, by.x ="Top_Genes", by.y="ensembl_gene_id",all.x=T ) if ( sum( is.na(top1$band)) == dim(top1)[1] ) top1$chr = top1$chromosome_name else top1$chr = paste( top1$chromosome_name, top1$band,sep="") top1 <- top1[,c('Top_Genes','Fold','FDR','symbol','chr','gene_biotype')] # ix = match(top1[,1], tem$ensembl_gene_id) # if( sum(is.na( tem$Symbol[ix]) ) != length(ix) ) # { top1 <- cbind(top1, tem$Symbol[ix]); colnames(top1)[4]= "Symbol" } top1 = top1[order(-abs(as.numeric( top1$Fold))) ,] top1$FDR <- sprintf("%-3.2e",top1$FDR ) colnames(top1) <- c("Ensembl ID", "log2 Fold Change", "Adj.Pval", "Symbol","Chr","Type") if ( sum( is.na(top1$Symbol)) == dim(top1)[1] ) top1 <- top1[,-4] return(top1) } volcanoPlot <- function( ) { if(length( limma.out$comparisons) ==1 ) { top1=limma.out$topGenes[[1]] } else { top = limma.out$topGenes ix = match(input_selectContrast, names(top)) if( is.na(ix)) return (NULL) top1 <- top[[ix]]; } if(dim(top1)[1] == 0 ) return (NULL) colnames(top1)= c("Fold","FDR") top1 <- as.data.frame(top1) # convert to data frame top1 <- top1[which(!(is.na(top1$Fold)|is.na(top1$FDR) )),] # remove NA's top1$upOrDown <- 1 #write.csv(top1,"tem.csv") top1$upOrDown[ which(top1$FDR <= input_limmaPval& top1$Fold >= log2( input_limmaFC)) ] <- 2 top1$upOrDown[ which(top1$FDR <= input_limmaPval & top1$Fold <= -log2( input_limmaFC)) ] <- 3 par(mar=c(5,5,1,1)) plot(top1$Fold,-log10(top1$FDR),col = c("grey30", "red","blue")[top1$upOrDown], pch =16, cex = .3, xlab= "log2 fold change", ylab = "- log10 (FDR)", cex.lab=2, cex.axis=2, cex.main=2, cex.sub=2 ) legend("bottomright",c("Upregulated","Downregulated"),fill = c("red","blue"),cex=1.1 ) } scatterPlot <- function( ){ if(length( limma.out$comparisons) ==1 ) { top1=limma.out$topGenes[[1]] } else { top = limma.out$topGenes ix = match(input_selectContrast, names(top)) if( is.na(ix)) return (NULL) top1 <- top[[ix]]; } if(dim(top1)[1] == 0 ) return (NULL) colnames(top1)= c("Fold","FDR") top1 <- as.data.frame(top1) # convert to data frame top1 <- top1[which(!(is.na(top1$Fold)|is.na(top1$FDR) )),] # remove NA's top1$upOrDown <- 1 #write.csv(top1,"tem.csv") top1$upOrDown[ which(top1$FDR <= input_limmaPval& top1$Fold >= log2( input_limmaFC)) ] <- 2 top1$upOrDown[ which(top1$FDR <= input_limmaPval & top1$Fold <= -log2( input_limmaFC)) ] <- 3 iz = findContrastSamples( input_selectContrast, colnames(convertedData.out), readSampleInfo.out, input_selectFactorsModel, input_selectModelComprions, factorReferenceLevels.out, input_CountsDEGMethod, input_dataFileFormat ) genes <- convertedData.out[,iz] g = detectGroups(colnames(genes)) if(length(unique(g)) > 2) { plot.new(); text(0.5,0.5, "Not available.") } else{ average1 <- apply( genes[, which( g == unique(g)[1] ) ],1,mean) average2 <- apply( genes[, which( g == unique(g)[2] ) ],1,mean) genes2 <- cbind(average1,average2) rownames(genes2) = rownames(genes) genes2 <- merge(genes2,top1,by="row.names") par(mar=c(5,5,1,1)) plot(genes2$average2,genes2$average1,col = c("grey45","red","blue")[genes2$upOrDown], pch =16, cex = .3, xlab= paste("Average expression in", unique(g)[2] ), ylab = paste("Average expression in", unique(g)[1] ), cex.lab=2, cex.axis=2, cex.main=2, cex.sub=2 ) legend("bottomright",c("Upregulated","Downregulated"),fill = c("red","blue"),cex=1.3 ) } } MAplot <- function ( ) { if(length( limma.out$comparisons) ==1 ) { top1=limma.out$topGenes[[1]] } else { top = limma.out$topGenes ix = match(input_selectContrast, names(top)) if( is.na(ix)) return (NULL) top1 <- top[[ix]]; } if(dim(top1)[1] == 0 ) return (NULL) colnames(top1)= c("Fold","FDR") top1 <- as.data.frame(top1) # convert to data frame top1 <- top1[which(!(is.na(top1$Fold)|is.na(top1$FDR) )),] # remove NA's top1$upOrDown <- 1 #write.csv(top1,"tem.csv") top1$upOrDown[ which(top1$FDR <= input_limmaPval& top1$Fold >= log2( input_limmaFC)) ] <- 2 top1$upOrDown[ which(top1$FDR <= input_limmaPval & top1$Fold <= -log2( input_limmaFC)) ] <- 3 iz = findContrastSamples( input_selectContrast, colnames(convertedData.out), readSampleInfo.out, input_selectFactorsModel, input_selectModelComprions, factorReferenceLevels.out, input_CountsDEGMethod, input_dataFileFormat ) average1 <- as.data.frame( apply( convertedData()[,iz],1,mean) ) colnames(average1) = "Average" rownames(average1) = rownames(convertedData.out) genes2 <- merge(average1,top1,by="row.names") par(mar=c(5,5,1,1)) plot(genes2$Average,genes2$Fold,col = c("grey45","red","blue")[genes2$upOrDown], pch =16, cex = .3, xlab= "Average expression", ylab = "Log2 fold change", cex.lab=2, cex.axis=2, cex.main=2, cex.sub=2 ) abline(h=0) legend("bottomright",c("Upregulated","Downregulated"),fill = c("red","blue"),cex=1.3 ) } geneListGOTable <- function() { NoSig=NULL # using expression data genes <- selectedHeatmap.data.out$genes if(is.null(genes) ) return(NULL) if(dim(genes)[1] <= minGenesEnrichment ) return(NoSig) # if has only few genes fc = selectedHeatmap.data.out$bar # GO results1 <- NULL; result <- NULL pp <- 0 for( i in c(1,-1) ) { if( length(which(fc*i<0)) <= minGenesEnrichment) next; query = rownames(genes)[which(fc*i<0)] if( length(query) <= minGenesEnrichment) next; result <- findOverlapGMT( query, GeneSets.out,1) if( dim(result)[2] ==1) next; # result could be NULL if(i == -1) result$direction = "Up regulated" else result$direction = "Down regulated" # changed if (pp==0 ) { results1 <- result; pp = 1;} else results1 = rbind(results1,result) } if ( pp == 0 ) return (NoSig) if ( is.null( results1) ) return (NoSig) if( dim(results1)[2] == 1 ) return(NoSig) # Returns a data frame: "No significant results found!" results1= results1[,c(6,1,2,4,5)] # changed colnames(results1)= c("List","FDR","nGenes","GO terms or pathways","Genes") # changed minFDR = 0.01 if(min(results1$FDR) > minFDR ) results1 = as.data.frame("No signficant enrichment found.") else results1 = results1[which(results1$FDR < minFDR),] if(dim(results1)[2] != 5) return(NoSig) # changed colnames(results1)= c("Direction","adj.Pval","nGenes","Pathways", "Genes") # changed results1 } geneListGO <- function() { results1 = geneListGOTable.out tem = input_removeRedudantSets if(dim(results1)[2] ==1) return(results1) else { results1$adj.Pval <- sprintf("%-2.1e",as.numeric(results1$adj.Pval) ) results1[,1] <- as.character(results1[,1]) results1[ duplicated (results1[,1] ),1 ] <- "" return( results1[,-5] ) } } # a program for ploting enrichment results by highlighting the similarities among terms # must have columns: Direction, adj.Pval Pathways Genes # Direction adj.Pval nGenes Pathways Genes #Down regulated 3.58E-59 131 Ribonucleoprotein complex biogenesis 36 Nsun5 Nhp2 Rrp15 #Down regulated 2.55E-57 135 NcRNA metabolic process 23 Nsun5 Nhp2 Rrp15 Emg1 Ddx56 Rsl1d1 enrichmentPlot <- function( enrichedTerms){ # Up or down regulation is color-coded # gene set size if represented by the size of marker enrichmentPlot <- function( enrichedTerms, rightMargin=33) { if(class(enrichedTerms) != "data.frame") return(NULL) library(dendextend) # customizing tree geneLists = lapply(enrichedTerms$Genes, function(x) unlist( strsplit(as.character(x)," " ) ) ) names(geneLists)= enrichedTerms$Pathways # compute overlaps percentage-------------------- n = length(geneLists) w <- matrix(NA, nrow = n, ncol = n) # compute overlaps among all gene lists for (i in 1:n) { for (j in i:n) { u <- unlist(geneLists[i]) v <- unlist(geneLists[j]) w[i, j] = length(intersect(u, v))/length(unique(c(u,v))) } } # the lower half of the matrix filled in based on symmetry for (i in 1:n) for (j in 1:(i-1)) w[i, j] = w[j,i] Terms = paste( sprintf("%-1.0e",as.numeric(enrichedTerms$adj.Pval)), names(geneLists)) rownames(w) = Terms colnames(w) = Terms par(mar=c(0,0,1,rightMargin)) # a large margin for showing dend <- as.dist(1-w) %>% hclust (method="average") ix = dend$order # permutated order of leaves leafType= as.factor( gsub(" .*","", enrichedTerms$Direction[ix] ) ) if(length(unique(enrichedTerms$Direction) ) ==2 ) leafColors = c("green","red") else # mycolors leafColors = mycolors #leafSize = unlist( lapply(geneLists,length) ) # leaf size represent number of genes #leafSize = sqrt( leafSize[ix] ) leafSize = -log10(as.numeric( enrichedTerms$adj.Pval[ix] ) ) # leaf size represent P values leafSize = 1.5*leafSize/max( leafSize ) + .2 dend %>% as.dendrogram(hang=-1) %>% set("leaves_pch", 19) %>% # type of marker set("leaves_cex", leafSize) %>% #Size set("leaves_col", leafColors[leafType]) %>% # up or down genes plot(horiz=TRUE) #legend("top",pch=19, col=leafColors[1:2],legend=levels(leafType),bty = "n",horiz =T ) # add legend using a second layer par(lend = 1) # square line ends for the color legend add_legend("top",pch=19, col=leafColors[1:2],legend=levels(leafType),bty = "n",horiz =T ) } # numChar=100 maximum number of characters # n=200 maximum number of nodes # degree.cutoff = 0 Remove node if less connected #from PPInfer enrich.net2 <- function (x, gene.set, node.id, node.name = node.id, pvalue, n = 50, numChar = NULL, pvalue.cutoff = 0.05, edge.cutoff = 0.05, degree.cutoff = 0, edge.width = function(x) { 5 * x^2 }, node.size = function(x) { 2.5 * log10(x) }, group = FALSE, group.color = c("green","red" ), group.shape = c("circle", "square"), legend.parameter = list("topright"), show.legend = TRUE, plotting=TRUE, ...) { library(igraph) x <- data.frame(x, group) colnames(x)[length(colnames(x))] <- "Group" x <- x[as.numeric( x[, pvalue]) < pvalue.cutoff, ] x <- x[order(x[, pvalue]), ] n <- min(nrow(x), n) if (n == 0) { stop("no enriched term found...") } x <- x[1:n, ] index <- match(x[, node.id], names(gene.set)) geneSets <- list() for (i in 1:n) { geneSets[[i]] <- gene.set[[index[i]]] } names(geneSets) <- x[, node.name] if (is.null(numChar)) { numChar <- max(nchar(as.character(x[, node.name]))) } else { if (length(unique(substr(x[, node.name], 1, numChar))) < nrow(x)) { numChar <- max(nchar(as.character(x[, node.name]))) message("Note : numChar is too small.", "\n") } } x[, node.name] <- paste(substr(x[, node.name], 1, numChar), ifelse(nchar(as.character(x[, node.name])) > numChar, "...", ""), sep = "") w <- matrix(NA, nrow = n, ncol = n) for (i in 1:n) { for (j in i:n) { u <- unlist(geneSets[i]) v <- unlist(geneSets[j]) w[i, j] = length(intersect(u, v))/length(unique(c(u, v))) } } list.edges <- stack(data.frame(w)) list.edges <- cbind(list.edges[, 1], rep(x[, node.name], n), rep(x[, node.name], each = n)) list.edges <- list.edges[list.edges[, 2] != list.edges[,3], ] list.edges <- list.edges[!is.na(list.edges[, 1]), ] g <- graph.data.frame(list.edges[, -1], directed = FALSE) E(g)$width = edge.width(as.numeric(list.edges[, 1])) V(g)$size <- node.size(lengths(geneSets)) g <- delete.edges(g, E(g)[as.numeric(list.edges[, 1]) < edge.cutoff]) index.deg <- igraph::degree(g) >= degree.cutoff g <- delete.vertices(g, V(g)[!index.deg]) x <- x[index.deg, ] index <- index[index.deg] if (length(V(g)) == 0) { stop("no categories greater than degree.cutoff...") } n <- min(nrow(x), n) x <- x[1:n, ] group.level <- sort(unique(group)) pvalues <- x[, pvalue] for (i in 1:length(group.level)) { index <- x[, "Group"] == group.level[i] V(g)$shape[index] <- group.shape[i] group.pvalues <- pvalues[index] if (length(group.pvalues) > 0) { if (max(group.pvalues) == min(group.pvalues)) { V(g)$color[index] <- adjustcolor(group.color[i], alpha.f = 0.5) } else { V(g)$color[index] <- sapply(1 - (group.pvalues - min(group.pvalues))/(max(group.pvalues) - min(group.pvalues)), function(x) { adjustcolor(group.color[i], alpha.f = x) }) } } } if(plotting) { plot(g,, vertex.label.dist=1, ...) if (show.legend) { legend.parameter$legend <- group.level legend.parameter$text.col <- group.color legend.parameter$bty <- "n" do.call(legend, legend.parameter) }} return(g) } enrichmentNetwork <- function(enrichedTerms){ geneLists = lapply(enrichedTerms$Genes, function(x) unlist( strsplit(as.character(x)," " ) ) ) names(geneLists)= enrichedTerms$Pathways enrichedTerms$Direction = gsub(" .*","",enrichedTerms$Direction ) g <- enrich.net2(enrichedTerms, geneLists, node.id = "Pathways", numChar = 100, pvalue = "adj.Pval", edge.cutoff = 0.2, pvalue.cutoff = 1, degree.cutoff = 0, n = 200, group = enrichedTerms$Direction, vertex.label.cex = 1, vertex.label.color = "black") } enrichmentNetworkPlotly <- function(enrichedTerms){ geneLists = lapply(enrichedTerms$Genes, function(x) unlist( strsplit(as.character(x)," " ) ) ) names(geneLists)= enrichedTerms$Pathways g <- enrich.net2(enrichedTerms, geneLists, node.id = "Pathways", numChar = 100, pvalue = "adj.Pval", edge.cutoff = 0.2, pvalue.cutoff = 1, degree.cutoff = 0, n = 200, group = enrichedTerms$Direction, vertex.label.cex = 0.8, vertex.label.color = "black" ,plotting=TRUE) vs <- V(g) es <- as.data.frame(get.edgelist(g)) Nv <- length(vs) Ne <- length(es[1]$V1) # create nodes L <- layout.kamada.kawai(g) Xn <- L[,1] Yn <- L[,2] inc = (max(Yn)-min(Yn))/50 # for shifting #group <- ifelse(V(g)$shape == "circle", "GO", "KEGG") group <- as.character(enrichedTerms$Direction) network <- plot_ly(x = ~Xn, y = ~Yn, type = "scatter", mode = "markers", marker = list(color = V(g)$color, size = V(g)$size*2, symbol= ~V(g)$shape, line = list(color = "gray", width = 2)), hoverinfo = "text", text = ~paste("
", group, "
", names(vs))) %>% add_annotations( x = ~Xn, y = ~Yn+inc, text = names(vs), showarrow = FALSE, font = list(color = "#030303", size = 12)) # create edges edge_shapes <- list() for(i in 1:Ne) { v0 <- es[i,]$V1 v1 <- es[i,]$V2 index0 <- match(v0, names(V(g))) index1 <- match(v1, names(V(g))) edge_shape <- list(type = "line", line = list(color = "gray" , width = E(g)$width[i]/2), x0 = Xn[index0], y0 = Yn[index0], x1 = Xn[index1], y1 = Yn[index1]) edge_shapes[[i]] <- edge_shape } # create network axis <- list(title = "", showgrid = FALSE, showticklabels = FALSE, zeroline = FALSE) h <- layout(network, title = "Enrichment Network", shapes = edge_shapes, xaxis = axis, yaxis = axis) config(h, showLink = TRUE) } #Homo sapies --> hsapiens shortSpeciesNames <- function(tem){ tem2 = strsplit(as.character(tem)," " ) return( tolower( paste0(substr(tem2[[1]][1],1,1), tem2[[1]][2] ) ) ) } findTaxonomyID <- function( speciesName = "mmusculus") { if(!is.null(input_speciesName) ) { # if species name is entered ix = match(input_speciesName, STRING10_species$official_name) } else if( input_selectGO2 != "ID not recognized!" ) { # if no species is entered, try to resolve species using existing info codedNames = sapply(STRING10_species$compact_name,shortSpeciesNames ) ix = match(speciesName , codedNames) if(input_selectOrg != speciesChoice[[1]]) { # if species is entered selectedSpecies = findSpeciesById(input_selectOrg)[1,1] ix = match( gsub("_.*","", selectedSpecies ), codedNames) } } else return(NULL) if(length(ix) == 0 | is.na(ix) ) return(NULL) return(STRING10_species$species_id[ix]) } STRINGdb_geneList <- function() { library(STRINGdb,verbose=FALSE) taxonomyID = findTaxonomyID.out if(is.null( taxonomyID ) ) return(NULL) #Intialization string_db <- STRINGdb$new( version="10", species=taxonomyID, score_threshold=0, input_directory="" ) # using expression data genes <- geneListData.out colnames(genes)[1:2]=c("gene","lfc") mapped <- string_db$map(genes,"gene", removeUnmappedRows = TRUE ) up= subset(mapped, lfc>0, select="STRING_id", drop=TRUE ) down= subset(mapped, lfc<0, select="STRING_id", drop=TRUE ) mappingRatio = nrow(mapped)/ nrow(genes) if(nrow(mapped) == 0) return(NULL) else return( list(up=up, down=down, ratio=mappingRatio, geneTable=mapped ) ) } STRINGDB_mapping_stat <- function( ) { if( is.null(STRINGdb_geneList.out ) ) return("No genes mapped by STRINGdb. Please enter or double-check species name above.") if(! is.null(STRINGdb_geneList.out ) ) { tem=paste0( 100*round(STRINGdb_geneList()$ratio,3), "% genes mapped by STRING web server.") if(STRINGdb_geneList()$ratio <0.3 ) tem = paste(tem, "Warning!!! Very few gene mapped. Double check if the correct species is selected.") return( tem ) } } stringDB_GO_enrichmentData <- function() { library(STRINGdb,verbose=FALSE) tem = input_STRINGdbGO taxonomyID = findTaxonomyID.out if(is.null( taxonomyID ) ) return(NULL) NoSig = as.data.frame("No significant enrichment found.") #################################### #Intialization string_db <- STRINGdb$new( version="10", species=taxonomyID, score_threshold=0, input_directory="" ) # using expression data genes <- selectedHeatmap.data.out$genes if(is.null(genes) ) return(NULL) if(dim(genes)[1] <= minGenesEnrichment ) return(NoSig) # if has only few genes fc = selectedHeatmap.data.out$bar # GO results1 <- NULL; result <- NULL pp <- 0 for( i in c(1:2) ) { if( length(which(fc*i<0)) <= minGenesEnrichment) next; query = rownames(genes)[which(fc*i<0)] if( length(query) <= minGenesEnrichment) next; ids = STRINGdb_geneList.out[[i]] result <- string_db$get_enrichment( ids, category = input_STRINGdbGO, methodMT = "fdr", iea = TRUE ) if(nrow(result) == 0 ) next; if(nrow(result) > 15) result <- result[1:15,] if( dim(result)[2] ==1) next; # result could be NULL if(i == 1) result$direction = "Up regulated" else result$direction = "Down regulated" if (pp==0 ) { results1 <- result; pp = 1;} else results1 = rbind(results1,result) } if ( pp == 0 ) return (NoSig) if ( is.null( results1) ) return (NoSig) if( dim(results1)[2] == 1 ) return(NoSig) # Returns a data frame: "No significant results found!" results1= results1[,c(7,5,3, 6)] colnames(results1)= c("List","FDR","nGenes","GO terms or pathways") minFDR = 0.01 if(min(results1$FDR) > minFDR ) results1 = as.data.frame("No signficant enrichment found.") else results1 = results1[which(results1$FDR < minFDR),] if(dim(results1)[2] != 4) return(NoSig) colnames(results1)= c("Direction","adj.Pval","nGenes","Pathways") results1$adj.Pval <- sprintf("%-2.1e",as.numeric(results1$adj.Pval) ) rownames(results1)=1:nrow(results1) results1[ duplicated (results1[,1] ),1 ] <- "" return( results1 ) } stringDB_network1 <- function( geneLists = 1 ) { # geneLists =2 for both up and down library(STRINGdb) tem = input_nGenesPPI taxonomyID = findTaxonomyID.out if(is.null( taxonomyID ) ) return(NULL) #################################### NoSig = as.data.frame("No significant enrichment found.") if(is.null(STRINGdb_geneList.out ) ) return(NULL) #Intialization string_db <- STRINGdb$new( version="10", species=taxonomyID, score_threshold=0, input_directory="" ) # only up regulated is ploted for( i in c(1:geneLists) ) { ids = STRINGdb_geneList.out[[i]] if(length(ids)> input_nGenesPPI ) # n of genes cannot be more than 400 ids <- ids[1:input_nGenesPPI] string_db$plot_network( ids,add_link=FALSE) } } # generates a html file containing results and also links to STRING-db stringDB_network_link <- function(){ library(STRINGdb,verbose=FALSE) taxonomyID = findTaxonomyID.out if(is.null( taxonomyID ) ) return(NULL) NoSig = as.data.frame("No significant enrichment found.") if(is.null(STRINGdb_geneList.out ) ) return(NULL) #Intialization string_db <- STRINGdb$new( version="10", species=taxonomyID, score_threshold=0, input_directory="" ) # upregulated ids = STRINGdb_geneList.out[[1]] if(length(ids)> input_nGenesPPI ) # n of genes cannot be more than 400 ids <- ids[1:input_nGenesPPI] link1 = string_db$get_link( ids) Pval1 = string_db$get_ppi_enrichment( ids) tem = "
Interactive and annotated PPI networks among DEGs:
" tem = paste(tem, " Up-regulated; " ) # downregulated ids = STRINGdb_geneList()[[2]] if(length(ids)> input_nGenesPPI ) ids <- ids[1:input_nGenesPPI] link2 = string_db$get_link( ids) Pval2 = string_db$get_ppi_enrichment( ids) tem = paste(tem, "   Down-regulated; " ) # both up and down with color code geneTable = STRINGdb_geneList.out$geneTable if(nrow(geneTable)> input_nGenesPPI ) geneTable <- geneTable[1:input_nGenesPPI,] geneTable = string_db$add_diff_exp_color( geneTable, logFcColStr="lfc" ) payload_id <- string_db$post_payload( geneTable$STRING_id,colors=geneTable$color ) link3 = string_db$get_link(geneTable$STRING_id, payload_id = payload_id) tem = paste(tem, "   Both with fold-changes color coded.
" ) tem2 = paste("
PPI enrichment P values: ") tem2 = paste0(tem2,"Up-regulated: ", sprintf("%-3.2e",Pval1[1]), "   Down-regulated: ", sprintf("%-3.2e",Pval2[1]),".") tem2 = paste(tem2, " Small P values indicate more PPIs among DEGs than background.
" ) tem = paste(tem2,tem ) return(tem) } ################################################################ # Pathway analysis ################################################################ gagePathwayData <- function( ){ library(gage,verbose=FALSE) # pathway analysis #################################### myrange = c(input_minSetSize, input_maxSetSize) noSig = as.data.frame("No significant pathway found.") if( length(limma.out$topGenes) == 0 ) return(noSig) if(length( limma.out$comparisons) ==1 ) { top1=limma.out$topGenes[[1]] } else { top = limma.out$topGenes ix = match(input_selectContrast1, names(top)) if( is.na(ix)) return (noSig) top1 <- top[[ix]]; } if(dim(top1)[1] == 0 ) return (noSig) colnames(top1)= c("Fold","FDR") top1 = top1[which(top1$FDR input_nPathwayShow ) top1 <- top1[1:input_nPathwayShow, ,drop=FALSE] top1 <- as.data.frame(top1) top1 <- cbind(rep( input_selectContrast1, dim(top1)[1]),row.names(top1), top1); top1$statistic <- as.character( round(as.numeric(top1$statistic),4)); top1$adj.Pval <- sprintf("%-2.1e",as.numeric(top1$adj.Pval) ) top1[,2] <- as.character(top1[,2]);top1[,1] <- as.character(top1[,1]) colnames(top1)[1] <- "Direction" p.m <- "GAGE" colnames(top1)[2] <- paste(p.m," analysis:", gsub("-"," vs ",input_selectContrast1 ) ) top1[ which( top1[,3] >0),1 ] <- "Up" #gsub("-"," > ",input_selectContrast1 ) top1[ which( top1[,3] <0),1 ] <- "Down" # gsub("-"," < ",input_selectContrast1 ) top1 <- top1[order( top1[,1], -abs(as.numeric( top1[,3]) ) ) ,] top1[ duplicated (top1[,1] ),1 ] <- "" return( top1) } fgseaPathwayData <- function() { library(fgsea,verbose=FALSE) # fast GSEA noSig = as.data.frame("No significant pathway found.") if( length(limma.out$topGenes) == 0 ) return(noSig) if(length( limma.out$comparisons) ==1 ) { top1=limma.out$topGenes[[1]] } else { top = limma.out$topGenes ix = match(input_selectContrast1, names(top)) if( is.na(ix)) return (noSig) top1 <- top[[ix]]; } if(dim(top1)[1] == 0 ) return (noSig) colnames(top1)= c("Fold","FDR") # remove some genes top1 = top1[which(top1$FDR input_nPathwayShow ) top1 <- top1[1:input_nPathwayShow,,drop=FALSE] #top1 <- cbind(row.names(top1), top1); colnames(top1)[1] <-input_selectContrast1 top1 <- as.data.frame(top1) top1 <- cbind(rep( input_selectContrast1, dim(top1)[1]), top1); top1[,4] = as.character( round(as.numeric(top1[,4]),4)); top1$adj.Pval <- sprintf("%-2.1e",as.numeric(top1$adj.Pval) ) top1[,1] <- as.character(top1[,1]) colnames(top1)[1] <- "Direction" p.m <- "GSEA" colnames(top1)[2] <- paste(p.m," analysis:", gsub("-"," vs ",input_selectContrast1 ) ) top1[ which( as.numeric( top1[,3]) >0),1 ] <- "Up" #gsub("-"," > ",input_selectContrast1 ) top1[ which( as.numeric( top1[,3]) <0),1 ] <- "Down" #gsub("-"," < ",input_selectContrast1 ) top1 <- top1[order( top1[,1], -abs(as.numeric( top1[,3]) ) ) ,] top1[ duplicated (top1[,1] ),1 ] <- "" top1[,3] = as.character( round(as.numeric(top1[,3]),4)); return( top1) } extract1 <- function (x) { words <- unlist ( strsplit(x,"_")) if(length( words ) <=4 ) return(gsub("_"," ",x)) else { words <- words[-c(1:4)] return( proper(paste(words,collapse = " ") ) )} } PGSEApathway <- function (converted,convertedData, selectOrg,GO,gmt, myrange,Pval_pathway,top){ subtype = detectGroups(colnames(convertedData)) library(PGSEA,verbose=FALSE) Pvalue = 0.01 # cut off to report in PGSEA. Otherwise NA #Pval_pathway = 0.2 # cut off for P value of ANOVA test to writ to file # top = 30 # number of pathways to show if(length(gmt) ==0 ) return( list(pg3 = NULL, best = 1 ) ) # centering by mean #pg = myPGSEA (convertedData - rowMeans(convertedData), # cl=gmt,range=myrange,p.value=TRUE, weighted=FALSE,nPermutation=100) #if( class(convertedData) != "data.frame" | class(convertedData) != "matarix") return( list(pg3 = NULL, best = 1 ) ) #if( dim(convertedData)[2] <2 ) return( list(pg3 = NULL, best = 1 ) ) pg = PGSEA (convertedData - rowMeans(convertedData),cl=gmt,range=myrange,p.value=TRUE, weighted=FALSE) pg2 = pg$results; pg2 = pg2[rowSums(is.na(pg2))F)"]][1] ) } Pvalues = sapply(1:dim(pg2)[1], pathPvalue) Pvalues = p.adjust(Pvalues, "fdr") #if(min(Pvalues) > Pval_pathway ) return( list(pg3 = NULL, best = best ) ) else { if(sort(Pvalues)[2] > Pval_pathway ) return( list(pg3 = NULL, best = best ) ) else { NsigT = rowSums(pg$p.results2) ,] pg2 = result[,-2] # when there is only 1 left in the matrix pg2 becomes a vector if(sum( Pvalues top ) { pg3 = pg2[1:top,]; } else { pg3 = pg2; } } rownames(pg3) = sapply(rownames(pg3) , extract1) a=sprintf("%-3.2e",pg3[,1]) rownames(pg3) = paste(a,rownames(pg3),sep=" ") pg3 =pg3[,-1] pg3 <- pg3[order( -apply(pg3,1,sd) ),] # sort by SD return( list(pg3 = pg3, best = best ) ) } } PGSEAplot <- function(){ library(PGSEA,verbose=FALSE) if(input_selectGO == "ID not recognized!" ) return( NULL) myrange = c(input_minSetSize, input_maxSetSize) genes = convertedData.out if (is.null(input_selectContrast1 ) ) return(NULL) gmt = GeneSets.out # find related samples iz = findContrastSamples(input_selectContrast1, colnames(convertedData.out),readSampleInfo.out, input_selectFactorsModel,input_selectModelComprions, factorReferenceLevels.out,input_CountsDEGMethod, input_dataFileFormat ) genes = genes[,iz] subtype = detectGroups(colnames(genes )) if(length( GeneSets.out ) == 0) { plot.new(); text(0.5,0.5, "No gene sets!")} else { result = PGSEApathway(converted.out,genes, input_selectOrg,input_selectGO, GeneSets.out, myrange, input_pathwayPvalCutoff, input_nPathwayShow ) if( is.null(result$pg3) ) { plot.new(); text(0.5,1, "No significant pathway found!")} else smcPlot(result$pg3,factor(subtype),scale = c(-max(result$pg3), max(result$pg3)), show.grid = T, margins = c(3,1, 13,28), col = .rwb,cex.lab=0.5, main="Pathway Analysis:PGSEA") } } convertEnsembl2Entrez <- function (query,Species) { querySet <- cleanGeneSet( unlist( strsplit( toupper(names( query)),'\t| |\n|\\,' ) ) ) speciesID <- orgInfo$id[ which(orgInfo$ensembl_dataset == Species)] # note uses species Identifying # idType 6 for entrez gene ID result <- dbGetQuery( convert, paste( " select id,ens,species from mapping where ens IN ('", paste(querySet,collapse="', '"), "') AND idType ='",idType_Entrez,"'",sep="") ) # slow if( dim(result)[1] == 0 ) return(NULL) result <- subset(result, species==speciesID, select = -species) ix = match(result$ens,names(query) ) tem <- query[ix]; names(tem) = result$id return(tem) } ReactomePAPathwayData <- function( ){ library(ReactomePA,verbose=FALSE) # pathway analysis ensemblSpecies <- c("hsapiens_gene_ensembl","rnorvegicus_gene_ensembl", "mmusculus_gene_ensembl", "celegans_gene_ensembl","scerevisiae_gene_ensembl", "drerio_gene_ensembl", "dmelanogaster_gene_ensembl") ReactomePASpecies= c("human", "rat", "mouse", "celegans", "yeast", "zebrafish", "fly" ) # cbind(ensemblSpecies,ReactomePASpecies) # double check mapping myrange = c(input_minSetSize, input_maxSetSize) noSig = as.data.frame("No significant pathway found.") if( length(limma.out$topGenes) == 0 ) return(noSig) if(length( limma.out$comparisons) ==1 ) { top1=limma.out$topGenes[[1]] } else { top = limma.out$topGenes ix = match(input_selectContrast1, names(top)) if( is.na(ix)) return (noSig) top1 <- top[[ix]]; } if(dim(top1)[1] == 0 ) return (noSig) colnames(top1)= c("Fold","FDR") # remove some genes top1 = top1[which(top1$FDR input_nPathwayShow ) top1 <- top1[1:input_nPathwayShow,,drop=FALSE] #top1 <- cbind(row.names(top1), top1); colnames(top1)[1] <-input_selectContrast1 top1 <- as.data.frame(top1) top1 <- cbind(rep( input_selectContrast1, dim(top1)[1]), top1); top1[,4] = as.character( round(as.numeric(top1[,4]),4)); top1$adj.Pval <- sprintf("%-2.1e",as.numeric(top1$adj.Pval) ) top1[,1] <- as.character(top1[,1]) colnames(top1)[1] <- "Direction" if(input_pathwayMethod == 1 ) p.m <- "GAGE" else if(input_pathwayMethod == 2 ) p.m <- "PGSEA" else if(input_pathwayMethod == 3 ) p.m <- "GSEA" else if(input_pathwayMethod == 4 ) p.m <- "PGSEA_All" else if(input_pathwayMethod == 5 ) p.m <- "ReactomePA" colnames(top1)[2] <- paste(p.m," analysis:", gsub("-"," vs ",input_selectContrast1 ) ) top1[ which( as.numeric( top1[,3]) >0),1 ] <- "Up" #gsub("-"," > ",input_selectContrast1 ) top1[ which( as.numeric( top1[,3]) <0),1 ] <- "Down" #gsub("-"," < ",input_selectContrast1 ) top1 <- top1[order( top1[,1], -abs(as.numeric( top1[,3]) ) ) ,] top1[ duplicated (top1[,1] ),1 ] <- "" top1[,3] = as.character( round(as.numeric(top1[,3]),4)); return( top1) } selectedPathwayData <- function( ){ if(input_sigPathways == "All") return (NULL) ix <- which(names(GeneSets.out ) == input_sigPathways ) # find the gene set if(length(ix) == 0 ) return(NULL) genes <- GeneSets.out[[ix]] # retrieve genes # find related samples iz = findContrastSamples(input_selectContrast1, colnames(convertedData.out),readSampleInfo.out, input_selectFactorsModel,input_selectModelComprions, factorReferenceLevels.out,input_CountsDEGMethod , input_dataFileFormat ) x <- convertedData.out[which(rownames(convertedData.out) %in% genes), iz ] if( input_selectOrg != "NEW") { ix = match( rownames(x), allGeneInfo.out[,1]) if( sum( is.na(allGeneInfo.out$symbol )) != dim(allGeneInfo.out )[1] ) # symbol really exists? rownames(x) <- paste(rownames(x),":", as.character( allGeneInfo.out$symbol)[ix]) } return( x ) } selectedPathwayHeatmap <- function() { x = selectedPathwayData.out if(dim(x)[1]<=2 | dim(x)[2]<=2 ) return(NULL) groups = detectGroups(colnames(x)) # this will cutoff very large values, which could skew the color x=as.matrix(x)-apply(as.matrix(x),1,mean) cutoff = median(unlist(x)) + 3*sd (unlist(x)) x[x>cutoff] <- cutoff cutoff = median(unlist(x)) - 3*sd (unlist(x)) x[x< cutoff] <- cutoff # sometimes, a gene can be all zero in selected samples. x <- x[which(apply(x,1,sd)>0) ,] lmat = rbind(c(5,4),c(0,1),c(3,2)) lwid = c(1.5,6) lhei = c(.5,.2,8) if( dim(x)[1]>200) heatmap.2(x, distfun = dist2,hclustfun=hclust2, col=heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.5 ,key=T, symkey=F ,dendrogram = "row" ,ColSideColors=mycolors[ groups] ,labRow="" ,margins=c(10,15) ,srtCol=45 ,lmat = lmat, lwid = lwid, lhei = lhei #,main ="Title" ) if( dim(x)[1]<=200) heatmap.2(x, distfun = dist2,hclustfun=hclust2, col=heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.5 ,key=T, symkey=F, dendrogram = "row", ,labRow=gsub(".*:","",rownames(x)) ,ColSideColors=mycolors[ groups] ,margins=c(10,15) ,cexRow=1.2 ,srtCol=45 ,lmat = lmat, lwid = lwid, lhei = lhei #,main ="Title" ) } convertEnsembl2KEGG <- function (query,Species) { # not working querySet <- cleanGeneSet( unlist( strsplit( toupper(names( query)),'\t| |\n|\\,' ) ) ) speciesID <- orgInfo$id[ which(orgInfo$ensembl_dataset == Species)] # note uses species Identifying # idType 6 for entrez gene ID result <- dbGetQuery( convert, paste( " select id,ens,species from mapping where ens IN ('", paste(querySet,collapse="', '"), "') AND idType ='",idType_KEGG,"'",sep="") ) # slow if( dim(result)[1] == 0 ) return(NULL) result <- subset(result, species==speciesID, select = -species) ix = match(result$ens,names(query) ) tem <- query[ix]; names(tem) = result$id return(tem) } keggPathwayID <- function (pathwayDescription, Species, GO,selectOrg) { ix = grep(Species,gmtFiles) if (length(ix) == 0 ) {return(NULL)} # If selected species is not the default "bestMatch", use that species directly if(selectOrg != speciesChoice[[1]]) { ix = grep(findSpeciesById(selectOrg)[1,1], gmtFiles ) if (length(ix) == 0 ) {return(NULL )} totalGenes <- orgInfo[which(orgInfo$id == as.numeric(selectOrg)),7] } pathway <- dbConnect(sqlite,gmtFiles[ix],flags=SQLITE_RO) # change Parkinson's disease to Parkinson\'s disease otherwise SQL pathwayDescription <- gsub("\'","\'\'",pathwayDescription) pathwayInfo <- dbGetQuery( pathway, paste( " select * from pathwayInfo where description = '", pathwayDescription, "' AND name LIKE '",GO,"%'",sep="") ) pathwayInfo <- dbGetQuery( pathway, paste( " select * from pathwayInfo where description = '", pathwayDescription, "'",sep="") ) dbDisconnect(pathway); if(dim(pathwayInfo)[1] != 1 ) {return(NULL) } tem = gsub(".*:","",pathwayInfo[1,2]) return( gsub("_.*","",tem) ) } KeggImage <- function(){ # First generate a blank image. Otherwise return(NULL) gives us errors. outfile <- tempfile(fileext='.png') png(outfile, width=400, height=300) frame() dev.off() blank <- list(src = outfile, contentType = 'image/png', width = 400, height = 300, alt = " ") if(is.null( input_selectGO ) ) return(outfile) if(input_selectGO != "KEGG") return(outfile) if(is.null(gagePathwayData.out ) ) return(outfile) if(is.null( input_sigPathways)) return (outfile) # if( is.null(selectedPathwayData()) ) return(blank) library(pathview) if (is.null(input_selectContrast1 ) ) return(outfile) if(input_sigPathways == "All") return (outfile) if( length(limma.out$topGenes) == 0 ) return(outfile) # get fold change if(length( limma.out$comparisons) ==1 ) { top1=limma.out$topGenes[[1]] } else { top = limma.out$topGenes ix = match(input_selectContrast1, names(top)) if( is.na(ix)) return (outfile) top1 <- top[[ix]]; } if(dim(top1)[1] == 0 ) return (outfile) # cat("here5") colnames(top1)= c("Fold","FDR") Species <- converted.out$species fold = top1[,1]; names(fold) <- rownames(top1) fold <- convertEnsembl2Entrez(fold,Species) keggSpecies <- as.character( keggSpeciesID[which(keggSpeciesID[,1] == Species),3] ) if(nchar( keggSpecies) <=2 ) return(outfile) # not in KEGG # kegg pathway id pathID = keggPathwayID(input_sigPathways, Species, "KEGG",input_selectOrg) #cat("\nhere5 ",keggSpecies, " ",Species," ",input_sigPathways, "pathID:",pathID,"End", fold[1:5],names(fold)[1:5],"\n") #cat("\npathway:",is.na(input_sigPathways)) #cat("\n",fold[1:5],"\n",keggSpecies,"\n",pathID) if(is.null(pathID) ) return(blank) # kegg pathway id not found. if(nchar(pathID)<3 ) return(blank) randomString <- gsub(".*file","",tempfile()) outfile <- paste(pathID,".",randomString,".png",sep="") try( pv.out <- pathview(gene.data = fold, pathway.id = pathID, out.suffix = randomString, species = keggSpecies, kegg.native=TRUE) ) return( outfile ) } # list of pathways with details pathwayListData <- function(){ pathways = NULL if( input_pathwayMethod == 1) if(!is.null(gagePathwayData.out)) if(dim(gagePathwayData.out)[2] >1) { pathways <- gagePathwayData.out colnames(pathways)[2] ="Pathways"; colnames(pathways)[4] ="nGenes"; } if( input_pathwayMethod == 3) if(!is.null(fgseaPathwayData.out)) if(dim(fgseaPathwayData.out)[2] >1) { pathways <- fgseaPathwayData.out colnames(pathways)[2] ="Pathways"; colnames(pathways)[4] ="nGenes"; } if( input_pathwayMethod == 2) if(!is.null(PGSEAplot.data.out)) if(dim(PGSEAplot.data.out)[2] >1) { pathways <- as.data.frame( PGSEAplot.data.out) pathways$Pathways = substr(rownames(pathways),10, nchar( rownames(pathways)) ) pathways$adj.Pval = gsub(" .*","", rownames(pathways)) pathways$Direction ="Diff" } if( input_pathwayMethod == 4) if(!is.null(PGSEAplotAllSamples.data.out)) if(dim(PGSEAplotAllSamples.data.out)[2] >1) { pathways <- as.data.frame( PGSEAplotAllSamples.data.out) pathways$Pathways = substr(rownames(pathways),10, nchar( rownames(pathways)) ) pathways$adj.Pval = gsub(" .*","", rownames(pathways)) pathways$Direction ="Diff" } if( is.null( pathways) ) return(NULL) # if no gene set data, return pathway list if(is.null(GeneSets.out ) ) return(pathways) pathways$adj.Pval = as.numeric(pathways$adj.Pval) if(nrow(pathways)>1) for( i in 2:nrow(pathways) ) if(nchar(pathways$Direction[i]) <=1) pathways$Direction[i] = pathways$Direction[i-1] # gene symbol matching symbols probeToGene = NULL if( input_selectGO != "ID not recognized!" & input_selectOrg != "NEW") if(sum(is.na( allGeneInfo.out$symbol ) )/ dim( allGeneInfo.out )[1] <.5 ) { # if more than 50% genes has symbol probeToGene = allGeneInfo.out[,c("ensembl_gene_id","symbol")] probeToGene$symbol = gsub(" ","",probeToGene$symbol) ix = which( is.na(probeToGene$symbol) | nchar(probeToGene$symbol)<2 | toupper(probeToGene$symbol)=="NA" | toupper(probeToGene$symbol)=="0" ) probeToGene[ix,2] = probeToGene[ix,1] # use gene ID } pathways$Genes ="" # looking up genes for each pathway for(i in 1:nrow(pathways) ){ ix <- which(names(GeneSets.out ) == pathways$Pathways[i] ) # find the gene set if(length(ix) != 0 ) { genes <- GeneSets.out[[ix]] # retrieve genes if(!is.null(probeToGene) ) { iy = match(genes,probeToGene[,1]) genes = probeToGene[iy,2] } pathways$Genes[i] = paste(genes, collapse=" ") } } return(pathways) } ################################################################ # Genome-wide view ################################################################ genomePlotly <- function( ){ # default plot fake = data.frame(a=1:3,b=1:3) p <- ggplot(fake, aes(x = a, y = b)) + geom_blank() + ggtitle("No genes with position info.") + theme(axis.title.x=element_blank(),axis.title.y=element_blank()) if(length( limma.out$comparisons) ==1 ) { top1=limma.out$topGenes[[1]] } else { top = limma.out$topGenes ix = match(input_selectContrast2, names(top)) if( is.na(ix)) return (ggplotly(p)) top1 <- top[[ix]]; } if(dim(top1)[1] == 0 ) return (ggplotly(p)) colnames(top1)= c("Fold","FDR") # write.csv(merge(top1,allGeneInfo(), by.x="row.names",by.y="ensembl_gene_id" ),"tem.csv" ) x <- merge(top1,allGeneInfo.out, by.x="row.names",by.y="ensembl_gene_id" ) # if no chromosomes found. For example if user do not convert gene IDs. if( dim(x)[1] >5 ) { x <- x[order(x$chromosome_name,x$start_position),] tem = sort( table( x$chromosome_name), decreasing=T) chromosomes <- names( tem[tem >= 5 ] ) # chromosomes with less than 100 genes are excluded if(length(chromosomes) > 50) chromosomes <- chromosomes[1:50] # at most 50 chromosomes chromosomes <- chromosomes[ nchar(chromosomes)<=12] # chr. name less than 10 characters chromosomes = chromosomes[order(as.numeric(chromosomes) ) ] # chromosomes = chromosomes[!is.na(as.numeric(chromosomes) ) ] chromosomesNumbers = as.numeric(chromosomes) # convert chr.x to numbers j = max( chromosomesNumbers,na.rm=T) for( i in 1:length( chromosomes)) { if ( is.na(chromosomesNumbers[i]) ) { chromosomesNumbers[i] <- j+1; j <- j+1; } } x <- x[which(x$chromosome_name %in% chromosomes ),] x <- droplevels(x) # find the number coding for chromosome getChrNumber <- function (chrName){ return( chromosomesNumbers[ which( chromosomes == chrName)] ) } x$chrNum = 1 # numeric coding x$chrNum <- unlist( lapply( x$chromosome_name, getChrNumber) ) x$Row.names <- as.character( x$Row.names) # if symbol is missing use Ensembl id x$symbol = as.character(x$symbol) ix = which(is.na(x$symbol)) ix2 = which(nchar(as.character(x$symbol))<= 2 ) ix3 = which( duplicated(x$symbol)) ix = unique( c(ix,ix2,ix3)) x$symbol[ix] <- x$Row.names[ix] ################################## # plotting # x = read.csv("tem_genome.csv") x = x[!is.na(x$chromosome_name),] x = x[!is.na(x$start_position),] # only keep significant genes #x = x[which(x$FDR log2( input_limmaFC)),] ix = which( (x$FDR< as.numeric(input_limmaPvalViz)) & (abs(x$Fold) > as.numeric(input_limmaFCViz) ) ) if (length(ix) > 5) { x = x[ix,] x$start_position = x$start_position/1000000 # Mbp chrD = 20 # distance between chrs. foldCutoff = 3 # max log2 fold x$Fold = x$Fold / sd(x$Fold) # standardize fold change x$Fold[which(x$Fold > foldCutoff )] = foldCutoff # log2fold within -5 to 5 x$Fold[which(x$Fold < -1*foldCutoff )] = -1*foldCutoff x$Fold = 4* x$Fold x$y = x$chrNum*chrD + x$Fold chrLengthTable = aggregate(start_position~chrNum, data=x,max ) chrTotal = dim(chrLengthTable)[1] x$R = as.factor(sign(x$Fold)) colnames(x)[ which(colnames(x) == "start_position")] = "x" p= ggplot(x, aes(x = x, y = y, colour = R, text = symbol ) ) + geom_point(shape = 20, size = .2) #label y with chr names p <- p + scale_y_continuous(labels = paste("chr",chromosomes[chrLengthTable$chrNum],sep=""), breaks = chrD* (1:chrTotal), limits = c(0, chrD*chrTotal + 5) ) # draw horizontal lines for each chr. for( i in 1:dim(chrLengthTable)[1] ) p = p+ annotate( "segment",x = 0, xend = chrLengthTable$start_position[i], y = chrLengthTable$chrNum[i]*chrD, yend = chrLengthTable$chrNum[i]*chrD) # change legend http://ggplot2.tidyverse.org/reference/scale_manual.html p=p+scale_colour_manual(name="", # customize legend text values=c("blue", "red"), breaks=c("1","-1"), labels=c("Up", "Dn")) p = p + xlab("Position on chrs. (Mbp)") + theme(axis.title.y=element_blank()) p= p + theme(legend.position="none") # p <- ggplot(mtcars, aes(x=hp, y=mpg)) + geom_point(shape=20, size=17) # p=p+ geom_smooth(method = "lm",se=FALSE) #p+ geom_line(aes(y=rollmean(y,7, na.pad=TRUE))) # Customize hover text https://cran.r-project.org/web/packages/plotly/plotly.pdf # style(ggplotly(p),hoverinfo="text") # not working } # if no genes else } ggplotly(p) } # results from PREDA genomePlotData <- function( ){ if (is.null(input_selectContrast2 ) ) return(NULL) if( length(limma.out$topGenes) == 0 ) return(NULL) if(length( limma.out$comparisons) ==1 ) { top1=limma.out$topGenes[[1]] } else { top = limma.out$topGenes ix = match(input_selectContrast2, names(top)) if( is.na(ix)) return (NULL) top1 <- top[[ix]]; } if(dim(top1)[1] == 0 ) return (NULL) colnames(top1)= c("Fold","FDR") # write.csv(merge(top1,allGeneInfo(), by.x="row.names",by.y="ensembl_gene_id" ),"tem.csv" ) x <- merge(top1,allGeneInfo.out, by.x="row.names",by.y="ensembl_gene_id" ) x <- x[order(x$chromosome_name,x$start_position),] tem = sort( table( x$chromosome_name), decreasing=T) chromosomes <- names( tem[tem > 100 ] ) # chromosomes with less than 100 genes are excluded if(length(chromosomes) > 50) chromosomes <- chromosomes[1:50] # at most 50 chromosomes chromosomes = chromosomes[order(as.numeric(chromosomes) ) ] # chromosomes = chromosomes[!is.na(as.numeric(chromosomes) ) ] chromosomesNumbers = as.numeric(chromosomes) # convert chr.x to numbers j = max( chromosomesNumbers,na.rm=T) for( i in 1:length( chromosomes)) { if ( is.na(chromosomesNumbers[i]) ) { chromosomesNumbers[i] <- j+1; j <- j+1; } } x <- x[which(x$chromosome_name %in% chromosomes ),] x <- droplevels(x) # find the number coding for chromosome getChrNumber <- function (chrName){ return( chromosomesNumbers[ which( chromosomes == chrName)] ) } x$chrNum = 1 # numeric coding x$chrNum <- unlist( lapply( x$chromosome_name, getChrNumber) ) x$Row.names <- as.character( x$Row.names) fold = x$Fold; names(fold) = x$Row.names # write.csv(x,"tem.csv") x <- x[!duplicated(x$Row.names),] #rownames(x) = x$Row.names GEanalysisResults <- genomePlotDataPre.out; genomic_regions_UP<-PREDAResults2GenomicRegions(GEanalysisResults, qval.threshold=input_RegionsPvalCutoff, smoothStatistic.tail="upper", smoothStatistic.threshold= input_StatisticCutoff) genomic_regions_DOWN<-PREDAResults2GenomicRegions(GEanalysisResults, qval.threshold=input_RegionsPvalCutoff, smoothStatistic.tail="lower", smoothStatistic.threshold= -1 *input_StatisticCutoff) if( is.null(genomic_regions_UP$Test) && is.null(genomic_regions_UP$Test) ) return(-1) # no significant regions regions <- 0 if( !is.null(genomic_regions_UP$Test)) { dataframe_UPregions<-GenomicRegions2dataframe(genomic_regions_UP[[1]]) dataframe_UPregions$Regulation <- "Up" regions = dataframe_UPregions } if( !is.null(genomic_regions_DOWN$Test) ){ dataframe_DOWNregions<-GenomicRegions2dataframe(genomic_regions_DOWN[[1]]) dataframe_DOWNregions$Regulation <- "Down" if ( class(regions) != "data.frame" ) regions <- dataframe_DOWNregions else # if UP regions is NULL regions = rbind(regions,dataframe_DOWNregions ) } if( class(regions) != "data.frame") return(-1) Regions <- regions[,c(4,1:3)] Regions <- Regions[which(Regions$end != Regions$start),] Regions$size = round((Regions$end - Regions$start )/1000000,3) Regions$chr <- chromosomes[ Regions$chr ] # convert from chr. number to names # find the gene indices in the up or down regulated regions regulatedGenes <- function (i) { ix = which( Regions$chr == x$chromosome_name[i] & Regions$start < x$start_position[i] #&(Regions$end > x$start_position[i] + x$genomeSpan[i]) &(Regions$end > x$start_position[i] )) # if the start position is within the region if( length(ix) == 0 | length(ix) >1 ) return(NA) else return( ix ) } regionID = unlist( lapply(1:dim(x)[1], regulatedGenes) ) x1 = x[which(!is.na(regionID)),] regionID = regionID[!is.na(regionID)] x1 = cbind(regionID,Regions[regionID, ], x1[,c('symbol', 'Row.names', 'Fold','FDR', 'band','start_position')]) x1 = x1[order(x1$regionID,x1$start_position ), ] colnames(x1)[8] = "Ensembl"; colnames(x1)[4] = "Region.Start";colnames(x1)[5] = "Region.End"; colnames(x1)[12] = "Gene.Start" # number of genes tem = table(x1$regionID) Regions$Ngenes = 0 Regions$Ngenes[as.integer(names(tem) ) ] <- tem # cytoband per region tem = unique(x1[,c('regionID','band')]) tem$band = gsub("\\..*","",tem$band) tem = unique(tem) Regions$band ="" Regions$ID <- 1:dim(Regions)[1] for ( i in 1:dim(Regions)[1] ) Regions$band[i] <- paste( tem[ which( tem[,1] == Regions$ID[i]),2], collapse=";" ) # Genes Regions$Genes <- "" for ( i in 1:dim(Regions)[1] ) Regions$Genes[i] <- paste( x1$symbol[ which( x1[,1] == Regions$ID[i])], collapse=" " ) return( list(mainResult = GEanalysisResults, Regions = Regions, Genes = x1,legend.x=max(x$start_position)*.6, legend.y=max(chromosomesNumbers)-3 ) ) } genomePlot <- function(){ library(PREDA,verbose=FALSE) # showing expression on genome library(PREDAsampledata,verbose=FALSE) library(hgu133plus2.db,verbose=FALSE) if( class(genomePlotData.out) != "list" ) { plot.new(); text(0.2,1, "No significant regions found!")} else { GEanalysisResults <- genomePlotData.out$mainResult; if( is.null( GEanalysisResults ) ) return(NULL) genomic_regions_UP<-PREDAResults2GenomicRegions(GEanalysisResults, qval.threshold=input_RegionsPvalCutoff, smoothStatistic.tail="upper", smoothStatistic.threshold= -1 *input_StatisticCutoff) genomic_regions_DOWN<-PREDAResults2GenomicRegions(GEanalysisResults, qval.threshold=input_RegionsPvalCutoff, smoothStatistic.tail="lower", smoothStatistic.threshold= -1 *input_StatisticCutoff) checkplot<-genomePlot(GEanalysisResults, genomicRegions=c(genomic_regions_UP, genomic_regions_DOWN), grouping=c(1, 1), scale.positions="Mb", region.colors=c("red","blue")) legend(x=genomePlotData.out$legend.x, y=genomePlotData.out$legend.y , legend=c("UP", "DOWN"), fill=c("red","blue")) } #if else } # pre-calculating PREDA, so that changing FDR cutoffs does not trigger entire calculation genomePlotDataPre <- function( ){ library(PREDA,verbose=FALSE) # showing expression on genome library(PREDAsampledata,verbose=FALSE) library(hgu133plus2.db,verbose=FALSE) if (is.null(input_selectContrast2 ) ) return(NULL) if( length(limma.out$topGenes) == 0 ) return(NULL) if(length( limma.out$comparisons) ==1 ) { top1=limma.out$topGenes[[1]] } else { top = limma.out$topGenes ix = match(input_selectContrast2, names(top)) if( is.na(ix)) return (NULL) top1 <- top[[ix]]; } if(dim(top1)[1] == 0 ) return (NULL) colnames(top1)= c("Fold","FDR") # write.csv(merge(top1,allGeneInfo(), by.x="row.names",by.y="ensembl_gene_id" ),"tem.csv" ) x <- merge(top1,allGeneInfo.out, by.x="row.names",by.y="ensembl_gene_id" ) #hist(top1[,1]) ########## PREDA infofile <- system.file("sampledata", "GeneExpression", "sampleinfoGE_PREDA.txt", package = "PREDAsampledata") sampleinfo<-read.table(infofile, sep="\t", header=TRUE) head(sampleinfo) data(ExpressionSetRCC) GEstatisticsForPREDA<-statisticsForPREDAfromEset(ExpressionSetRCC, statisticType="tstatistic", referenceGroupLabel="normal", classVector=sampleinfo[,"Class"]) analysesNames(GEstatisticsForPREDA) GEGenomicAnnotations<-eset2GenomicAnnotations(ExpressionSetRCC, retain.chrs=1:22) # needs hgu133plus2.db package GEGenomicAnnotationsForPREDA<-GenomicAnnotations2GenomicAnnotationsForPREDA(GEGenomicAnnotations, reference_position_type="median") GEDataForPREDA<-MergeStatisticAnnotations2DataForPREDA(GEstatisticsForPREDA, GEGenomicAnnotationsForPREDA, sortAndCleanNA=TRUE) #x = read.csv("PREDA_test.csv") #x <- x[,-1] # remove the row index, not needed in Shiny x <- x[order(x$chromosome_name,x$start_position),] tem = sort( table( x$chromosome_name), decreasing=T) chromosomes <- names( tem[tem > 100 ] ) # chromosomes with less than 100 genes are excluded if(length(chromosomes) > 50) chromosomes <- chromosomes[1:50] # at most 50 chromosomes chromosomes = chromosomes[order(as.numeric(chromosomes) ) ] # chromosomes = chromosomes[!is.na(as.numeric(chromosomes) ) ] chromosomesNumbers = as.numeric(chromosomes) # convert chr.x to numbers j = max( chromosomesNumbers,na.rm=T) for( i in 1:length( chromosomes)) { if ( is.na(chromosomesNumbers[i]) ) { chromosomesNumbers[i] <- j+1; j <- j+1; } } x <- x[which(x$chromosome_name %in% chromosomes ),] x <- droplevels(x) # find the number coding for chromosome getChrNumber <- function (chrName){ return( chromosomesNumbers[ which( chromosomes == chrName)] ) } x$chrNum = 1 # numeric coding x$chrNum <- unlist( lapply( x$chromosome_name, getChrNumber) ) x$Row.names <- as.character( x$Row.names) fold = x$Fold; names(fold) = x$Row.names # write.csv(x,"tem.csv") x <- x[!duplicated(x$Row.names),] #rownames(x) = x$Row.names myData <- GEDataForPREDA myData@position <- as.integer( x$start_position+ x$genomeSpan/2 ) myData@ids <- as.character( x$Row.names) myData@chr <- as.integer( x$chrNum ) myData@start <- x$start_position myData@end <- x$start_position + x$genomeSpan myData@strand <- rep(1,dim(x)[1]) myData@chromosomesLabels <- chromosomes myData@chromosomesNumbers <- as.integer( chromosomesNumbers ) myData@statistic <- as.matrix( x[,2,drop=FALSE] ) myData@analysesNames <- "Test" myData@testedTail <- "both" myData@optionalAnnotations <- as.matrix( x[,1:2,drop=FALSE] ) myData@optionalAnnotationsHeaders <- c("a","b") set.seed(2) GEanalysisResults<-PREDA_main(myData,nperms = PREDA_Permutations) return( GEanalysisResults ) } ################################################################ # Biclustering ################################################################ biclustering <- function( ){ library(biclust,verbose=FALSE) if(input_biclustMethod == "BCQU()" ) library(QUBIC,verbose=FALSE) # have trouble installing on Linux if(input_biclustMethod == "BCUnibic()" ) library(runibic,verbose=FALSE) # x <- convertedData.out n=input_nGenesBiclust if(n>dim(x)[1]) n = dim(x)[1] # max as data if(n<10) n = 10 if(n> 2000 ) n = 2000 x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean) if( input_biclustMethod == "BCXmotifs()" ) x<-discretize(x) #res <- biclust::biclust(as.matrix( x), method = BCQU()) runR = paste( "res <- biclust::biclust(as.matrix( x), method =", input_biclustMethod ,")" ) eval(parse(text = runR ) ) return(list( x=x, res=res) ) } biclustHeatmap <- function ( ){ res = biclustering()$res if( res@Number == 0 ) { plot.new(); text(0.5,0.5, "No cluster found!")} else { x = biclust::bicluster(biclustering.out$x, res, as.numeric( input_selectBicluster) )[[1]] par(mar = c(5, 4, 1.4, 0.2)) if( dim(x)[1] <=30 ) heatmap.2(x, Rowv =T,Colv=F, dendrogram ="none", col=heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.2 ,key=F, #labRow = T, ,margins = c(8, 24) ,cexRow=1 #,srtCol=45 ,cexCol=1. # size of font for sample names ) else heatmap.2(x, Rowv =T,Colv=F, dendrogram ="none", col=heatColors[as.integer(input_heatColors1),], density.info="none", trace="none", scale="none", keysize=.2 ,key=F, labRow = F, ,margins = c(8, 4) ,cexRow=1 #,srtCol=45 ,cexCol=1. # size of font for sample names ) } } geneListBclustGO <- function( ){ res = biclustering.out$res if( res@Number == 0 ) return(as.data.frame("No clusters found!") ) x = biclust::bicluster(biclustering.out$x, res, as.numeric( input_selectBicluster) )[[1]] if( dim(x)[1] <= minGenesEnrichment) return(NoSig) query = rownames(x) result <- findOverlapGMT( query, GeneSets.out,1) result$Genes = "Up regulated" results1 = result if ( is.null( results1) ) return (NoSig) if( dim(results1)[2] <5 ) return(NoSig) # Returns a data frame: "No significant results found!" results1= results1[,c(5,1,2,4)] colnames(results1)= c("List","FDR","Genes","GO terms or pathways") minFDR = 0.01 if(min(results1$FDR) > minFDR ) results1 = as.data.frame("No signficant enrichment found.") else results1 = results1[which(results1$FDR < minFDR),] if(dim(results1)[2] != 4) return(NoSig) colnames(results1)= c("Direction","adj.Pval","Genes","Pathways") results1$adj.Pval <- sprintf("%-2.1e",as.numeric(results1$adj.Pval) ) results1[,1] <- as.character(results1[,1]) tem <- results1[,1] results1[ duplicated (results1[,1] ),1 ] <- "" results1[,-1] } ################################################################ # Co-expression network ################################################################ wgcna <- function ( ){ #http://pklab.med.harvard.edu/scw2014/WGCNA.html library(WGCNA) x <- convertedData.out n=input_nGenesNetwork if(n>dim(x)[1]) n = dim(x)[1] # max as data if(n<50) return(NULL) if(dim(x)[2] <4) return(NULL) if(n> maxGeneWGCNA ) n = maxGeneWGCNA #x=as.matrix(x[1:n,])-apply(x[1:n,],1,mean) datExpr=t(x[1:n,]) subGeneNames=colnames(datExpr) #Choosing a soft-threshold to fit a scale-free topology to the network powers = c(c(1:10), seq(from = 12, to=20, by=2)); sft=pickSoftThreshold(datExpr,dataIsExpr = TRUE,powerVector = powers,corFnc = cor,corOptions = list(use = 'p'),networkType = "unsigned") softPower = input_mySoftPower; #calclute the adjacency matrix adj= adjacency(datExpr,type = "unsigned", power = softPower); #turn adjacency matrix into topological overlap to minimize the effects of noise and spurious associations TOM=TOMsimilarityFromExpr(datExpr,networkType = "unsigned", TOMType = "unsigned", power = softPower); colnames(TOM) =subGeneNames rownames(TOM) =subGeneNames ########################################## # module detection library(flashClust,verbose=FALSE) geneTree = flashClust(as.dist(1-TOM),method="average"); # Set the minimum module size minModuleSize = input_minModuleSize; # Module identification using dynamic tree cut dynamicMods = cutreeDynamic(dendro = geneTree, method="tree", minClusterSize = minModuleSize); #dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM, method="hybrid", deepSplit = 2, pamRespectsDendro = FALSE, minClusterSize = minModuleSize); # table(dynamicMods) dynamicColors = labels2colors(dynamicMods) #table(dynamicColors) #discard the unassigned genes, and focus on the rest # this causes problems #restGenes= (dynamicColors != "grey") #diss1=1-TOMsimilarityFromExpr(datExpr[,restGenes], power = softPower) #colnames(diss1) =rownames(diss1) =subGeneNames[restGenes] #colorCode = as.character(dynamicColors[restGenes]) moduleInfo = cbind( subGeneNames, dynamicColors, dynamicMods) moduleInfo = moduleInfo[which(moduleInfo[,2] != "grey") ,] # remove genes not in any modules moduleInfo = moduleInfo[order(moduleInfo[,3]),] # sort n.modules = length(unique(dynamicColors) ) -1 ; nGenes = dim(moduleInfo)[1] return(list(x = t(datExpr),powers=powers,sft=sft, TOM = TOM, dynamicColors = dynamicColors, moduleInfo = moduleInfo,n.modules=n.modules, nGenes =nGenes) ) } softPower <- function(){ #################################### sft = wgcna.out$sft; powers=wgcna.out$powers; # Plot the results #sizeGrWindow(9, 5) par(mfrow = c(1,2)); cex1 = 0.9; # Scale-free topology fit index as a function of the soft-thresholding power plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit, signed R^2",type="n", main = paste("Scale independence")); text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red"); # Red line corresponds to using an R^2 cut-off abline(h=0.80,col="red") # Mean connectivity as a function of the soft-thresholding power plot(sft$fitIndices[,1], sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity")) text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red") } modulePlot <- function(){ diss1 = 1-wgcna.out$TOM; dynamicColors = wgcna.out$dynamicColors hier1=flashClust(as.dist(diss1), method="average" ) #set the diagonal of the dissimilarity to NA diag(diss1) = NA; plotDendroAndColors(hier1, dynamicColors, "Dynamic Tree Cut", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05, main = "Gene dendrogram and module colors") #Visualize the Tom plot. Raise the dissimilarity matrix to the power of 4 to bring out the module structure #TOMplot(diss1, hier1, colorCode) } listWGCNA.Modules <- function(){ if (is.null(wgcna.out ) ){ # if sample info is uploaded and correctly parsed. return(NULL) } else { if( dim(wgcna.out$moduleInfo)[1] == 0 ) { # if no module return(NULL) } else { modules = unique(wgcna.out$moduleInfo[, c("dynamicMods","dynamicColors")] ) moduleList = apply(modules,1,paste,collapse=". ") moduleList = paste0( moduleList, " (", table(wgcna.out$moduleInfo[,"dynamicMods"] )," genes)" ) moduleList = c(moduleList,"Entire network") return(moduleList) } } } moduleNetwork <- function(){ outfile <- tempfile(fileext='.txt') #module = gsub(".* ","",input_selectWGCNA.Module) module = unlist(strsplit(input_selectWGCNA.Module," " ) )[2] moduleColors = wgcna.out$dynamicColors inModule = (moduleColors==module); if( input_selectWGCNA.Module == "Entire network") inModule = rep(TRUE, length(inModule)) datExpr = t(wgcna.out$x ) probes = colnames(datExpr) modProbes = probes[inModule]; modTOM = wgcna.out$TOM[inModule, inModule]; dimnames(modTOM) = list(modProbes, modProbes) nTop = input_topGenesNetwork; if( nTop > 1000) nTop = 1000; IMConn = softConnectivity(datExpr[, modProbes]); top = (rank(-IMConn) <= nTop) # adding symbols probeToGene = NULL if( input_selectGO5 != "ID not recognized!" & input_selectOrg != "NEW") if(sum(is.na( allGeneInfo.out$symbol ) )/ dim( allGeneInfo.out )[1] <.5 ) { # if more than 50% genes has symbol probeToGene = allGeneInfo.out[,c("ensembl_gene_id","symbol")] probeToGene$symbol = gsub(" ","",probeToGene$symbol) ix = which( is.na(probeToGene$symbol) | nchar(probeToGene$symbol)<2 | toupper(probeToGene$symbol)=="NA" | toupper(probeToGene$symbol)=="0" ) probeToGene[ix,2] = probeToGene[ix,1] # use gene ID } net <- modTOM[top,top] >input_edgeThreshold for( i in 1:dim(net)[1]) # remove self connection net[i,i] = FALSE if(!is.null(probeToGene) & !input_noIDConversion) { # if gene symbol exist ix = match( colnames(net), probeToGene[,1]) colnames(net) = probeToGene[ix,2] ix = match( rownames(net), probeToGene[,1]) rownames(net) = probeToGene[ix,2] } library(igraph,verbose=FALSE) #plot(graph_from_data_frame(d=data.frame(1:10,ncol=2) ,directed=F) ) # http://www.kateto.net/wp-content/uploads/2016/01/NetSciX_2016_Workshop.pdf plot( graph_from_adjacency_matrix( net, mod ="undirected" ), vertex.label.color="black", vertex.label.dist=3,vertex.size=7) } networkModuleGO <- function(){ #module = gsub(".* ","",input_selectWGCNA.Module) module = unlist(strsplit(input_selectWGCNA.Module," " ) )[2] moduleColors = wgcna.out$dynamicColors inModule = (moduleColors==module); if( input_selectWGCNA.Module == "Entire network") inModule = rep(TRUE, length(inModule)) probes = rownames(wgcna.out$x ) query = probes[inModule]; if( length(query) <= minGenesEnrichment) return(NoSig) result <- findOverlapGMT( query, GeneSets.out,1) result$Genes = "Up regulated" results1 = result if ( is.null( results1) ) return (NoSig) if( dim(results1)[2] <5 ) return(NoSig) # Returns a data frame: "No significant results found!" results1= results1[,c(5,1,2,4)] colnames(results1)= c("List","FDR","Genes","GO terms or pathways") minFDR = 0.01 if(min(results1$FDR) > minFDR ) results1 = as.data.frame("No signficant enrichment found.") else results1 = results1[which(results1$FDR < minFDR),] if(dim(results1)[2] != 4) return(NoSig) colnames(results1)= c("Direction","adj.Pval","Genes","Pathways") results1$adj.Pval <- sprintf("%-2.1e",as.numeric(results1$adj.Pval) ) results1[,1] <- as.character(results1[,1]) tem <- results1[,1] results1[ duplicated (results1[,1] ),1 ] <- "" results1[,-1] }