# R code for stand-alone iDEP analysis
# by Steven Xijin Ge, South Dakota State University, gexijin@gmail.com
# Generated by iDEP 0.71 hosted at http://ge-lab.org/idep/ Sat Mar 31 10:06:10 2018

##########################
# 1. Read data
##########################
# Needs to be changed
setwd("C:/Users/Xijin.Ge/Downloads")
source("iDEP_core_functions.R")

# Input files
# Expression file has to use Ensembl for gene ID. Otherwise, use custom
# pathway database with matching IDs.
# Expression matrix
inputFile <- "Downloaded_Converted_Data.csv"
sampleInfoFile <- NULL
# Gene symbols, location etc.
geneInfoFile <- "Human__hsapiens_gene_ensembl_GeneInfo.csv"
# Pathway database in SQL; can be GMT format
geneSetFile <- "Human__hsapiens_gene_ensembl.db"
STRING10_speciesFile <- "https://raw.githubusercontent.com/iDEP-SDSU/idep/master/shinyapps/idep/STRING10_species.csv"

# Parameters
# Missing values imputation method
input_missingValue <- "geneMedian"
# 1- read counts, 2 FPKM/RPKM or DNA microarray
input_dataFileFormat <- 1
# Min counts
input_minCounts <- 0.5
# Minimum number of samples
input_NminSamples <- 1
# Pseudo count for log CPM
input_countsLogStart <- 4
# Methods for data transformation of counts. 1-EdgeR's logCPM; 2-VST; 3-rlog
input_CountsTransform <- 3

# Read data files
readData.out <- readData(inputFile)
readSampleInfo.out <- NULL
input_selectOrg <- "NEW"
# Gene set category
input_selectGO <- "GOCC"
input_noIDConversion <- TRUE
allGeneInfo.out <- geneInfo(geneInfoFile)
converted.out <- NULL
convertedData.out <- convertedData()
nGenesFilter()
# Converted counts, just for compatibility
convertedCounts.out <- convertedCounts()
# Detecting bias in sequencing depth
readCountsBias()

##########################
# 2.
# Pre-Process
##########################
# Snapshot only the writable graphical parameters. A plain par() call also
# returns read-only ones (e.g. cin, cra), and passing those back to par()
# later raises "cannot be set" warnings.
parDefault <- par(no.readonly = TRUE)
par(mar = c(12, 4, 2, 2))

# Barplot of total read counts
x <- readData.out$rawCounts
groups <- as.factor(detectGroups(colnames(x)))
# One colour per detected group; a single colour when there is at most one
# group or more than 20 of them.
col1 <- if (nlevels(groups) <= 1 || nlevels(groups) > 20) {
  "green"
} else {
  rainbow(nlevels(groups))[groups]
}
barplot(
  colSums(x) / 1e6,
  col = col1,
  las = 3,
  main = "Total read counts (millions)"
)

# Box plot
x <- readData.out$data
boxplot(
  x,
  las = 2,
  col = col1,
  ylab = "Transformed expression levels",
  main = "Distribution of transformed data"
)

# Density plot
par(parDefault)  # restore the margins saved above
densityPlot()

# Scatter plot of the first two samples
plot(
  x[, 1:2],
  xlab = colnames(x)[1],
  ylab = colnames(x)[2],
  main = "Scatter plot of first two samples"
)

# Plot gene or gene family
input_selectOrg <- "BestMatch"
# Gene ID for searching
input_geneSearch <- "HOXA"
genePlot()
# Use standard deviation instead of standard error in error bar?
# (kept as the character string 'FALSE', exactly as generated)
input_useSD <- "FALSE"
geneBarPlotError()

##########################
# 3. Heatmap
##########################
# Hierarchical clustering tree
x <- readData.out$data
maxGene <- apply(x, 1, max)
# Drop the lowest-expressed genes, which inflate the PCC.
# NOTE(review): the generated comment said "remove bottom 25%", but
# quantile(maxGene)[1] is the 0% quantile (the minimum), so only genes whose
# maximum equals the overall minimum are removed. Behaviour is left unchanged;
# use quantile(maxGene, 0.25) if a 25% cut is really intended.
x <- x[which(maxGene > quantile(maxGene)[1]), ]
plot(
  as.dendrogram(hclust2(dist2(t(x)))),
  ylab = "1 - Pearson C.C.",
  type = "rectangle"
)

# Correlation matrix
# Show correlation coefficient?
input_labelPCC <- TRUE
correlationMatrix()

# Parameters for heatmap
# Top genes for heatmap
input_nGenes <- 1000
# Centering genes?
input_geneCentering <- TRUE
# Center by sample?
input_sampleCentering <- FALSE
# Normalize by gene?
input_geneNormalize <- FALSE
# Normalize by sample?
input_sampleNormalize <- FALSE
# Use original sample order
input_noSampleClustering <- FALSE
# Remove outliers beyond number of SDs
input_heatmapCutoff <- 4
# Which distance function to use
input_distFunctions <- 1
# Linkage type
input_hclustFunctions <- 1
# Colors
input_heatColors1 <- 1
# Sample coloring factors
input_selectFactorsHeatmap <- NULL
staticHeatmap()

# Legends not showing due to margin.
# For a better figure, plot to a file
dev.off()  # close plot
tiff(
  "heatmap.tiff",
  width = 10,
  height = 15,
  units = "in",
  res = 300,
  compression = "lzw"
)
staticHeatmap()
dev.off()
# Show heatmap in browser
browseURL("heatmap.tiff")
# Interactive heatmap using Plotly
heatmapPlotly()

##########################
# 4. k-Means clustering
##########################
# Number of genes for k-Means
input_nGenesKNN <- 2000
# Number of clusters
input_nClusters <- 2
maxGeneClustering <- 12000
# Normalization
input_kmeansNormalization <- "geneMean"
# Random seed
input_KmeansReRun <- 0
# Distribution of standard deviations
distributionSD()
# Number of clusters
KmeansNclusters()
# Running K-means
Kmeans.out <- Kmeans()
# Heatmap for k-Means
KmeansHeatmap()

# Read gene sets for enrichment analysis
sqlite <- dbDriver("SQLite")
# Gene set category
input_selectGO3 <- "GOBP"
# Min gene set size
input_minSetSize <- 15
# Max gene set size
input_maxSetSize <- 2000
GeneSets.out <- readGeneSets(
  geneSetFile,
  convertedData.out,
  input_selectGO3,
  input_selectOrg,
  c(input_minSetSize, input_maxSetSize)
)
# Alternatively, users can use their own GMT files by
# GeneSets.out <- readGMTRobust('somefile.GMT')
# Enrichment analysis for k-Means clusters
KmeansGO()
# Random seed for t-SNE
input_seedTSNE <- 7
# Color genes in t-SNE plot?
input_colorGenes <- TRUE
# Plot genes using t-SNE
tSNEgenePlot()

##########################
# 5.
# PCA and beyond
##########################
# Factor coded by color
input_selectFactors <- NULL
# Factor coded by shape
input_selectFactors2 <- NULL
# Random seed for t-SNE
input_tsneSeed2 <- 0

# PCA, MDS and t-SNE plots
PCAplot()
MDSplot()
tSNEplot()

# Read gene sets for pathway analysis using PGSEA on principal components
# Gene set category
input_selectGO6 <- "GOBP"
GeneSets.out <- readGeneSets(
  geneSetFile,
  convertedData.out,
  input_selectGO6,
  input_selectOrg,
  c(input_minSetSize, input_maxSetSize)
)
# Run PGSEA analysis
PCApathway()
# The correlation between PCs with factors
cat(PCA2factor())

##########################
# 6. DEG1
##########################
# DESeq2= 3,limma-voom=2,limma-trend=1
input_CountsDEGMethod <- 3
# FDR cutoff
input_limmaPval <- 0.1
# Fold-change cutoff
input_limmaFC <- 2
# Selected comparisons
input_selectModelComprions <- NULL
# NOTE(review): the next three settings carried the same generated comment
# ("Selected comparisons"); presumably they select model factors, interaction
# terms and block factors respectively - confirm against iDEP_core_functions.R.
input_selectFactorsModel <- NULL
input_selectInteractions <- NULL
input_selectBlockFactorsModel <- NULL
factorReferenceLevels.out <- NULL
limma.out <- limma()
limma.out$comparisons
DEG.data.out <- DEG.data()

# Use up to the first three comparisons. head() is used instead of [1:3]
# because indexing past the end of a shorter vector pads the selection with
# NA entries.
input_selectComparisonsVenn <- head(limma.out$comparisons, 3)
# Split up and down regulated genes
input_UpDownRegulated <- FALSE
# Venn diagram
vennPlot()
# Number of DEGs as figure
sigGeneStats()
# Number of DEGs as table
sigGeneStatsTable()

##########################
# 7.
# DEG2
##########################
# Selected comparisons
input_selectContrast <- "Hoxa1KN-control"
selectedHeatmap.data.out <- selectedHeatmap.data()
# Heatmap for DEGs in selected comparison
selectedHeatmap()

# Save gene lists and data into files
write.csv(selectedHeatmap.data()$genes, "heatmap.data.csv")
write.csv(DEG.data(), "DEG.data.csv")
write(AllGeneListsGMT(), "AllGeneListsGMT.gmt")

# Gene set category
input_selectGO2 <- "MSigDB.Curated"
geneListData.out <- geneListData()
volcanoPlot()
scatterPlot()
MAplot()
geneListGOTable.out <- geneListGOTable()

# Read pathway data again
GeneSets.out <- readGeneSets(
  geneSetFile,
  convertedData.out,
  input_selectGO2,
  input_selectOrg,
  c(input_minSetSize, input_maxSetSize)
)
# Remove highly redundant gene sets?
input_removeRedudantSets <- TRUE
geneListGO()

# STRING-db API access
STRING10_species <- read.csv(STRING10_speciesFile)
# Look up the species that matches the input data. The gene info and gene set
# files configured for this analysis are the human Ensembl ones
# (Human__hsapiens_gene_ensembl*), so the lookup uses 'Homo sapiens'; the
# generated script searched for 'Mus musculus', which does not match them.
ix <- grep("Homo sapiens", STRING10_species$official_name, fixed = TRUE)
# Find taxonomyID (first column of the species table)
findTaxonomyID.out <- STRING10_species[ix, 1]
findTaxonomyID.out
# Users can also skip the above and assign NCBI taxonomy id directly by
# findTaxonomyID.out = 9606 # human 9606, mouse 10090 etc.
# Convert gene lists
STRINGdb_geneList.out <- STRINGdb_geneList()
# 'Process', 'Component', 'Function', 'KEGG', 'Pfam', 'InterPro'
input_STRINGdbGO <- "Pfam"
stringDB_GO_enrichmentData()

# PPI network retrieval and analysis
# Number of top genes for PPI retrieval and analysis
input_nGenesPPI <- 100
# Show PPI network
stringDB_network1(1)
# Write results to html file
write(stringDB_network_link(), "PPI_results.html")
# Open in browser
browseURL("PPI_results.html")

##########################
# 8.
# Pathway analysis
##########################
# Select comparison
input_selectContrast1 <- "Hoxa1KN-control"
# input_selectContrast1 = limma.out$comparisons[3] # manually set
# Gene set category
input_selectGO <- "GOCC"
# input_selectGO='custom' # if custom gmt file
# Min size for gene set
input_minSetSize <- 15
# Max size for gene set
input_maxSetSize <- 2000

# Read pathway data again
GeneSets.out <- readGeneSets(
  geneSetFile,
  convertedData.out,
  input_selectGO,
  input_selectOrg,
  c(input_minSetSize, input_maxSetSize)
)
# FDR cutoff
input_pathwayPvalCutoff <- 0.2
# Top pathways to show
input_nPathwayShow <- 30
# Use absolute values of fold-change?
input_absoluteFold <- TRUE
# FDR to remove genes
input_GenePvalCutoff <- 1

# 1 GAGE
input_pathwayMethod <- 1
# Pathway analysis using GAGE
gagePathwayData.out <- gagePathwayData()
gagePathwayData.out
pathwayListData.out <- pathwayListData()
enrichmentPlot(pathwayListData.out, 25)
enrichmentNetwork(pathwayListData.out)
enrichmentNetworkPlotly(pathwayListData.out)

# 3 fgsea
input_pathwayMethod <- 3
# Pathway analysis using fgsea
fgseaPathwayData.out <- fgseaPathwayData()
fgseaPathwayData.out
pathwayListData.out <- pathwayListData()
enrichmentPlot(pathwayListData.out, 25)
enrichmentNetwork(pathwayListData.out)
enrichmentNetworkPlotly(pathwayListData.out)

# Pathway analysis using PGSEA
PGSEAplot()

##########################
# 9. Chromosome
##########################
# Select comparison
input_selectContrast2 <- "Hoxa1KN-control"
# input_selectContrast2 = limma.out$comparisons[3] # manually set
# FDR to filter genes
input_limmaPvalViz <- 0.1
# Fold-change to filter genes
input_limmaFCViz <- 2
# Shows fold-changes on the genome
genomePlotly()

##########################
# 10. Bicluster
##########################
# Top genes for biclustering
input_nGenesBiclust <- 1000
# Method: 'BCCC', 'QUBIC', 'runibic' ...
input_biclustMethod <- "BCCC()"
# Run analysis
biclustering.out <- biclustering()
# Select a cluster
input_selectBicluster <- 1
# Heatmap for selected cluster
biclustHeatmap()
# Gene set
input_selectGO4 <- "GOBP"

# Read pathway data again
GeneSets.out <- readGeneSets(
  geneSetFile,
  convertedData.out,
  input_selectGO4,
  input_selectOrg,
  c(input_minSetSize, input_maxSetSize)
)
# Enrichment analysis
geneListBclustGO()

##########################
# 11. Co-expression network
##########################
# SoftPower to cutoff
input_mySoftPower <- 5
# Number of top genes
input_nGenesNetwork <- 1000
# Module size minimum
input_minModuleSize <- 20
# Run WGCNA
wgcna.out <- wgcna()
# Soft power curve
softPower()
# Plot modules
modulePlot()
# Modules
listWGCNA.Modules.out <- listWGCNA.Modules()
# Gene set
input_selectGO5 <- "GOBP"

# Read pathway data again
GeneSets.out <- readGeneSets(
  geneSetFile,
  convertedData.out,
  input_selectGO5,
  input_selectOrg,
  c(input_minSetSize, input_maxSetSize)
)
# Select a module
input_selectWGCNA.Module <- "1. turquoise (995 genes)"
# NOTE(review): the generated comments on the next two settings read
# "SoftPower to cutoff" and "Number of top genes", which look shifted by one
# line; presumably these are the number of top genes and the edge threshold
# for the module network - confirm against iDEP_core_functions.R.
input_topGenesNetwork <- 10
input_edgeThreshold <- 0.4
# Show network of top genes in selected module
moduleNetwork()
# Remove redundant gene sets
input_removeRedudantSets <- TRUE
# Enrichment analysis of selected module
networkModuleGO()