# R code for stand-alone iDEP analysis 
# by Steven Xijin Ge, South Dakota State University,  gexijin@gmail.com  
# Generated by  iDEP 0.71  hosted at http://ge-lab.org/idep/  Sat Mar 31 10:06:10 2018 

##########################
# 1. Read data
##########################
# Directory that holds the input files and iDEP_core_functions.R.
setwd("C:/Users/Xijin.Ge/Downloads") # Needs to be changed
# Loads the helper functions called throughout the rest of this script.
source("iDEP_core_functions.R")

# Input files
# The expression file has to use Ensembl gene IDs. Otherwise, use a custom
# pathway database with matching IDs.
# Expression matrix
inputFile <- "Downloaded_Converted_Data.csv"
# No sample information file is used in this run.
sampleInfoFile <- NULL
# Gene symbols, location etc.
geneInfoFile <- "Human__hsapiens_gene_ensembl_GeneInfo.csv"
# Pathway database in SQL; can be GMT format
geneSetFile <- "Human__hsapiens_gene_ensembl.db"
# Species table read later in the STRING-db step
STRING10_speciesFile <- "https://raw.githubusercontent.com/iDEP-SDSU/idep/master/shinyapps/idep/STRING10_species.csv"

# Parameters
# Missing values imputation method
input_missingValue <- "geneMedian"
# 1- read counts, 2 FPKM/RPKM or DNA microarray
input_dataFileFormat <- 1
# Min counts
input_minCounts <- 0.5
# Minimum number of samples
input_NminSamples <- 1
# Pseudo count for log CPM
input_countsLogStart <- 4
# Methods for data transformation of counts. 1-EdgeR's logCPM; 2-VST; 3-rlog
input_CountsTransform <- 3

# Read data files
# NOTE(review): the helpers below appear to read the input_* and *.out
# variables from the global environment, so keep the statement order.
readData.out <- readData(inputFile)
readSampleInfo.out <- NULL
input_selectOrg <- "NEW"
# Gene set category
input_selectGO <- "GOCC"
input_noIDConversion <- TRUE
allGeneInfo.out <- geneInfo(geneInfoFile)
converted.out <- NULL
convertedData.out <- convertedData()
nGenesFilter()
# Converted counts, just for compatibility
convertedCounts.out <- convertedCounts()
# Detecting bias in sequencing depth
readCountsBias()

##########################
# 2. Pre-Process
##########################
# Snapshot only the settable graphics parameters. A bare par() also returns
# the read-only ones (cin, cra, csi, cxy, din, page), and passing those back
# to par() when restoring emits a warning for each of them.
parDefault <- par(no.readonly = TRUE)
# Enlarged bottom margin for the bar plot and box plot below.
par(mar = c(12, 4, 2, 2))

# Bar plot of total read counts
x <- readData.out$rawCounts
groups <- as.factor(detectGroups(colnames(x)))
# Color samples by detected group; fall back to a single color when there is
# at most one group or more than 20. Both operands are scalars, so the
# short-circuiting || is the appropriate operator inside if().
if (nlevels(groups) <= 1 || nlevels(groups) > 20) {
  col1 <- "green"
} else {
  col1 <- rainbow(nlevels(groups))[groups]
}
barplot(
  colSums(x) / 1e6,
  col = col1, las = 3, main = "Total read counts (millions)"
)

# Box plot of the transformed data
x <- readData.out$data
boxplot(
  x,
  las = 2, col = col1,
  ylab = "Transformed expression levels",
  main = "Distribution of transformed data"
)

# Density plot, drawn with the original graphics settings restored
par(parDefault)
densityPlot()

# Scatter plot of the first two samples
plot(
  x[, 1:2],
  xlab = colnames(x)[1], ylab = colnames(x)[2],
  main = "Scatter plot of first two samples"
)

# Plot gene or gene family
input_selectOrg <- "BestMatch"
input_geneSearch <- "HOXA" # Gene ID for searching
genePlot()
# Use standard deviation instead of standard error in error bar?
# Kept as the character string 'FALSE', exactly as generated.
input_useSD <- "FALSE"
geneBarPlotError()

##########################
# 3. Heatmap
##########################
# Hierarchical clustering tree
x <- readData.out$data
maxGene <- apply(x, 1, max)
# Remove the bottom 25% lowly expressed genes, which inflate the PCC.
# The cutoff must be requested explicitly: quantile(maxGene)[1] is the 0%
# quantile (the minimum), which would only drop genes tied for the minimum.
x <- x[which(maxGene > quantile(maxGene, probs = 0.25)), ]
plot(
  as.dendrogram(hclust2(dist2(t(x)))),
  ylab = "1 - Pearson C.C.", type = "rectangle"
)

# Correlation matrix
input_labelPCC <- TRUE # Show correlation coefficient?
correlationMatrix()

# Parameters for heatmap
input_nGenes <- 1000 # Top genes for heatmap
input_geneCentering <- TRUE # Centering genes?
input_sampleCentering <- FALSE # Center by sample?
input_geneNormalize <- FALSE # Normalize by gene?
input_sampleNormalize <- FALSE # Normalize by sample?
input_noSampleClustering <- FALSE # Use original sample order
input_heatmapCutoff <- 4 # Remove outliers beyond number of SDs
input_distFunctions <- 1 # Which distance function to use
input_hclustFunctions <- 1 # Linkage type
input_heatColors1 <- 1 # Colors
input_selectFactorsHeatmap <- NULL # Sample coloring factors

staticHeatmap() # Legends not showing due to margin.

# For a better figure, plot to a file
# Close the on-screen plot first. dev.off() raises an error when only the
# null device (number 1) is active, so close a device only if one is open.
if (dev.cur() > 1) dev.off()
tiff(
  "heatmap.tiff",
  width = 10, height = 15, units = "in", res = 300, compression = "lzw"
)
staticHeatmap()
dev.off()
browseURL("heatmap.tiff") # show heatmap in browser
heatmapPlotly() # interactive heatmap using Plotly

##########################
# 4. k-Means clustering
##########################
# Number of genes for k-Means
input_nGenesKNN <- 2000
# Number of clusters
input_nClusters <- 2
maxGeneClustering <- 12000
# Normalization
input_kmeansNormalization <- "geneMean"
# Random seed
input_KmeansReRun <- 0

# Distribution of standard deviations
distributionSD()
# Number of clusters
KmeansNclusters()

# Running K-means
Kmeans.out <- Kmeans()
# Heatmap for k-Means
KmeansHeatmap()


# Read gene sets for enrichment analysis
sqlite <- dbDriver("SQLite")
# Gene set category
input_selectGO3 <- "GOBP"
# Min gene set size
input_minSetSize <- 15
# Max gene set size
input_maxSetSize <- 2000
GeneSets.out <- readGeneSets(
  geneSetFile,
  convertedData.out,
  input_selectGO3,
  input_selectOrg,
  c(input_minSetSize, input_maxSetSize)
)
# Alternatively, users can use their own GMT files by
# GeneSets.out <- readGMTRobust('somefile.GMT')
# Enrichment analysis for k-Means clusters
KmeansGO()

# Random seed for t-SNE
input_seedTSNE <- 7
# Color genes in t-SNE plot?
input_colorGenes <- TRUE
# Plot genes using t-SNE
tSNEgenePlot()

##########################
# 5. PCA and beyond
##########################
# Factor coded by color
input_selectFactors <- NULL
# Factor coded by shape
input_selectFactors2 <- NULL
# Random seed for t-SNE
input_tsneSeed2 <- 0

# PCA, MDS and t-SNE plots
PCAplot()
MDSplot()
tSNEplot()

# Read gene sets for pathway analysis using PGSEA on principal components
# Gene set category
input_selectGO6 <- "GOBP"
GeneSets.out <- readGeneSets(
  geneSetFile,
  convertedData.out,
  input_selectGO6,
  input_selectOrg,
  c(input_minSetSize, input_maxSetSize)
)
# Run PGSEA analysis
PCApathway()
# The correlation between PCs with factors
cat(PCA2factor())

##########################
# 6. DEG1
##########################
input_CountsDEGMethod <- 3 # DESeq2= 3,limma-voom=2,limma-trend=1
input_limmaPval <- 0.1 # FDR cutoff
input_limmaFC <- 2 # Fold-change cutoff
input_selectModelComprions <- NULL # Selected comparisons
# NOTE(review): judging by their names, the next three settings select the
# model factors, interaction terms and block factors; confirm against
# iDEP_core_functions.R.
input_selectFactorsModel <- NULL
input_selectInteractions <- NULL
input_selectBlockFactorsModel <- NULL
factorReferenceLevels.out <- NULL

limma.out <- limma()
limma.out$comparisons # print the available comparisons
DEG.data.out <- DEG.data()
# Use up to the first three comparisons. head() returns only what exists,
# whereas indexing with [1:3] pads the selection with NA when the analysis
# produced fewer than three comparisons.
input_selectComparisonsVenn <- head(limma.out$comparisons, 3)
input_UpDownRegulated <- FALSE # Split up and down regulated genes
vennPlot() # Venn diagram
sigGeneStats() # number of DEGs as figure
sigGeneStatsTable() # number of DEGs as table

##########################
# 7. DEG2
##########################
# Selected comparisons
input_selectContrast <- "Hoxa1KN-control"
selectedHeatmap.data.out <- selectedHeatmap.data()
# Heatmap for DEGs in selected comparison
selectedHeatmap()

# Save gene lists and data into files
write.csv(selectedHeatmap.data()$genes, "heatmap.data.csv")
write.csv(DEG.data(), "DEG.data.csv")
write(AllGeneListsGMT(), "AllGeneListsGMT.gmt")

# Gene set category
input_selectGO2 <- "MSigDB.Curated"
geneListData.out <- geneListData()
volcanoPlot()
scatterPlot()
MAplot()
geneListGOTable.out <- geneListGOTable()

# Read pathway data again
GeneSets.out <- readGeneSets(
  geneSetFile,
  convertedData.out,
  input_selectGO2,
  input_selectOrg,
  c(input_minSetSize, input_maxSetSize)
)
# Remove highly redundant gene sets?
input_removeRedudantSets <- TRUE
geneListGO()

 # STRING-db API access
 STRING10_species <- read.csv(STRING10_speciesFile)
 # Look up the taxonomy ID for the species being analyzed. The gene
 # information file and the pathway database configured at the top of this
 # script are the Human (hsapiens) ones, so the lookup has to use the human
 # entry; searching for 'Mus musculus' would select the mouse taxonomy ID.
 ix <- grep("Homo sapiens", STRING10_species$official_name)
 findTaxonomyID.out <- STRING10_species[ix, 1] # find taxonomyID
 findTaxonomyID.out # print it for a quick sanity check
 # users can also skip the above and assign NCBI taxonomy id directly by
 # findTaxonomyID.out = 9606 # mouse 10090, human 9606 etc.
 STRINGdb_geneList.out <- STRINGdb_geneList() # convert gene lists
 # 'Process', 'Component', 'Function', 'KEGG', 'Pfam', 'InterPro'
 input_STRINGdbGO <- "Pfam"
 stringDB_GO_enrichmentData()

 # PPI network retrieval and analysis
 input_nGenesPPI <- 100 # Number of top genes for PPI retrieval and analysis
 stringDB_network1(1) # Show PPI network
 write(stringDB_network_link(), "PPI_results.html") # write results to html file
 browseURL("PPI_results.html") # open in browser

##########################
# 8. Pathway analysis
##########################
# Select comparison
input_selectContrast1 <- "Hoxa1KN-control"
# input_selectContrast1 = limma.out$comparisons[3] # manually set
# Gene set category
input_selectGO <- "GOCC"
# input_selectGO='custom' # if custom gmt file
# Min size for gene set
input_minSetSize <- 15
# Max size for gene set
input_maxSetSize <- 2000

# Read pathway data again
GeneSets.out <- readGeneSets(
  geneSetFile,
  convertedData.out,
  input_selectGO,
  input_selectOrg,
  c(input_minSetSize, input_maxSetSize)
)
# FDR cutoff
input_pathwayPvalCutoff <- 0.2
# Top pathways to show
input_nPathwayShow <- 30
# Use absolute values of fold-change?
input_absoluteFold <- TRUE
# FDR to remove genes
input_GenePvalCutoff <- 1

# Method 1: GAGE
input_pathwayMethod <- 1
# Pathway analysis using GAGE
gagePathwayData.out <- gagePathwayData()
gagePathwayData.out
pathwayListData.out <- pathwayListData()
enrichmentPlot(pathwayListData.out, 25)
enrichmentNetwork(pathwayListData.out)
enrichmentNetworkPlotly(pathwayListData.out)

# Method 3: fgsea
input_pathwayMethod <- 3
# Pathway analysis using fgsea
fgseaPathwayData.out <- fgseaPathwayData()
fgseaPathwayData.out
pathwayListData.out <- pathwayListData()
enrichmentPlot(pathwayListData.out, 25)
enrichmentNetwork(pathwayListData.out)
enrichmentNetworkPlotly(pathwayListData.out)

# Pathway analysis using PGSEA
PGSEAplot()

##########################
# 9. Chromosome
##########################
# Select comparison
input_selectContrast2 <- "Hoxa1KN-control"
# input_selectContrast2 = limma.out$comparisons[3] # manually set
# FDR to filter genes
input_limmaPvalViz <- 0.1
# Fold-change cutoff to filter genes (judging by the variable name)
input_limmaFCViz <- 2
# Shows fold-changes on the genome
genomePlotly()

##########################
# 10. Bicluster
##########################
# Top genes for biclustering
input_nGenesBiclust <- 1000
# Method: 'BCCC', 'QUBIC', 'runibic' ...
# The value is stored with its trailing parentheses, exactly as generated.
input_biclustMethod <- "BCCC()"
# Run analysis
biclustering.out <- biclustering()

# Select a cluster
input_selectBicluster <- 1
# Heatmap for selected cluster
biclustHeatmap()
# Gene set
input_selectGO4 <- "GOBP"
# Read pathway data again
GeneSets.out <- readGeneSets(
  geneSetFile,
  convertedData.out,
  input_selectGO4,
  input_selectOrg,
  c(input_minSetSize, input_maxSetSize)
)
# Enrichment analysis
geneListBclustGO()

##########################
# 11. Co-expression network
##########################
# SoftPower to cutoff
input_mySoftPower <- 5
# Number of top genes
input_nGenesNetwork <- 1000
# Module size minimum
input_minModuleSize <- 20
# Run WGCNA
wgcna.out <- wgcna()
# Soft power curve
softPower()
# Plot modules
modulePlot()
# Modules
listWGCNA.Modules.out <- listWGCNA.Modules()

# Gene set
input_selectGO5 <- "GOBP"
# Read pathway data again
GeneSets.out <- readGeneSets(
  geneSetFile,
  convertedData.out,
  input_selectGO5,
  input_selectOrg,
  c(input_minSetSize, input_maxSetSize)
)
# Select a module
input_selectWGCNA.Module <- "1. turquoise (995 genes)"
# NOTE(review): presumably the number of top genes shown in the module
# network, judging by the variable name; confirm in iDEP_core_functions.R
input_topGenesNetwork <- 10
# NOTE(review): presumably the edge threshold for the module network,
# judging by the variable name; confirm in iDEP_core_functions.R
input_edgeThreshold <- 0.4
# Show network of top genes in selected module
moduleNetwork()

# Remove redundant gene sets
input_removeRedudantSets <- TRUE
# Enrichment analysis of selected module
networkModuleGO()