---
title: 'Reproducing iDEP analyses with auto-generated R Markdown'
author:  iDEP 0.71  http://ge-lab.org/idep/, originally by Steven Xijin.Ge@sdstate.edu  
date: Sat Mar 31 10:06:11 2018  
output: html_document
---
```{r setup, include=FALSE}
# Global chunk defaults: echo the code and draw centred 6 x 5 inch figures.
knitr::opts_chunk$set(
  echo = TRUE,
  fig.width = 6,
  fig.height = 5,
  fig.align = 'center'
)
``` 
 This R markdown file was auto-generated by the [iDEP website](http://ge-lab.org/idep/). It is assumed that users have analyzed their data with iDEP by clicking through all the tabs and have downloaded the related files to a folder. 
	

## 1. Read data  
First we set up the working directory to where the files are saved.    
```{r, message=FALSE} 
 # Folder that holds the files downloaded from iDEP.
 dataDir <- 'C:/Users/Xijin.Ge/Downloads'   # Needs to be changed
 # knitr restores the working directory at the end of every chunk, so setwd()
 # alone would not carry over to the chunks below. Registering the folder as
 # knitr's root.dir makes all following chunks run inside it.
 knitr::opts_knit$set(root.dir = dataDir)
 # When the lines are run by hand (not knitted), setwd() is what persists.
 if (!isTRUE(getOption('knitr.in.progress'))) setwd(dataDir)
```
R packages and iDEP core functions. 
	Users can also download the iDEP_core_functions.R file. 
	Many R packages need to be installed first. This may take hours. 
	Each of these packages took years to develop, so be a patient thief. Sometimes dependencies need to be installed manually. 
	If you are using an older version of R and are having trouble with package installation,
	try uninstalling the current version of R, deleting all of its folders and files 
	(e.g. C:/Program Files/R/R-3.4.3), and 
	reinstalling from scratch.  
```{r, message=FALSE  }  
 # Load the iDEP helper functions: use the local copy when it is present in the
 # working directory, otherwise fetch the script from the iDEP GitHub repository.
 if (!file.exists('iDEP_core_functions.R')) {
   source('https://raw.githubusercontent.com/iDEP-SDSU/idep/master/shinyapps/idep/iDEP_core_functions.R')
 } else {
   source('iDEP_core_functions.R')
 }
```  
We are using the downloaded gene expression file where gene IDs have 
	been converted to Ensembl gene IDs. This is because the ID conversion database is too large
	to download. You can use your original file if it already uses Ensembl IDs, or if you do not want 
	to use the pathway files available in iDEP (or they are not available).  
```{r, message=FALSE  }  
 # Input files. The local file names are relative, so they are looked up in the working directory.
 inputFile <- 'Downloaded_Converted_Data.csv'  # Expression matrix
 sampleInfoFile <- NULL  # no sample information file is used in this report
 geneInfoFile <- 'Human__hsapiens_gene_ensembl_GeneInfo.csv' #Gene symbols, location etc. 
 geneSetFile <- 'Human__hsapiens_gene_ensembl.db'  # pathway database in SQL; can be GMT format 
 # Species table hosted on GitHub; it is read later to look up the STRING taxonomy ID
 STRING10_speciesFile <- 'https://raw.githubusercontent.com/iDEP-SDSU/idep/master/shinyapps/idep/STRING10_species.csv' 
``` 
Parameters for reading data  
```{r, message=FALSE  } 
 # Settings for loading the data (presumably read as globals by the iDEP helpers — TODO confirm)
 input_missingValue <- 'geneMedian'	#Missing values imputation method
 input_dataFileFormat <- 1	#1- read counts, 2- FPKM/RPKM or DNA microarray
 input_minCounts <- 0.5	#Min counts
 input_NminSamples <- 1	#Minimum number of samples 
 input_countsLogStart <- 4	#Pseudo count for log CPM
 input_CountsTransform <- 3	#Methods for data transformation of counts. 1-EdgeR's logCPM 2-VST, 3-rlog 
```
```{r, message=FALSE  }  

 # Load the expression file; the result is kept in readData.out and reused by the chunks below
 readData.out <- readData(inputFile) 
 library(knitr)   #  install if needed. for showing tables with kable 
 kable( head(readData.out$data) )    # show the first few rows of data 
```
```{r, message=FALSE  }  
 # sampleInfoFile is NULL in this report, so the sample information is left empty as well
 readSampleInfo.out <- NULL 
```
```{r, message=FALSE  }  
 # Organism and gene-set settings, followed by the gene annotation and the
 # converted data objects that later chunks rely on.
 input_selectOrg <- "NEW"
 input_selectGO <- 'GOCC'  # Gene set category
 input_noIDConversion <- TRUE
 allGeneInfo.out <- geneInfo(geneInfoFile)
 converted.out <- NULL
 convertedData.out <- convertedData()
 nGenesFilter()
 convertedCounts.out <- convertedCounts()  # converted counts, just for compatibility
```

## 2. Pre-process 
```{r, message=FALSE  }  
# Read counts per library
 # Save only the graphical parameters that can be set again. A plain par() also
 # returns read-only ones (cin, cra, ...), which makes the later
 # par(parDefault) call emit "cannot be set" warnings.
 parDefault <- par(no.readonly = TRUE)
 par(mar = c(12, 4, 2, 2))  # wide bottom margin for the rotated sample names
 # barplot of total read counts
 x <- readData.out$rawCounts
 groups <- as.factor(detectGroups(colnames(x)))
 # One colour per sample group, or plain green when there is a single group or
 # more than 20 groups. The condition is scalar, hence || rather than |.
 if (nlevels(groups) <= 1 || nlevels(groups) > 20) {
   col1 <- 'green'
 } else {
   col1 <- rainbow(nlevels(groups))[groups]
 }

 barplot(colSums(x) / 1e6,
         col = col1, las = 3, main = "Total read counts (millions)")
 readCountsBias()  # detecting bias in sequencing depth
```
```{r, message=FALSE  }  

 # Box plot of the transformed data, one box per column, coloured with the
 # colours (col1) chosen for the read-count bar plot.
 x <- readData.out$data
 boxplot(
   x,
   las = 2,
   col = col1,
   ylab = 'Transformed expression levels',
   main = 'Distribution of transformed data'
 )
```
```{r, message=FALSE  }  

 #Density plot 
 par(parDefault) # restore the graphical parameters saved before the bar plot margins were changed
 densityPlot()       
```
```{r, message=FALSE  }  

 # Scatter plot of column 1 against column 2 of x (the transformed data
 # assigned in the box plot chunk), with the column names as axis labels.
 plot(
   x[, 1:2],
   xlab = colnames(x)[1],
   ylab = colnames(x)[2],
   main = 'Scatter plot of first two samples'
 )
```
```{r, message=FALSE  }  

 # Plot a gene or gene family
 # Note: input_selectOrg is switched from "NEW" to "BestMatch" here and keeps
 # that value for all later chunks.
 input_selectOrg <- "BestMatch"
 input_geneSearch <- 'HOXA'  # Gene ID for searching
 genePlot()
```
```{r, message=FALSE  } 
 # Note that the flag below is the character string 'FALSE', not the logical FALSE
 input_useSD <- 'FALSE'	#Use standard deviation instead of standard error in error bar? 
 geneBarPlotError()       
```

## 3. Heatmap  
```{r, message=FALSE  }  
 # hierarchical clustering tree
 x <- readData.out$data
 maxGene <- apply(x,1,max)   # row-wise maximum, i.e. one value per gene row
 # Meant to remove lowly expressed genes, which inflate the PCC.
 # NOTE(review): quantile(maxGene)[1] is the 0% quantile (the minimum), so only rows whose
 # maximum equals the overall minimum are dropped; the 25% cutoff would be quantile(maxGene)[2].
 # Left unchanged to stay consistent with the iDEP output — confirm the intended cutoff.
 x <- x[which(maxGene > quantile(maxGene)[1] ) ,] 
 # Samples are the columns of x, hence the transpose before computing distances
 plot(as.dendrogram(hclust2( dist2(t(x)))), ylab="1 - Pearson C.C.", type = "rectangle") 
```
```{r, message=FALSE  }  
 #Correlation matrix
 input_labelPCC <- TRUE	#Show correlation coefficient? 
 correlationMatrix()  # takes no arguments; presumably reads input_labelPCC as a global — TODO confirm
```
```{r, message=FALSE  }  

 # Parameters for heatmap
 input_nGenes <- 1000	#Top genes for heatmap
 input_geneCentering <- TRUE	#Center by gene?
 input_sampleCentering <- FALSE	#Center by sample?
 input_geneNormalize <- FALSE	#Normalize by gene?
 input_sampleNormalize <- FALSE	#Normalize by sample?
 input_noSampleClustering <- FALSE	#Use original sample order
 input_heatmapCutoff <- 4	#Remove outliers beyond number of SDs 
 input_distFunctions <- 1	#Which distance function to use
 input_hclustFunctions <- 1	#Linkage type
 input_heatColors1 <- 1	#Colors
 input_selectFactorsHeatmap <- NULL 	#Sample coloring factors 
 # Draw the heatmap into a 10 x 15 inch, 300 dpi PNG file in the working directory;
 # dev.off() closes the file so that it can be embedded right after this chunk.
 png('heatmap.png', width = 10, height = 15, units = 'in', res = 300) 
 staticHeatmap() 
 dev.off()  
```
  ![heatmap](heatmap.png)   
```{r, message=FALSE  }  
 # No arguments are passed; presumably uses the heatmap parameters set in the previous chunk — TODO confirm
 heatmapPlotly() # interactive heatmap using Plotly 
```

## 4. K-means clustering   
```{r, message=FALSE  } 
 # Settings for k-means clustering
 input_nGenesKNN <- 2000                  # Number of genes for k-means
 input_nClusters <- 2                     # Number of clusters
 maxGeneClustering <- 12000
 input_kmeansNormalization <- 'geneMean'  # Normalization
 input_KmeansReRun <- 0                   # Random seed

 distributionSD()  # Distribution of standard deviations
```     
```{r, message=FALSE  }  
 # Diagnostic for choosing input_nClusters (presumably; based on the helper's name — TODO confirm)
 KmeansNclusters()  #Number of clusters 
```     
```{r, message=FALSE  }  

 # Run k-means and keep the result in Kmeans.out, then draw the heatmap
 Kmeans.out <- Kmeans()
 KmeansHeatmap()
 
```     
```{r, message=FALSE  }  

 #Read gene sets for enrichment analysis 
 sqlite  <- dbDriver('SQLite')	# SQLite driver; presumably used by readGeneSets() to open geneSetFile — TODO confirm
 input_selectGO3 <- 'GOBP'	#Gene set category
 input_minSetSize <- 15	#Min gene set size
 input_maxSetSize <- 2000	#Max gene set size 
 # The last argument passes the minimum and maximum gene set size as one vector
 GeneSets.out <-readGeneSets( geneSetFile,
    convertedData.out, input_selectGO3,input_selectOrg,
    c(input_minSetSize, input_maxSetSize)  )  
 # Alternatively, users can use their own GMT files by
 #GeneSets.out <- readGMTRobust('somefile.GMT')  
 results <- KmeansGO()  #Enrichment analysis for k-Means clusters	
 # format() turns the adjusted P values into strings with 3 significant digits for display
 results$adj.Pval <- format( results$adj.Pval,digits=3 )
 kable( results, row.names=FALSE) 
```     
```{r, message=FALSE  } 
 # Settings for the gene-level t-SNE plot, followed by the plot itself
 input_seedTSNE <- 7	#Random seed for t-SNE
 input_colorGenes <- TRUE	#Color genes in t-SNE plot? 
 tSNEgenePlot()  #Plot genes using t-SNE 
```

## 5. PCA and beyond   
```{r, message=FALSE  }  
 # Sample attributes for the plots; both are set to 'Sample_Name'
 # (presumably one drives colour and the other shape — TODO confirm)
 input_selectFactors <- 'Sample_Name'  
 input_selectFactors2 <- 'Sample_Name' 
 input_tsneSeed2 <- 0	#Random seed for t-SNE 
 #PCA, MDS and t-SNE plots
 PCAplot()  
```     
```{r, message=FALSE  }  
 # MDS plot; takes no arguments (presumably reuses the factor settings from the PCA chunk — TODO confirm)
 MDSplot() 
```     
```{r, message=FALSE  }  
 # t-SNE plot; input_tsneSeed2 was set in the PCA chunk as its random seed
 tSNEplot()  
```     
```{r, message=FALSE  }  

 #Read gene sets for pathway analysis using PGSEA on principal components
 input_selectGO6 <- 'GOBP'	#Gene set category 
 # Reload GeneSets.out for this category; the size limits were set in the k-means enrichment chunk
 GeneSets.out <-readGeneSets( geneSetFile,
    convertedData.out, input_selectGO6,input_selectOrg,
    c(input_minSetSize, input_maxSetSize)  )  
 PCApathway() # Run PGSEA analysis 
```     
```{r, message=FALSE  }  
 # cat() writes the value returned by PCA2factor() to the chunk output
 cat( PCA2factor() )   #The correlation between PCs with factors 
```

## 6. DEG1   
```{r, message=FALSE  } 
 input_CountsDEGMethod <- 3	#DESeq2= 3,limma-voom=2,limma-trend=1 
 input_limmaPval <- 0.1	#FDR cutoff
 input_limmaFC <- 2	#Fold-change cutoff
 # The four settings below all carried the same generated comment; the descriptions
 # are inferred from the variable names — TODO confirm against the iDEP core functions.
 input_selectModelComprions <- NULL 	#Selected comparisons
 input_selectFactorsModel <- NULL 	#Factors in the model (presumably)
 input_selectInteractions <- NULL 	#Interaction terms (presumably)
 input_selectBlockFactorsModel <- NULL 	#Blocking factors (presumably)
 factorReferenceLevels.out <- NULL 

 # Results are stored for the following chunks
 limma.out <- limma()
 DEG.data.out <- DEG.data()
 limma.out$comparisons   # print the available comparisons
```     
```{r, message=FALSE  }  
 # Use up to the first three comparisons. head() stops at the end of the
 # vector, whereas indexing with [1:3] pads the selection with NA entries when
 # fewer than three comparisons exist.
 input_selectComparisonsVenn <- head(limma.out$comparisons, 3)
 input_UpDownRegulated <- FALSE  # Split up and down regulated genes
 vennPlot() # Venn diagram
```     
```{r, message=FALSE  }  
  # Relies on the differential expression results computed in the DEG1 chunk (presumably limma.out — TODO confirm)
  sigGeneStats() # number of DEGs as figure 
```     
```{r, message=FALSE  }  
  # Same summary as the figure in the previous chunk, shown as a table
  sigGeneStatsTable() # number of DEGs as table 
```

## 7. DEG2   
```{r, message=FALSE  } 
 input_selectContrast <- 'Hoxa1KN-control'  # Selected comparison
 selectedHeatmap.data.out <- selectedHeatmap.data()
 selectedHeatmap()   # heatmap for DEGs in selected comparison

 # Save gene lists and data into files (written to the working directory).
 # The heatmap data computed above is reused instead of calling
 # selectedHeatmap.data() a second time, so the exported genes are exactly
 # the ones shown in the heatmap.
 write.csv(selectedHeatmap.data.out$genes, 'heatmap.data.csv')
 write.csv(DEG.data(), 'DEG.data.csv')
 write(AllGeneListsGMT(), 'AllGeneListsGMT.gmt')
 
```     
```{r, message=FALSE  } 
 input_selectGO2 <- 'MSigDB.Curated'	#Gene set category 
 # Stored for the following chunks (presumably the genes of the selected comparison — TODO confirm)
 geneListData.out <- geneListData()  
 volcanoPlot()  
```     
```{r, message=FALSE  }  
  # Scatter plot for the selected comparison (presumably input_selectContrast — TODO confirm)
  scatterPlot()  
```     
```{r, message=FALSE  }  
  # MA plot for the selected comparison (presumably input_selectContrast — TODO confirm)
  MAplot()  
```     
```{r, message=FALSE  }  
  # NOTE(review): geneListGOTable() runs before GeneSets.out is reloaded for input_selectGO2 below;
  # if it reads GeneSets.out it still sees the gene sets loaded by an earlier chunk — confirm the order.
  geneListGOTable.out <- geneListGOTable()  
 # Read pathway data again 
 GeneSets.out <-readGeneSets( geneSetFile,
    convertedData.out, input_selectGO2,input_selectOrg,
    c(input_minSetSize, input_maxSetSize)  ) 
 input_removeRedudantSets <- TRUE	#Remove highly redundant gene sets? 
 results <- geneListGO()  #Enrichment analysis
 # format() turns the adjusted P values into strings with 3 significant digits for display
 results$adj.Pval <- format( results$adj.Pval,digits=3 )
 kable( results, row.names=FALSE) 
```

STRING-db API access. 
	We need to find the taxonomy ID of your species, which is used by STRING.
  First we try to guess the ID based on iDEP's database. Users can also skip this step and assign the NCBI taxonomy ID directly with
  `findTaxonomyID.out = 9606  # human 9606, mouse 10090, etc.`
	
```{r, message=FALSE  }  

  
 # Look up the NCBI taxonomy ID that STRING uses for the species being analysed.
 # The gene information and pathway files configured at the top of this report
 # are the human (hsapiens) ones, so the species searched for here has to be
 # human as well; change both together when analysing another organism.
 STRING10_species <- read.csv(STRING10_speciesFile)
 ix <- grep('Homo sapiens', STRING10_species$official_name)
 # Fail early with a clear message instead of passing an empty ID to STRING.
 if (length(ix) == 0) {
   stop('Species not found in the STRING species table', call. = FALSE)
 }
 findTaxonomyID.out <- STRING10_species[ix, 1] # first column holds the taxonomy ID
 findTaxonomyID.out
 
``` 
Enrichment analysis using STRING     
```{r, message=FALSE  }  
  # Map the gene lists to STRING (presumably using findTaxonomyID.out from the previous chunk — TODO confirm)
  STRINGdb_geneList.out <- STRINGdb_geneList() #convert gene lists
 input_STRINGdbGO <- 'Pfam'	#'Process', 'Component', 'Function', 'KEGG', 'Pfam', 'InterPro' 
 results <- stringDB_GO_enrichmentData()  # enrichment using STRING	
 # format() turns the adjusted P values into strings with 3 significant digits for display
 results$adj.Pval <- format( results$adj.Pval,digits=3 )
 kable( results, row.names=FALSE) 
``` 
PPI network retrieval and analysis    
```{r, message=FALSE  } 
 input_nGenesPPI <- 100	#Number of top genes for PPI retrieval and analysis 
 # The meaning of the argument 1 is defined by the helper — TODO confirm
 stringDB_network1(1) #Show PPI network 
``` 
Generating interactive PPI   
```{r, message=FALSE  }  
 # The HTML file is written to the working directory.
 # NOTE(review): browseURL() is also attempted when the report is knitted non-interactively.
 write(stringDB_network_link(), 'PPI_results.html') # write results to html file 
 browseURL('PPI_results.html') # open in browser 
```

## 8. Pathway analysis   
```{r, message=FALSE  } 
 # Comparison and gene set category used for the pathway analysis
 input_selectContrast1 <- 'Hoxa1KN-control'  # select Comparison
 #input_selectContrast1 = limma.out$comparisons[3] # manually set
 input_selectGO <- 'GOCC'                    # Gene set category
 #input_selectGO='custom' # if custom gmt file
 input_minSetSize <- 15                      # Min size for gene set
 input_maxSetSize <- 2000                    # Max size for gene set

 # Read pathway data again
 GeneSets.out <- readGeneSets(
   geneSetFile,
   convertedData.out,
   input_selectGO,
   input_selectOrg,
   c(input_minSetSize, input_maxSetSize)
 )

 # Cutoffs and display options
 input_pathwayPvalCutoff <- 0.2  # FDR cutoff
 input_nPathwayShow <- 30        # Top pathways to show
 input_absoluteFold <- TRUE      # Use absolute values of fold-change?
 input_GenePvalCutoff <- 1       # FDR to remove genes

 input_pathwayMethod <- 1  # 1  GAGE
 gagePathwayData.out <- gagePathwayData()  # pathway analysis using GAGE

 # Show the GAGE result as a table with the adjusted P values formatted to 3 digits
 results <- gagePathwayData.out
 results$adj.Pval <- format(results$adj.Pval, digits = 3)
 kable(results, row.names = FALSE)
```     
```{r, message=FALSE  }  
 # Collect the pathway list data and keep it for the network plots that follow
 pathwayListData.out <- pathwayListData()
 enrichmentPlot(pathwayListData.out, 25)
```     
```{r, message=FALSE  }  
  # Network view of the pathway list stored in pathwayListData.out by the previous chunk
  enrichmentNetwork(pathwayListData.out )  
```     
```{r, message=FALSE  }  
  # Plotly version of the network for the same pathwayListData.out
  enrichmentNetworkPlotly(pathwayListData.out) 
```     
```{r, message=FALSE  }  

 # Switch the pathway method for the fgsea run. The generated comment read
 # "1 fgsea", but the value set is 3 (presumably the code for fgsea).
 input_pathwayMethod <- 3
 fgseaPathwayData.out <- fgseaPathwayData()  # Pathway analysis using fgsea

 # Show the fgsea result as a table with the adjusted P values formatted to 3 digits
 results <- fgseaPathwayData.out
 results$adj.Pval <- format(results$adj.Pval, digits = 3)
 kable(results, row.names = FALSE)
```     
```{r, message=FALSE  }  
  # Recompute the pathway list data now that input_pathwayMethod is 3, and keep it for the network plots
  pathwayListData.out <- pathwayListData()
  enrichmentPlot(pathwayListData.out, 25)
```     
```{r, message=FALSE  }  
  # Network view of the pathway list stored in pathwayListData.out by the previous chunk
  enrichmentNetwork(pathwayListData.out )  
```     
```{r, message=FALSE  }  
  # Plotly version of the network for the same pathwayListData.out
  enrichmentNetworkPlotly(pathwayListData.out) 
```     
```{r, message=FALSE,fig.width=9, fig.height=8  }  
   # This chunk sets a larger figure size (9 x 8) in its header for the plot below
   PGSEAplot() # pathway analysis using PGSEA 
```

## 9. Chromosome   
```{r, message=FALSE  } 
 input_selectContrast2 <- 'Hoxa1KN-control'	#select Comparison 
 #input_selectContrast2 = limma.out$comparisons[3] # manually set
 input_limmaPvalViz <- 0.1	#FDR to filter genes
 input_limmaFCViz <- 2	#Fold-change to filter genes (presumably; the generated comment repeated "FDR")
 genomePlotly() # shows fold-changes on the genome 
```

## 10. Biclustering   
```{r, message=FALSE  } 
 # Biclustering settings and run; the result is kept in biclustering.out
 input_nGenesBiclust <- 1000        # Top genes for biclustering
 input_biclustMethod <- 'BCCC()'    # Method: 'BCCC', 'QUBIC', 'runibic' ...
 biclustering.out <- biclustering()

 # Heatmap of one selected bicluster
 input_selectBicluster <- 1         # select a cluster
 biclustHeatmap()
```     
```{r, message=FALSE  } 
 input_selectGO4 <- 'GOBP'	#Gene set category 
 # Read pathway data again 
 GeneSets.out <-readGeneSets( geneSetFile,
    convertedData.out, input_selectGO4,input_selectOrg,
    c(input_minSetSize, input_maxSetSize)  )  
 results <- geneListBclustGO()  #Enrichment analysis for the selected bicluster (presumably; generated comment said k-Means)
 # format() turns the adjusted P values into strings with 3 significant digits for display
 results$adj.Pval <- format( results$adj.Pval,digits=3 )
 kable( results, row.names=FALSE) 
```

## 11. Co-expression network    
```{r, message=FALSE  } 
 # WGCNA settings and run; the result is kept in wgcna.out
 input_mySoftPower <- 5        # SoftPower to cutoff
 input_nGenesNetwork <- 1000   # Number of top genes
 input_minModuleSize <- 20     # Module size minimum
 wgcna.out <- wgcna()
```     
```{r, message=FALSE  }  
 # Takes no arguments (presumably reads the WGCNA result from the previous chunk — TODO confirm)
 softPower()  # soft power curve 
```     
```{r, message=FALSE  }  
  # Plot the modules, then keep the module list in listWGCNA.Modules.out
  modulePlot()
  listWGCNA.Modules.out <- listWGCNA.Modules()
 
```     
```{r, message=FALSE  } 
 input_selectGO5 <- 'GOBP'	#Gene set category 
 # Read pathway data again 
 GeneSets.out <-readGeneSets( geneSetFile,
    convertedData.out, input_selectGO5,input_selectOrg,
    c(input_minSetSize, input_maxSetSize)  ) 
 input_selectWGCNA.Module <- '1. turquoise (995 genes)'	#Select a module
 # The generated comments on the next two lines did not match the variable names;
 # the descriptions below are inferred from the names — TODO confirm.
 input_topGenesNetwork <- 10	#Number of top genes in the network (presumably)
 input_edgeThreshold <- 0.4	#Edge threshold (presumably)
 moduleNetwork()	# show network of top genes in selected module
 
```     
```{r, message=FALSE  } 
 input_removeRedudantSets <- TRUE	#Remove redundant gene sets 
 results <- networkModuleGO()  #Enrichment analysis of selected module
 # format() turns the adjusted P values into strings with 3 significant digits for display
 results$adj.Pval <- format( results$adj.Pval,digits=3 )
 kable( results, row.names=FALSE) 
```