########### *********************************************##########
 ####### Including ####################
## 1. Part One: line 14 to line 386

### a. Define different functions including normalization methods (Tc, Med, UQ, FQ, Med-pgQ2 and UQ-pgQ2)
### b. Define functions for TMM from edgeR and DESeq from DESeq2
### c. Define functions to calculate cv( coefficent of variation), AUC,sensitivity, specificity, PPV, the actual FDR.

## 2. Part two: Calulate RMSD and graph RMSD in boxplot (line 388 to line 447)

## 3. Part three: Simulation with 1.5 FC for 1500 DEGs with 15,000 genes (line 451 to line 506)

## 4. Part four: Examples to perform DESeq, TMM and  Med-pgQ2 normalization and DEGs analysis using MAQC2 dataset for 
#      with the aid of DESeq2 and edgeR packages ( line 510 to line 564) 


### Part one: Define function() ###############################
### Med.pgQ2 and UQ.pgQ2 Normalization methods

source("http://bioconductor.org/biocLite.R")
biocLite()

biocLite("BiocUpgrade")
biocLite("DESeq")
biocLite("BiocGenerics")
biocLite("Biobase")
biocLite("geneplotter")
biocLite("aroma.light")
biocLite("Rsamtools")
biocLite("leeBamViews")
biocLite("DESeq2")
biocLite("limma")
biocLite("edgeR")
biocLite("S4Vectors")

require("DESeq2")
library(MASS)
library(genefilter)
library(locfit)
library(lattice)
library(RColorBrewer)
library("colorspace") # causig error in DESeq2 without install
library("ggplot2") # error for plyr in DESeq2 without instal
library(limma)
library(edgeR)


## Functions #############################
#X is a matrix of data

### full quantile normalization (FQ)
fq<-function(X){
  r<-dim(X)[1]
  c<-dim(X)[2]
  X.n<-matrix(rep(0,r*c),r,c)
  Y<-X.n
  for(i in 1:c){
    Y[,i]<-sort(X[,i])
  }
  head(Y)
  m<-apply(Y,1,mean)
  X.1<-as.vector(0)
  for(j in 1:c){
    X.1[order(X[,j])]<-m
    X.n[,j]<-as.matrix(X.1,r,1)
  }
  head(X.n)
  #         [,1]     [,2]      [,3]
  #[1,] 3.614902 1.595378 0.8513045
  colnames(X.n)<-colnames(X)
  rownames(X.n)<-rownames(X)
  return(X.n)
}

##### upper quartile (UQ) and median (Med), and Total Counts (TC) normalization methods##############

uq<-function(X){
  
  #excluding zero counts in each sample
  UQ<-function(y){
    quantile(y, 0.75)
  }
  X<-X+0.1
  upperQ<-apply(X,2,UQ)
  f.uq<-upperQ/mean(upperQ)
  upq.res<-scale(X,center=FALSE,scale=f.uq) 
  return(upq.res)
}

## Med normalization
med<-function(X){
  
    MED<-function(y){
    median(y[y>0])
  }
  X<-X+0.1
  med<-apply(X,2, MED)
  f.med<-med/mean(med)
  med.res<-scale(X,center=FALSE, scale=f.med)
  return(med.res)
}

# TC  
tc<-function(X){
  X<-X+0.1
  tc<-apply(X,2,sum)
  f.tc<-tc/mean(tc)
  tc.res<-scale(X,center=FALSE,scale=f.tc)
  return(tc.res)
}

# per gene normalization by Median: pgQ2
# X: a matrix of data with the multiplication of factor (f) as:
# f=50, 100, 200, 500 or 1000

pgene1<-function(X, f ){
  m<-apply(X,1, median)
  si<-m/f # multiply f=100 per gene per sample
  X1<-scale(t(X),center=FALSE,scale=si)
  res<-t(X1)
  rownames(res)<-rownames(X)
  return(res)  
}


# bp is a function  to calculate coefficient of variation (cv) and return a vector of cv for MAQc data
bp<-function(x){
  (len<-dim(x)[1])
  uhr<-x[,1:2]
  hbr<-x[,3:4]
  m1<-apply(uhr,1,mean)
  m2<-apply(hbr,1,mean)
  sd1<-apply(uhr,1,sd)
  sd2<-apply(hbr,1,sd)
  cv1<-sd1/m1
  cv2<-sd2/m2
  cv<-c(cv1,cv2)
  length(cv)
   return(cv)  
}
names(dat1)
dat1.1<-dat1[,4:7]
head(dat1.1)
(len<-dim(dat1.1)[1])
cv<-bp(dat1.1)
gp<-rep(c("uhr","hbr"),each=len)
boxplot(log(cv+1,base=2)~gp,col=c("green","red"),ylim=c(0,12),main="RawCount",ylab="Coefficient of variation")


## bp1 is a function to make a boxplot for the  raw and  normalized MAQC data by different normalization methods
# X is matrix of normalized counts
bp1<-function(x){
  (len<-dim(x)[1])
  uhr1<-x[,1]
  uhr2<-x[,2]
  hbr1<-x[,3]
  hbr2<-x[,4]
  dat<-c(uhr1,uhr2,hbr1,hbr2)
  return (dat)
  }

gp<-rep(c("uhr1","uhr2","hbr1","hbr2"),each=len) 
dat1<-bp1(X)
boxplot(log(dat1,base=2)~gp,col=c("green","green","red","red"),ylim=c(-2,15),main="RawCount")

### A function of Trapezoid_area used for calculating AUC values ############

Trapezoid_area<-function(X1,X2,Y1,Y2){
  base<-X1-X2
  ht<-(Y1+Y2)/2
  return(base*ht)
}

### A function for calculating the  standard error (se) of AUC values
# A: AUC value
# na: number of true positive genes
# nn: number of  true negative genes
se<-function(A, na, nn){
  Q1<-A/(2-A)
  Q2<-2*A^2/(1+A)
  d1<-A*(1-A)+(na-1)*(Q1-A^2)+(nn-1)*(Q2-A^2)
  S<-sqrt(d1/(na*nn))
  return(S)  
}
A<-0.947
na<-390
nn<-151
se(A,na,nn)

### A function to perform DESeq normalization and DEGs analysis for MAQc data using DESeq2 package
#data: a matrix of data with two conditions 
# conds: conds<-c(rep("uhr",2),rep("hbr", 2))

deseq2<-function(data,conds){
  data_new<-data
  conds<-factor(conds)
  colData<-data.frame(condition=factor(conds))
  
  dds <- DESeqDataSetFromMatrix(countData = data_new,
                                colData = colData,
                                design = ~ condition)
  dds$condition<-relevel(dds$condition,"uhr")
  dds<-estimateSizeFactors(dds)
  dds<-estimateDispersions(dds,fitType="local")
  dds<-nbinomWaldTest(dds)
  norm.res<-counts(dds,normalized=TRUE)
  res<-results(dds) 
  colnames(norm.res)<-colnames(data_new)
  
  return(list(res=res,norm.res=norm.res))
}

### A function to perform TMM normalization and DEGs analysis for MAQc and simulated data using edgeR package
edge.R<-function(data,conds,pair){
  data_new<-data
  colnames(data_new)
  conds<-factor(conds)
  ### using TMM normalization#####################
  edgeR.dgelist<-DGEList(counts=as.matrix(data_new), group=conds)
  head(edgeR.dgelist$counts) # original count matrix
  edgeR.dgelist<-calcNormFactors(edgeR.dgelist, method="TMM")
  edgeR.dgelist<-estimateCommonDisp(edgeR.dgelist)
  edgeR.dgelist<-estimateTagwiseDisp(edgeR.dgelist,trend="movingave")
  head(edgeR.dgelist$samples)
  edgeR_s<-edgeR.dgelist$pseudo.counts # pseudo.counts or normalized counts
  edgeR.test.tgw<-exactTest(edgeR.dgelist, dispersion="tagwise", pair=pair)
  
  # store full topTagw results table
  (n<-nrow(edgeR.test.tgw$table)) 
  
  res<-topTags(edgeR.test.tgw,n,sort.by="none")$table
 
  return(list(norm=edgeR_s, res=res))
}
res<-edge.R(data,conds,pair)

### edgeR for detection of DEGs from the normalized data by other normalization methods (Tc, Med, UQ, FQ, Med-pgQ2 and UQ-pgQ2)
edgeR_noTMM<-function(data, conds, pair){
  data_new<-data
  colnames(data_new)
  #class<-conds<-c(rep("uhr",2),rep("hbr", 2))
  # conds<-factor(conds)
  ### using TMM normalization#####################
  edgeR.dgelist<-DGEList(counts=as.matrix(data_new), group=conds)
  head(edgeR.dgelist$counts) # original count matrix
  edgeR.dgelist<-calcNormFactors(edgeR.dgelist, method="none") # 
  
  edgeR.dgelist<-estimateCommonDisp(edgeR.dgelist)
  edgeR.dgelist<-estimateTagwiseDisp(edgeR.dgelist,trend="movingave")
  head(edgeR.dgelist$samples)
  norm<-edgeR.dgelist$pseudo.counts # without change the size of factor and pseudo.count is 
  # different from normalized counts
  edgeR.test.tgw<-exactTest(edgeR.dgelist, dispersion="tagwise", pair=pair) # B vs A, HBR vs. UHR
  (n<-nrow(edgeR.test.tgw$table)) 
  res1<-topTags(edgeR.test.tgw,n,sort.by="none")$table 
  return(res1)
}


### AUC calculation function ###########
## AUC function
########### ##########################
# sort padj

##################
# AUC calculation
TP.t<-390
TN.t=151
fdr<-"padj"
# for edgeR
fdr<-"FDR"
## for cuffdiff # line: 2498
Tp.t<-279
Tn.t<-114
fdr<-"q_value"

####auc1 is a fuction to calculate AUC value based on the DEGs results from edgeR package#########

# TP.t:total # of True positive genes
#TN.t:total # of True negative genes
# TP.n: list of TP genes
# TN.n: list of TN genes
# DEGs: DEGs identified using edgeR methods

auc1<-function(DEGs,TP.t,TN.t, TP.n, TN.n){
  X<-DEGs
  X.s<-X[order(X$PValue),]
  k1<-seq(100, dim(X.s)[1],by=200)
  k<-c(k1,dim(X)[1])
  tp.1<-y1<-as.vector(0)
  tn.1<-x1<-as.vector(0)
  FP<-TP<-0
  FP.p<-TP.p<-0
  A<-0
  #P<-306
  P<-TP.t
  #N<-151
  N<-TN.t
  (len.t<-TP.t)
  (len.t1<-TN.t)
  for(i in 1:length(k)){
    
    TP.1<-which(X.s$Gene.name[1:k[i]]%in%TP.n$new.name)
    TP<-length(TP.1)
    #y7[i]<-length(TP.7)
    TN.1<-which(X.s$Gene.name[1:k[i]]%in%TN.n$new.name)
    FP<-length(TN.1)
    #x7[i]<-length(TN.7)
    #tp.7[i]<-y7[i]/len.t
    A<-A+Trapezoid_area(FP,FP.p,TP,TP.p)
    FP.p<-FP
    TP.p<-TP
  }
  A1<-A+Trapezoid_area(N,FP.p,N,TP.p)
  A2<-A1/(P*N)
  return (A2)
}

####auc2 is a fuction to calculate AUC value based on the DEGs results from Cufflinks-Cuffdiff2 package#########

auc2<-function(DEGs,TP.t,TN.t, TP.n, TN.n){
  X<-DEGs
  X.s<-X[order(X$p_value),]
  k1<-seq(100, dim(X.s)[1],by=200)
  k<-c(k1,dim(X)[1])
  tp.1<-y1<-as.vector(0)
  tn.1<-x1<-as.vector(0)
  FP<-TP<-0
  FP.p<-TP.p<-0
  A<-0
  #P<-306
  P<-TP.t
  #N<-151
  N<-TN.t
  (len.t<-TP.t)
  (len.t1<-TN.t)
  for(i in 1:length(k)){
    TP.7<-which(X.s$gene[1:k[i]]%in%TP.n$new.name)
    TP<-length(TP.7)
    #y7[i]<-length(TP.7)
    TN.7<-which(X.s$gene[1:k[i]]%in%TN.n$new.name)
    FP<-length(TN.7)
    A<-A+Trapezoid_area(FP,FP.p,TP,TP.p)
    FP.p<-FP
    TP.p<-TP
  }
  A1<-A+Trapezoid_area(N,FP.p,N,TP.p)
  A2<-A1/(P*N)
  return (A2)
}

####auc3 is a fuction to calculate AUC value based on the DEGs results from DESeq2 package#########

auc3<-function(DEGs,TP.t,TN.t, TP.n, TN.n){
  X<-DEGs
  X.s<-X[order(X$pvalue),]
  k1<-seq(100, dim(X.s)[1],by=200)
  k<-c(k1,dim(X)[1])
  tp.1<-y1<-as.vector(0)
  tn.1<-x1<-as.vector(0)
  FP<-TP<-0
  FP.p<-TP.p<-0
  A<-0
  #P<-306
  P<-TP.t
  #N<-151
  N<-TN.t
  (len.t<-TP.t)
  (len.t1<-TN.t)
  for(i in 1:length(k)){
    
    TP.1<-which(X.s$Gene.name[1:k[i]]%in%TP.n$new.name)
    TP<-length(TP.1)
    #y7[i]<-length(TP.7)
    TN.1<-which(X.s$Gene.name[1:k[i]]%in%TN.n$new.name)
    FP<-length(TN.1)
    #x7[i]<-length(TN.7)
    #tp.7[i]<-y7[i]/len.t
    A<-A+Trapezoid_area(FP,FP.p,TP,TP.p)
    FP.p<-FP
    TP.p<-TP
  }
  A1<-A+Trapezoid_area(N,FP.p,N,TP.p)
  A2<-A1/(P*N)
  return (A2)
}

############################## *************** ##############################

########################### Part 2: Calculate RMSD and Boxplot ###################3
tag2<-read.csv("log2FC.10methods.edgeRtest_maqc_2rep_974genes_RMSD-6=19-2015.csv", header=TRUE)
names(tag2)
#[1] "FC.BtoA"            "Gene.org"           "new.name"           "AssociatedGeneName" "ensID"             
#[6] "EntrezID"           "des.FC"             "des.Gene.name"      "TMM.FC"             "TMM.Gene.name"     
#[11] "raw.FC"             "raw.Gene.name"      "tc.FC"              "tc.Gene.name"       "med.FC"            
#[16] "med.Gene.name"      "uq.FC"              "uq.Gene.name"       "fq.FC"              "fq.Gene.name"      
#[21] "med.pgQ2.FC"        "med.pgQ2.Gene.name" "uq.pgQ2.FC"         "uq.pgQ2.Gene.name"  "TagFC.BtoA"   
tagFC<-tag2$TagFC.BtoA

dat1<-tag2$des.FC
(cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.592

dat1<-tag2$TMM.FC
(cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.58837

dat1<-tag2$raw.FC
(cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) # 1.49066
cor(tagFC,dat1) #[1] [1] 0.9287311

dat1<-tag2$tc.FC
(cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.497269

dat1<-tag2$med.FC
(cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.502001

dat1<-tag2$uq.FC
(cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.496423

dat1<-tag2$fq.FC
(cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.542584

dat1<-tag2$med.pgQ2.FC
(cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.546629

dat1<-tag2$uq.pgQ2.FC
(cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.549645

## save this data as a file: RMSD-edgeRTest-7-2-2015.fig2.csv

# Fig2 RMSD correlations with TaqMan fold changes
# barplots
require(grDevices)

dat<-read.csv("RMSD-edgeRTest-7-2-2015.fig2.csv",header=TRUE)
dat1<-matrix(t(dat$RMSD),nrow=1)
dat1
colnames(dat1)<-dat$Methods

### Fig2.RMSD plotting on 1-14-2016 !!!!!!!!!!!!!!!

png(file="Fig2_RMSD_res300.good.1-14-2016.png",width=600,heigh=1200,res=300,pointsize=4)
par(cex.lab=1.2,cex=1.2)
barplot(dat$RMSD,col=rainbow(20),ylim=c(0,2),space=0.1,main="RMSD between MAQC RNA-seq and qRT-PCR",
        ylab="RMSD from qRT-PCR log2 fold changes")
axis(1,at=seq(0.7,9.5,length.out=9),labels=dat$Methods,cex=1.2,las=2)

dev.off()

########################################################################################



####################Part 3:  Simulation ##########################################

### simulation using poisson distribution from human breast cancer: normal and EN 
data<-read.csv("GSE47462_human-BreastCa_xiao_edit.csv", header=TRUE)

########## start simulation ########
idx<-sample(1:dim(data)[1],15000) ## simulate 15,000 genes from Bca data
g<-length(idx) #15000
data1<-data[idx,]
lam<-apply(data1[,2:50],1,mean) # estimate lambda 

# Simulation based on all 15000 genes
# 10% genes
(deg10<-g*0.1) #1500

# sample indx
i1<-sample(1:g,deg10)
i1.1<-i1[1:750]
i1.2<-i1[751:1500]

# simulation from sim 1 to sim 13  ########
 # i.e. for the 10th simulation 
##10% wih  1.5 fold change
lam10.1<-lam10.2<-lam
lam10.1[i1.1]<-lam[i1.1]*1.5  # in one condition  
lam10.2[i1.2]<-lam[i1.2]*1.5  # in the other condition


#N1<-abs(rnorm(20,1,1))
s0<-s1<-matrix(0,nrow=g,ncol=10) # 10 biological replicates in both conditions

for (i in 1:10){
  x<-abs(rnorm(1,1,1))
  s0[,i]<-rpois(g,lam10.1)*x
  s1[,i]<-rpois(g,lam10.2)*x 
}
s1.10<-cbind(s0,s1)
colnames(s1.10)<-c("A1","A2","A3","A4","A5","A6","A7","A8","A9","A10",
                   "B1","B2","B3","B4","B5","B6","B7","B8","B9","B10")
rownames(s1.10)<-data$genes[idx]
head(s1.10)
write.csv(s1.10,file="sim10.Bca.1500DEGs.1.5fc.rpois.csv",row.names=TRUE)

#status: the index for DEGs with 1.5FC in the 10th simulation 
# 1: indicator for DEGs
# 0: indicator for no changes between two conditions
sta.10<-rep(0,g) #number of asimulation genes
sta.10[i1]<-1
sta.10<-as.matrix(sta.10,ncol=1)
colnames(sta.10)<-"status"
rownames(sta.10)<-data$genes[idx]
head(sta.10)

write.csv(sta.10,file="sim10.status.Bca.1500DEGs.rpois.csv",row.names=TRUE)

################### end simulation  ############################

### ####### part4: examples to perform DESeq2, edgeR and med-pgQ2 normalization and DEGs analysis using MAQc2 ##################


# (1). Read MAQC2 data files ################################################

data<-read.csv("maqc2_raw.count_filteredBy2_36451genes.csv",header=TRUE)
names(data)
#[1] "X.1"           "X"             "ens.id"        "GSM597211.uhr" "SRX016367.UHR" "GSM597210.hbr" "SRX016359.HBR" "EntrezID"
#[9] "Gene.name"
dim(data) #36451 7
data1<-data[,2:5]
rownames(data1)<-data$ensID
names(data1)<-c("UHR.1","UHR.2","HBR.1", "HBR.2")
data1[1:3,]
#               UHR.1 UHR.2 HBR.1 HBR.2
#NSG00000000003   565   746    75    92
#ENSG00000000005     9    17     2     3
#ENSG00000000419   546   768   139   189
class<-conds<-c(rep("uhr",2),rep("hbr", 2))
conds<-factor(conds)

#### (2).  DESeq2 ###################################

library("DESeq2")  # version 1.6.3
res1<-deseq2(data1,conds)
norm.data<-res1$norm.res  # normalized data by DESeq
head(norm.data)
DEGs.data<-res1$res      # DEGs by DESeq2
head(DEGs.data)[1:2,]


# (3).  edgeR ###############################

library("edgeR")   # version 3.8.6
pair<-c("uhr","hbr")

res2<-edge.R(data1,conds,pair)
norm.data2<-res2$norm
head(norm.data2)
DEGs.2<-res2$res
head(DEGs.2)

###### (4). Med-pgQ2 ####################

library("edgeR")

## Med normalization

norm.med<-med(data1) # median normalization per sample
head(norm.med)[1:2,]
norm.MedpgQ2<-pgene1(norm.med, f=100)  # per gene normalization across samples with scaling factor 100
head(norm.MedpgQ2)[1:2,]
res3.DEGs<-edgeR_noTMM(norm.MedpgQ2, conds, pair) # DEGs using exact test from edgeR 
head(res3.DEGs)[1:2, ]

##### (5). UQ-pgQ2 ####################

library("edgeR")

## Med normalization

norm.uq<-uq(data1) # uq normalization per sample
head(norm.uq)[1:2,]
norm.UQpgQ2<-pgene1(norm.uq, f=100)  # per gene normalization across samples with scaling factor 100
head(norm.UQpgQ2)[1:2,]
res4.DEGs<-edgeR_noTMM(norm.UQpgQ2, conds, pair) # DEGs using exact test from edgeR 
head(res4.DEGs)[1:2, ]