########### *********************************************########## ####### Including #################### ## 1. Part One: line 14 to line 386 ### a. Define different functions including normalization methods (Tc, Med, UQ, FQ, Med-pgQ2 and UQ-pgQ2) ### b. Define functions for TMM from edgeR and DESeq from DESeq2 ### c. Define functions to calculate cv( coefficent of variation), AUC,sensitivity, specificity, PPV, the actual FDR. ## 2. Part two: Calulate RMSD and graph RMSD in boxplot (line 388 to line 447) ## 3. Part three: Simulation with 1.5 FC for 1500 DEGs with 15,000 genes (line 451 to line 506) ## 4. Part four: Examples to perform DESeq, TMM and Med-pgQ2 normalization and DEGs analysis using MAQC2 dataset for # with the aid of DESeq2 and edgeR packages ( line 510 to line 564) ### Part one: Define function() ############################### ### Med.pgQ2 and UQ.pgQ2 Normalization methods source("http://bioconductor.org/biocLite.R") biocLite() biocLite("BiocUpgrade") biocLite("DESeq") biocLite("BiocGenerics") biocLite("Biobase") biocLite("geneplotter") biocLite("aroma.light") biocLite("Rsamtools") biocLite("leeBamViews") biocLite("DESeq2") biocLite("limma") biocLite("edgeR") biocLite("S4Vectors") require("DESeq2") library(MASS) library(genefilter) library(locfit) library(lattice) library(RColorBrewer) library("colorspace") # causig error in DESeq2 without install library("ggplot2") # error for plyr in DESeq2 without instal library(limma) library(edgeR) ## Functions ############################# #X is a matrix of data ### full quantile normalization (FQ) fq<-function(X){ r<-dim(X)[1] c<-dim(X)[2] X.n<-matrix(rep(0,r*c),r,c) Y<-X.n for(i in 1:c){ Y[,i]<-sort(X[,i]) } head(Y) m<-apply(Y,1,mean) X.1<-as.vector(0) for(j in 1:c){ X.1[order(X[,j])]<-m X.n[,j]<-as.matrix(X.1,r,1) } head(X.n) # [,1] [,2] [,3] #[1,] 3.614902 1.595378 0.8513045 colnames(X.n)<-colnames(X) rownames(X.n)<-rownames(X) return(X.n) } ##### upper quartile (UQ) and median (Med), and Total Counts (TC) normalization methods############## uq<-function(X){ #excluding zero counts in each sample UQ<-function(y){ quantile(y, 0.75) } X<-X+0.1 upperQ<-apply(X,2,UQ) f.uq<-upperQ/mean(upperQ) upq.res<-scale(X,center=FALSE,scale=f.uq) return(upq.res) } ## Med normalization med<-function(X){ MED<-function(y){ median(y[y>0]) } X<-X+0.1 med<-apply(X,2, MED) f.med<-med/mean(med) med.res<-scale(X,center=FALSE, scale=f.med) return(med.res) } # TC tc<-function(X){ X<-X+0.1 tc<-apply(X,2,sum) f.tc<-tc/mean(tc) tc.res<-scale(X,center=FALSE,scale=f.tc) return(tc.res) } # per gene normalization by Median: pgQ2 # X: a matrix of data with the multiplication of factor (f) as: # f=50, 100, 200, 500 or 1000 pgene1<-function(X, f ){ m<-apply(X,1, median) si<-m/f # multiply f=100 per gene per sample X1<-scale(t(X),center=FALSE,scale=si) res<-t(X1) rownames(res)<-rownames(X) return(res) } # bp is a function to calculate coefficient of variation (cv) and return a vector of cv for MAQc data bp<-function(x){ (len<-dim(x)[1]) uhr<-x[,1:2] hbr<-x[,3:4] m1<-apply(uhr,1,mean) m2<-apply(hbr,1,mean) sd1<-apply(uhr,1,sd) sd2<-apply(hbr,1,sd) cv1<-sd1/m1 cv2<-sd2/m2 cv<-c(cv1,cv2) length(cv) return(cv) } names(dat1) dat1.1<-dat1[,4:7] head(dat1.1) (len<-dim(dat1.1)[1]) cv<-bp(dat1.1) gp<-rep(c("uhr","hbr"),each=len) boxplot(log(cv+1,base=2)~gp,col=c("green","red"),ylim=c(0,12),main="RawCount",ylab="Coefficient of variation") ## bp1 is a function to make a boxplot for the raw and normalized MAQC data by different normalization methods # X is matrix of normalized counts bp1<-function(x){ (len<-dim(x)[1]) uhr1<-x[,1] uhr2<-x[,2] hbr1<-x[,3] hbr2<-x[,4] dat<-c(uhr1,uhr2,hbr1,hbr2) return (dat) } gp<-rep(c("uhr1","uhr2","hbr1","hbr2"),each=len) dat1<-bp1(X) boxplot(log(dat1,base=2)~gp,col=c("green","green","red","red"),ylim=c(-2,15),main="RawCount") ### A function of Trapezoid_area used for calculating AUC values ############ Trapezoid_area<-function(X1,X2,Y1,Y2){ base<-X1-X2 ht<-(Y1+Y2)/2 return(base*ht) } ### A function for calculating the standard error (se) of AUC values # A: AUC value # na: number of true positive genes # nn: number of true negative genes se<-function(A, na, nn){ Q1<-A/(2-A) Q2<-2*A^2/(1+A) d1<-A*(1-A)+(na-1)*(Q1-A^2)+(nn-1)*(Q2-A^2) S<-sqrt(d1/(na*nn)) return(S) } A<-0.947 na<-390 nn<-151 se(A,na,nn) ### A function to perform DESeq normalization and DEGs analysis for MAQc data using DESeq2 package #data: a matrix of data with two conditions # conds: conds<-c(rep("uhr",2),rep("hbr", 2)) deseq2<-function(data,conds){ data_new<-data conds<-factor(conds) colData<-data.frame(condition=factor(conds)) dds <- DESeqDataSetFromMatrix(countData = data_new, colData = colData, design = ~ condition) dds$condition<-relevel(dds$condition,"uhr") dds<-estimateSizeFactors(dds) dds<-estimateDispersions(dds,fitType="local") dds<-nbinomWaldTest(dds) norm.res<-counts(dds,normalized=TRUE) res<-results(dds) colnames(norm.res)<-colnames(data_new) return(list(res=res,norm.res=norm.res)) } ### A function to perform TMM normalization and DEGs analysis for MAQc and simulated data using edgeR package edge.R<-function(data,conds,pair){ data_new<-data colnames(data_new) conds<-factor(conds) ### using TMM normalization##################### edgeR.dgelist<-DGEList(counts=as.matrix(data_new), group=conds) head(edgeR.dgelist$counts) # original count matrix edgeR.dgelist<-calcNormFactors(edgeR.dgelist, method="TMM") edgeR.dgelist<-estimateCommonDisp(edgeR.dgelist) edgeR.dgelist<-estimateTagwiseDisp(edgeR.dgelist,trend="movingave") head(edgeR.dgelist$samples) edgeR_s<-edgeR.dgelist$pseudo.counts # pseudo.counts or normalized counts edgeR.test.tgw<-exactTest(edgeR.dgelist, dispersion="tagwise", pair=pair) # store full topTagw results table (n<-nrow(edgeR.test.tgw$table)) res<-topTags(edgeR.test.tgw,n,sort.by="none")$table return(list(norm=edgeR_s, res=res)) } res<-edge.R(data,conds,pair) ### edgeR for detection of DEGs from the normalized data by other normalization methods (Tc, Med, UQ, FQ, Med-pgQ2 and UQ-pgQ2) edgeR_noTMM<-function(data, conds, pair){ data_new<-data colnames(data_new) #class<-conds<-c(rep("uhr",2),rep("hbr", 2)) # conds<-factor(conds) ### using TMM normalization##################### edgeR.dgelist<-DGEList(counts=as.matrix(data_new), group=conds) head(edgeR.dgelist$counts) # original count matrix edgeR.dgelist<-calcNormFactors(edgeR.dgelist, method="none") # edgeR.dgelist<-estimateCommonDisp(edgeR.dgelist) edgeR.dgelist<-estimateTagwiseDisp(edgeR.dgelist,trend="movingave") head(edgeR.dgelist$samples) norm<-edgeR.dgelist$pseudo.counts # without change the size of factor and pseudo.count is # different from normalized counts edgeR.test.tgw<-exactTest(edgeR.dgelist, dispersion="tagwise", pair=pair) # B vs A, HBR vs. UHR (n<-nrow(edgeR.test.tgw$table)) res1<-topTags(edgeR.test.tgw,n,sort.by="none")$table return(res1) } ### AUC calculation function ########### ## AUC function ########### ########################## # sort padj ################## # AUC calculation TP.t<-390 TN.t=151 fdr<-"padj" # for edgeR fdr<-"FDR" ## for cuffdiff # line: 2498 Tp.t<-279 Tn.t<-114 fdr<-"q_value" ####auc1 is a fuction to calculate AUC value based on the DEGs results from edgeR package######### # TP.t:total # of True positive genes #TN.t:total # of True negative genes # TP.n: list of TP genes # TN.n: list of TN genes # DEGs: DEGs identified using edgeR methods auc1<-function(DEGs,TP.t,TN.t, TP.n, TN.n){ X<-DEGs X.s<-X[order(X$PValue),] k1<-seq(100, dim(X.s)[1],by=200) k<-c(k1,dim(X)[1]) tp.1<-y1<-as.vector(0) tn.1<-x1<-as.vector(0) FP<-TP<-0 FP.p<-TP.p<-0 A<-0 #P<-306 P<-TP.t #N<-151 N<-TN.t (len.t<-TP.t) (len.t1<-TN.t) for(i in 1:length(k)){ TP.1<-which(X.s$Gene.name[1:k[i]]%in%TP.n$new.name) TP<-length(TP.1) #y7[i]<-length(TP.7) TN.1<-which(X.s$Gene.name[1:k[i]]%in%TN.n$new.name) FP<-length(TN.1) #x7[i]<-length(TN.7) #tp.7[i]<-y7[i]/len.t A<-A+Trapezoid_area(FP,FP.p,TP,TP.p) FP.p<-FP TP.p<-TP } A1<-A+Trapezoid_area(N,FP.p,N,TP.p) A2<-A1/(P*N) return (A2) } ####auc2 is a fuction to calculate AUC value based on the DEGs results from Cufflinks-Cuffdiff2 package######### auc2<-function(DEGs,TP.t,TN.t, TP.n, TN.n){ X<-DEGs X.s<-X[order(X$p_value),] k1<-seq(100, dim(X.s)[1],by=200) k<-c(k1,dim(X)[1]) tp.1<-y1<-as.vector(0) tn.1<-x1<-as.vector(0) FP<-TP<-0 FP.p<-TP.p<-0 A<-0 #P<-306 P<-TP.t #N<-151 N<-TN.t (len.t<-TP.t) (len.t1<-TN.t) for(i in 1:length(k)){ TP.7<-which(X.s$gene[1:k[i]]%in%TP.n$new.name) TP<-length(TP.7) #y7[i]<-length(TP.7) TN.7<-which(X.s$gene[1:k[i]]%in%TN.n$new.name) FP<-length(TN.7) A<-A+Trapezoid_area(FP,FP.p,TP,TP.p) FP.p<-FP TP.p<-TP } A1<-A+Trapezoid_area(N,FP.p,N,TP.p) A2<-A1/(P*N) return (A2) } ####auc3 is a fuction to calculate AUC value based on the DEGs results from DESeq2 package######### auc3<-function(DEGs,TP.t,TN.t, TP.n, TN.n){ X<-DEGs X.s<-X[order(X$pvalue),] k1<-seq(100, dim(X.s)[1],by=200) k<-c(k1,dim(X)[1]) tp.1<-y1<-as.vector(0) tn.1<-x1<-as.vector(0) FP<-TP<-0 FP.p<-TP.p<-0 A<-0 #P<-306 P<-TP.t #N<-151 N<-TN.t (len.t<-TP.t) (len.t1<-TN.t) for(i in 1:length(k)){ TP.1<-which(X.s$Gene.name[1:k[i]]%in%TP.n$new.name) TP<-length(TP.1) #y7[i]<-length(TP.7) TN.1<-which(X.s$Gene.name[1:k[i]]%in%TN.n$new.name) FP<-length(TN.1) #x7[i]<-length(TN.7) #tp.7[i]<-y7[i]/len.t A<-A+Trapezoid_area(FP,FP.p,TP,TP.p) FP.p<-FP TP.p<-TP } A1<-A+Trapezoid_area(N,FP.p,N,TP.p) A2<-A1/(P*N) return (A2) } ############################## *************** ############################## ########################### Part 2: Calculate RMSD and Boxplot ###################3 tag2<-read.csv("log2FC.10methods.edgeRtest_maqc_2rep_974genes_RMSD-6=19-2015.csv", header=TRUE) names(tag2) #[1] "FC.BtoA" "Gene.org" "new.name" "AssociatedGeneName" "ensID" #[6] "EntrezID" "des.FC" "des.Gene.name" "TMM.FC" "TMM.Gene.name" #[11] "raw.FC" "raw.Gene.name" "tc.FC" "tc.Gene.name" "med.FC" #[16] "med.Gene.name" "uq.FC" "uq.Gene.name" "fq.FC" "fq.Gene.name" #[21] "med.pgQ2.FC" "med.pgQ2.Gene.name" "uq.pgQ2.FC" "uq.pgQ2.Gene.name" "TagFC.BtoA" tagFC<-tag2$TagFC.BtoA dat1<-tag2$des.FC (cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.592 dat1<-tag2$TMM.FC (cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.58837 dat1<-tag2$raw.FC (cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) # 1.49066 cor(tagFC,dat1) #[1] [1] 0.9287311 dat1<-tag2$tc.FC (cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.497269 dat1<-tag2$med.FC (cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.502001 dat1<-tag2$uq.FC (cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.496423 dat1<-tag2$fq.FC (cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.542584 dat1<-tag2$med.pgQ2.FC (cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.546629 dat1<-tag2$uq.pgQ2.FC (cor1<-sqrt(sum((tagFC-dat1)^2)/length(tagFC))) #1.549645 ## save this data as a file: RMSD-edgeRTest-7-2-2015.fig2.csv # Fig2 RMSD correlations with TaqMan fold changes # barplots require(grDevices) dat<-read.csv("RMSD-edgeRTest-7-2-2015.fig2.csv",header=TRUE) dat1<-matrix(t(dat$RMSD),nrow=1) dat1 colnames(dat1)<-dat$Methods ### Fig2.RMSD plotting on 1-14-2016 !!!!!!!!!!!!!!! png(file="Fig2_RMSD_res300.good.1-14-2016.png",width=600,heigh=1200,res=300,pointsize=4) par(cex.lab=1.2,cex=1.2) barplot(dat$RMSD,col=rainbow(20),ylim=c(0,2),space=0.1,main="RMSD between MAQC RNA-seq and qRT-PCR", ylab="RMSD from qRT-PCR log2 fold changes") axis(1,at=seq(0.7,9.5,length.out=9),labels=dat$Methods,cex=1.2,las=2) dev.off() ######################################################################################## ####################Part 3: Simulation ########################################## ### simulation using poisson distribution from human breast cancer: normal and EN data<-read.csv("GSE47462_human-BreastCa_xiao_edit.csv", header=TRUE) ########## start simulation ######## idx<-sample(1:dim(data)[1],15000) ## simulate 15,000 genes from Bca data g<-length(idx) #15000 data1<-data[idx,] lam<-apply(data1[,2:50],1,mean) # estimate lambda # Simulation based on all 15000 genes # 10% genes (deg10<-g*0.1) #1500 # sample indx i1<-sample(1:g,deg10) i1.1<-i1[1:750] i1.2<-i1[751:1500] # simulation from sim 1 to sim 13 ######## # i.e. for the 10th simulation ##10% wih 1.5 fold change lam10.1<-lam10.2<-lam lam10.1[i1.1]<-lam[i1.1]*1.5 # in one condition lam10.2[i1.2]<-lam[i1.2]*1.5 # in the other condition #N1<-abs(rnorm(20,1,1)) s0<-s1<-matrix(0,nrow=g,ncol=10) # 10 biological replicates in both conditions for (i in 1:10){ x<-abs(rnorm(1,1,1)) s0[,i]<-rpois(g,lam10.1)*x s1[,i]<-rpois(g,lam10.2)*x } s1.10<-cbind(s0,s1) colnames(s1.10)<-c("A1","A2","A3","A4","A5","A6","A7","A8","A9","A10", "B1","B2","B3","B4","B5","B6","B7","B8","B9","B10") rownames(s1.10)<-data$genes[idx] head(s1.10) write.csv(s1.10,file="sim10.Bca.1500DEGs.1.5fc.rpois.csv",row.names=TRUE) #status: the index for DEGs with 1.5FC in the 10th simulation # 1: indicator for DEGs # 0: indicator for no changes between two conditions sta.10<-rep(0,g) #number of asimulation genes sta.10[i1]<-1 sta.10<-as.matrix(sta.10,ncol=1) colnames(sta.10)<-"status" rownames(sta.10)<-data$genes[idx] head(sta.10) write.csv(sta.10,file="sim10.status.Bca.1500DEGs.rpois.csv",row.names=TRUE) ################### end simulation ############################ ### ####### part4: examples to perform DESeq2, edgeR and med-pgQ2 normalization and DEGs analysis using MAQc2 ################## # (1). Read MAQC2 data files ################################################ data<-read.csv("maqc2_raw.count_filteredBy2_36451genes.csv",header=TRUE) names(data) #[1] "X.1" "X" "ens.id" "GSM597211.uhr" "SRX016367.UHR" "GSM597210.hbr" "SRX016359.HBR" "EntrezID" #[9] "Gene.name" dim(data) #36451 7 data1<-data[,2:5] rownames(data1)<-data$ensID names(data1)<-c("UHR.1","UHR.2","HBR.1", "HBR.2") data1[1:3,] # UHR.1 UHR.2 HBR.1 HBR.2 #NSG00000000003 565 746 75 92 #ENSG00000000005 9 17 2 3 #ENSG00000000419 546 768 139 189 class<-conds<-c(rep("uhr",2),rep("hbr", 2)) conds<-factor(conds) #### (2). DESeq2 ################################### library("DESeq2") # version 1.6.3 res1<-deseq2(data1,conds) norm.data<-res1$norm.res # normalized data by DESeq head(norm.data) DEGs.data<-res1$res # DEGs by DESeq2 head(DEGs.data)[1:2,] # (3). edgeR ############################### library("edgeR") # version 3.8.6 pair<-c("uhr","hbr") res2<-edge.R(data1,conds,pair) norm.data2<-res2$norm head(norm.data2) DEGs.2<-res2$res head(DEGs.2) ###### (4). Med-pgQ2 #################### library("edgeR") ## Med normalization norm.med<-med(data1) # median normalization per sample head(norm.med)[1:2,] norm.MedpgQ2<-pgene1(norm.med, f=100) # per gene normalization across samples with scaling factor 100 head(norm.MedpgQ2)[1:2,] res3.DEGs<-edgeR_noTMM(norm.MedpgQ2, conds, pair) # DEGs using exact test from edgeR head(res3.DEGs)[1:2, ] ##### (5). UQ-pgQ2 #################### library("edgeR") ## Med normalization norm.uq<-uq(data1) # uq normalization per sample head(norm.uq)[1:2,] norm.UQpgQ2<-pgene1(norm.uq, f=100) # per gene normalization across samples with scaling factor 100 head(norm.UQpgQ2)[1:2,] res4.DEGs<-edgeR_noTMM(norm.UQpgQ2, conds, pair) # DEGs using exact test from edgeR head(res4.DEGs)[1:2, ]