# Note: The following code is set up assuming all supplementary tables are in .csv format and in the same directory along with MethodFunctions.R # Fill in the below variable with the path to the directory with the Supp. Data PathToData<-"path/to/SuppTables/" # Fill in the below variable with the path to an output directory for all plots PathToOutput<-"path/to/output/plots/" # Required Packages require(randomForest) # Required for certain plots require(lattice) require(latticeExtra) require(hexbin) require(Vennerable) # Source the supplemental source code file source(paste(PathToData,"Supplemental_R_Source_Code02.R",sep="")) # Required Variables: # AllUniqueProts - a vector of uniquely IDed proteins AllUniqueProts<-InputCSV(file=paste(PathToData,"Supplemental_Table07.csv",sep = ""),OutType="uvec") # AllRNAAbund - a named vector of average RNA abundance (log(2)) AllRNAAbund<-InputCSV(file=paste(PathToData,"Supplemental_Table08.csv",sep = ""),OutType="nvec") # AllProtAbund - a named vector of average Protein Abundance (log(2)) AllProtAbund<-InputCSV(file=paste(PathToData,"Supplemental_Table06.csv",sep = ""),OutType="nvec") # MethMatrix - a matrix (gene X Feature) of methylation scores MethScoreMatrix5Bins<-InputCSV(file=paste(PathToData,"Supplemental_Table05.csv",sep = ""),OutType="matrix") MethScoreMatrixNonBin<-InputCSV(file=paste(PathToData,"Supplemental_Table04.csv",sep = ""),OutType="matrix") MethScoreMatrix5BinsHistones<-InputCSV(file=paste(PathToData,"Supplemental_Table17.csv",sep = ""),OutType="matrix") # Use the Binned Methylation Data MethMatrix<-MethScoreMatrix5Bins # Un-comment the line below to use the non-binned methylation data # MethMatrix<-MethScoreMatrixNonBin # Un-comment the line below to use the binned Histone modification data alone # MethMatrix<-MethScoreMatrix5BinsHistones # Un-comment the line below to used a combination of DNA methylation and Histone Data # MethMatrix<-cbind(MethScoreMatrix5Bins,MethScoreMatrix5BinsHistones) # FilteredGenes - a vector of gene names in the filtered set FilteredGenes<-InputCSV(file=paste(PathToData,"Supplemental_Table09.csv",sep = ""),OutType="uvec") FilteredGenesV4<-InputCSV(file=paste(PathToData,"Supplemental_Table02.csv",sep = ""),OutType="uvec") # BiotypeLst<- list of 3 biotypes ("protein_coding","pseudogene","transposable_element") BiotypeLst<-InputCSV(file=paste(PathToData,"Supplemental_Table10.csv",sep = ""),OutType="list") # AllTEGenes - a vector of all gene models to exclude from the "Without TE analysis" AllTEGenes<-InputCSV(file=paste(PathToData,"Supplemental_Table12.csv",sep = ""),OutType="uvec") # AllGenesIntronNum - a vector of with genes as names (for all genes in the genome) indicating the number of introns in each gene model AllGenesIntronNum<-InputCSV(file=paste(PathToData,"Supplemental_Table11.csv",sep = ""),OutType="nvec") # Identify a vector of uniquely IDed proteins that also have methylation data AllGenes<-rownames(MethMatrix) UniquePr<-intersect(AllUniqueProts,AllGenes) # Construct a vector of RNA Abundance for all genes with methylation data # Give non-detected genes a non-zero number which can be log-transformed AllRNAAbund<-AllRNAAbund[intersect(names(AllRNAAbund),AllGenes)] MinRNA<-floor(min(AllRNAAbund)) MinRNA<-MinRNA-1 RNAAbund<-rep(MinRNA,length(AllGenes)) names(RNAAbund)<-AllGenes RNAAbund[names(AllRNAAbund)]<-AllRNAAbund # Compile a list of 3 RNA abundance classes RNAClassList<-list() RNAClassList[["High"]]<-names(RNAAbund)[which(RNAAbund>0)] RNAClassList[["Low"]]<-names(RNAAbund)[which(RNAAbund<=0 & RNAAbund>MinRNA)] RNAClassList[["NonDetected"]]<-names(RNAAbund)[which(RNAAbund==MinRNA)] # Compile a list of refined RNA classes by retaining only genes with observed proteins ProtClassList<-lapply(RNAClassList,function(x) intersect(x,UniquePr)) # Build a list of Class Vectors for modeling yVecList_All<-list() yVecList_All[["EPC"]]<-BuildYvec(posvec = ProtClassList[["High"]],negvec = setdiff(RNAClassList[["NonDetected"]],ProtClassList[["NonDetected"]])) yVecList_All[["ERC"]]<-BuildYvec(posvec = RNAClassList[["High"]],negvec = RNAClassList[["NonDetected"]]) yVecList_All[["PFI"]]<-BuildYvec(posvec = ProtClassList[["High"]],negvec = setdiff(RNAClassList[["High"]],ProtClassList[["High"]])) # Un-comment this line to run the analysis without any TEs or TE-containing genes #yVecList_All<-lapply(yVecList_All,function(x) x[setdiff(names(x),AllTEGenes)]) # Build a vector of Log2(protein abundance) for all genes. Non-detected genes get a value of (-1) ProtAbund<-rep(-1,length(AllGenes)) names(ProtAbund)<-AllGenes ProtInds<-intersect(UniquePr, names(AllProtAbund)) ProtAbund[ProtInds]<-AllProtAbund[ProtInds] # Build a list of continuous class vectors for predictive models yVecList_All_Quant<-list() yVecList_All_Quant[["PEP"]]<-ProtAbund[names(yVecList_All[["EPC"]])] yVecList_All_Quant[["REP"]]<-RNAAbund[names(yVecList_All[["ERC"]])] # Build a list of Training Matricies using the DNA Methylation Features xMatList_All<-lapply(yVecList_All,function(x) MethMatrix[names(x),]) # Do a simple data imputation by replacing NAs with 0.5 (a value indicating an ambiguous methylation signal) xMatList_All<-lapply(xMatList_All,function(x) apply(x,2,function(y) SubNAs(y))) # Build Random Forest models # Classification Models RF_List_All<-lapply(names(yVecList_All), function(x) randomForest(x = xMatList_All[[x]], y = as.factor(yVecList_All[[x]]), ntree = 1000, importance = T, keep.forest = T)) names(RF_List_All)<-names(yVecList_All) # Quantitative Predictions xMatList_All_Quant<-xMatList_All[1:2] names(xMatList_All_Quant)<-names(yVecList_All_Quant) RF_List_All_Quant<-lapply(names(yVecList_All_Quant), function(x) randomForest(x = xMatList_All_Quant[[x]], y = yVecList_All_Quant[[x]], ntree = 1000, importance = T, keep.forest = T)) names(RF_List_All_Quant)<-names(yVecList_All_Quant) # Pull out RF Importance Measures to use in regression analysis RF_List_All_Imp<-sapply(RF_List_All,function(x) x$importance[,3]) # Determine Signs of Feature Relationships based on the sign of the t.test statistic PosFeats<-lapply(yVecList_All,function(x) lapply(colnames(MethMatrix),function(c) MethMatrix[names(x)[which(x==T)],c])) NegFeats<-lapply(yVecList_All,function(x) lapply(colnames(MethMatrix),function(c) MethMatrix[names(x)[which(x==F)],c])) for(f in 1:length(PosFeats)) { names(PosFeats[[f]])<-colnames(MethMatrix) names(NegFeats[[f]])<-colnames(MethMatrix) } FeatureTtests<-lapply(1:3,function(x) lapply(1:length(PosFeats[[x]]), function(y) t.test(PosFeats[[x]][[y]],NegFeats[[x]][[y]]))) FeaturePvals<-sapply(FeatureTtests,function(x) sapply(x,function(y) y$p.value)) FeatureStat<-sapply(FeatureTtests,function(x) sapply(x,function(y) y$statistic)) colnames(FeaturePvals)<-names(PosFeats) row.names(FeaturePvals)<-colnames(MethMatrix) colnames(FeatureStat)<-names(PosFeats) row.names(FeatureStat)<-colnames(MethMatrix) # Use the Classification Model to classify the rest of the genes not used in the training set MethMatrix<-apply(MethMatrix,2,function(x) SubNAs(x)) r1<-lapply(names(RF_List_All),function(x) predict(RF_List_All[[x]],newdata=MethMatrix[setdiff(row.names(MethMatrix),names(yVecList_All[[x]])),],type="vote")[,"TRUE"]) names(r1)<-names(RF_List_All) # For genes in the training set, use the Out-Of-Bag cross-validated classifications RF_List_Predictions<-lapply(names(RF_List_All),function(x) c(r1[[x]],RF_List_All[[x]]$votes[,"TRUE"])) # Put all classification vectors in the same order RF_List_Predictions<-lapply(RF_List_Predictions,function(x) x[AllGenes]) names(RF_List_Predictions)<-names(RF_List_All) # Organize vectors of operative ("positive") gene classifications into a list along with the pre-defined Filtered Gene Set (FGS) RF_Sets<-list() RF_Sets[["EPC"]]<-RF_List_Predictions[["EPC"]] RF_Sets[["ERC"]]<-RF_List_Predictions[["ERC"]] RF_Sets<-lapply(RF_Sets,function(x) names(x)[which(x>0.5)]) RF_Sets[["Filtered Set V2"]]<-intersect(FilteredGenes,AllGenes) RF_Sets[["Filtered Set V4"]]<-intersect(FilteredGenesV4,AllGenes) RF_Silenced<-lapply(RF_Sets,function(x) setdiff(AllGenes,x)) # Determine Gene Body Methylation (gbM) genes GBMgenes<-ReturnGBMGenes(MethMatrix[,grep("CpG_Gene",colnames(MethMatrix))]) # Figure Plotting # ------ # Set up Color pallete BlueYellow<-c(rgb(0,0,0.8),rgb(0.8,0.8,1),rgb(1,1,0.8),rgb(0.8,0.8,0)) colPal<-colorRampPalette(BlueYellow) tcols16<-colPal(16) # These 6 colors are used through out tcols<-c("grey30","grey80",tcols16[c(12,5,16,1)]) # ------ # ------ # Summarized Feature Importance bar plots # Supp04_E and Supp 01 ****** # ****** For Supp 01, To re-create this plot the above models must be re-constructed with non-binned data including the UTRs and TSS (un-commnet line 35 "MethMatrix<-MethScoreMatrixNonBin") ****** # for EPC a1<-do.call(rbind,lapply(row.names(RF_List_All_Imp),function(x) SplitMethName(x,RF_List_All_Imp[x,1]))) # for ERC a2<-do.call(rbind,lapply(row.names(RF_List_All_Imp),function(x) SplitMethName(x,RF_List_All_Imp[x,2]))) # for PFI a3<-do.call(rbind,lapply(row.names(RF_List_All_Imp),function(x) SplitMethName(x,RF_List_All_Imp[x,3]))) nc<-ncol(a1)-1 tmn<-"" b1<-apply(a1[,1:nc],2,function(x) tapply(a1[,(nc+1)],x,function(y) sum(y))) b2<-apply(a2[,1:nc],2,function(x) tapply(a2[,(nc+1)],x,function(y) sum(y))) b3<-apply(a3[,1:nc],2,function(x) tapply(a3[,(nc+1)],x,function(y) sum(y))) names(b1)<-NULL names(b2)<-NULL names(b3)<-NULL b1<-c(unlist(sapply(b1,function(x) c(0,x))),0) b2<-c(unlist(sapply(b2,function(x) c(0,x))),0) b3<-c(unlist(sapply(b3,function(x) c(0,x))),0) b<-list(b1,b2,b3) bcols<-rep(c(tcols[5:6],"red"),times=sapply(b,length)) b<-unlist(lapply(b,function(x) (x/max(abs(x))))) names(b)<-gsub("CpG","CG",names(b)) # Supp 04_E pdf(file=paste(PathToOutput,"Supp04_E.pdf",sep=""),width = 10,height = 6) par(mar=c(5,4,2,2),font.axis=2, cex.axis=1.2, cex.lab=1.2, xpd=T) barplot(b,las=2,col=bcols,ylab="Normalized Sum of Unsigned Feature Importance",main=tmn, cex.names = 1.5) legend("topright",legend = colnames(RF_List_All_Imp),pch = 15, pt.cex = 3, text.font = 2, cex=1.1,col = rep(c(tcols[5:6],"red")),bty="n") dev.off() # Supp 01 bcols<-rep(c(tcols[5:6]),each=16) pdf(file=paste(PathToOutput,"Supp01.pdf",sep=""),width = 8,height = 6) par(mar=c(6,4,2,2),font.axis=2, cex.axis=1.2, cex.lab=1.2, xpd=T) btmp<-b[c(1:5,11:15,6,9,8,10,7,16,17:21,27:31,22,25,24,26,23,32)] xbars<-barplot(btmp,names.arg = "",col=bcols,ylab="Normalized Sum of Unsigned Feature Importance",main=tmn) text(x = xbars, par("usr")[3]-(0.02), srt = 45, adj= 1, xpd = TRUE, labels = names(btmp), cex=1.1, font=2) legend("topright",legend = colnames(RF_List_All_Imp)[1:2],pch = 15, pt.cex = 3, text.font = 2, cex=1.1,col = tcols[5:6],bty="n") dev.off() # ------ # ------ # Boxplots for Methylation levels of Gene Bins 1-5 Grouped by Class tnm<-colnames(MethScoreMatrix5Bins) tnm<-gsub("\\_Bin[[:digit:]]{1}","",tnm) BinList<-tapply(1:ncol(MethScoreMatrix5Bins),tnm,function(x) MethScoreMatrix5Bins[,x]) for(n in 1:length(BinList)) colnames(BinList[[n]])<-c("Bin1","Bin2","Bin3","Bin4","Bin5") r<-RNAClassList[["High"]] p<-ProtClassList[["High"]] d<-setdiff(r,p) u<-setdiff(AllGenes,union(r,p)) SetLst<-list("Observed Protein"=p,"High mRNA / No Protein"=d,"High mRNA"=r,"No Protein / No mRNA"=u) # Supp 02 # For CpG Methylation ConFeat<-c("Whole Gene CG"="CpG_Gene", "Exon CG"="CpG_Exon", "Intron CG"="CpG_Intron") pdf(file=paste(PathToOutput,"Supp02.pdf",sep=""),width = 6.5,height = 11) par(mfrow=c(6,2),mar=c(3,3,4,1)) for(i in 1:length(ConFeat)) t<-lapply(names(SetLst),function(x) boxplot(BinList[[ConFeat[i]]][SetLst[[x]],],main=paste(x,names(ConFeat)[i]),notch = T,ylim=c(0,1))) dev.off() # Supp 03 # For CHG Methylation ConFeat<-c("Whole Gene CHG"="CHG_Gene", "Exon CHG"="CHG_Exon", "Intron CHG"="CHG_Intron") pdf(file=paste(PathToOutput,"Supp03.pdf",sep=""),width = 6.5,height = 11) par(mfrow=c(6,2),mar=c(3,3,4,1)) for(i in 1:length(ConFeat)) t<-lapply(names(SetLst),function(x) boxplot(BinList[[ConFeat[i]]][SetLst[[x]],],main=paste(x,names(ConFeat)[i]),notch = T,ylim=c(0,1))) dev.off() # ------ # ------ # Distribution and classification of genes based on RNA abundance # Fig 01_B hlst<-list("No mRNA / Observed Protein (NR/OP)"= ProtClassList[["NonDetected"]],"No mRNA / No Protein (NR/NP)"= setdiff(RNAClassList[["NonDetected"]],UniquePr),"Low mRNA / Observed Protein (LR/OP)"=ProtClassList[["Low"]],"Low mRNA / No Protein (LR/NP)"=setdiff(RNAClassList[["Low"]],UniquePr),"High mRNA / Observed Protein (HR/OP)"=ProtClassList[["High"]],"High mRNA / No Protein (HR/NP)"=setdiff(RNAClassList[["High"]],UniquePr)) hlst<-lapply(hlst,function(x) RNAAbund[x]) pdf(file=paste(PathToOutput,"Fig01_B.pdf",sep=""),width = 10,height = 6) PlotMultiHist(HistLst = hlst,brks = seq(-12,15,0.5),xlb = "mRNA Abundance log2(FPKM)", ylb="Gene Counts", cols = tcols,LegBoxSz = 3, ledgx = 42, ledgy = 5200) dev.off() # ------ # ------ # legend for models # Fig 01_C pdf(file=paste(PathToOutput,"Fig01_C.pdf",sep=""),width = 15,height = 4) par(mar=c(0,0,0,0)) BlankPlot(yrng = c(0,4),xrng = c(0,15)) xl<-c(3,10,3,6,10,13,3,10) yb<-c(2.75,2.75,1.5,1.5,1.5,1.5,0.25,0.25) xr<-xl+2 yt<-yb+1 tmpCols<-tcols[c(5,2,6,5,2,1,5,6)] rect(xleft = xl,ybottom = yb,xright = xr,ytop = yt,col = tmpCols) text(x = c(8.5,8.5,8.5),y = c(0.75,2,3.25),pos = 4,labels = c("vs."), font = 2, cex = 3) text(x = c(5.25,12.25), y = c(2,2), labels = c("+","+"),pos = 4, cex =3, font = 2) text(x = c(0.5,0.5,0.5),y=c(0.75,2,3.25),labels = names(yVecList_All)[length(yVecList_All):1],pos = 4, cex=3.5, font = 2) dev.off() # ------ # ------ # ROC and PR curves for classifier # Fig 02_A , Supp 04_A , Supp 07_F , Supp 07_G ******(Also used for Supp 09_A, 09_B)****** # ***** For Supp 09_A and Supp 09_B, the above analysis must be run after all TE-containing gene models are removed (un-comment line 90 "yVecList_All<-lapply(yVecList_All,function(x) x[setdiff(names(x),AllTEGenes)])") ****** # Construct vectors for simple prediction based on Filtered Gene Set FGSobs<-list("EPC"=RF_Sets[["EPC"]],"ERC"=RF_Sets[["ERC"]],"FGSv2_HR_OP"=RF_Sets[["Filtered Set V2"]],"FGSv2_HR"=RF_Sets[["Filtered Set V2"]],"FGSv4_HR_OP"=RF_Sets[["Filtered Set V4"]],"FGSv4_HR"=RF_Sets[["Filtered Set V4"]]) TrSets<-list("EPC"=yVecList_All[["EPC"]],"ERC"=yVecList_All[["ERC"]],"FGSv2_HR_OP"=yVecList_All[["EPC"]],"FGSv2_HR"=yVecList_All[["ERC"]],"FGSv4_HR_OP"=yVecList_All[["EPC"]],"FGSv4_HR"=yVecList_All[["ERC"]]) FGSpreds<-lapply(1:length(FGSobs), function(x) BuildYvec(posvec = intersect(FGSobs[[x]],names(TrSets[[x]])), negvec = setdiff(names(TrSets[[x]]),FGSobs[[x]]),MakeQuant = T)) names(FGSpreds)<-names(FGSobs) CurvePreds<-c(lapply(RF_List_All, function(x) x$votes[,"TRUE"]),FGSpreds) CurveLabs<-c(yVecList_All,TrSets) # ROC and PR Curves for OOB and Cross validated results of all predictive models PlotROCPR(predlist = CurvePreds[1:3], lablist = CurveLabs[1:3], dir = PathToOutput, plotnames = c("Fig02_A","Supp04_A"), main = "",col=c(tcols[5:6],"red"),ltys = c(1,1,1),lcex=0.8,ledgX = 0.45,ledgY = 0.25) # ROC and PR Curves comparing stardard cutoffs for EPC and ERC with V2 and V4 FGSs PlotROCPR(predlist = CurvePreds[4:9], lablist = CurveLabs[4:9], dir = PathToOutput, plotnames = c("Supp07_F","Supp07_G"), main = "",col=c(tcols[5:6],tcols[5:6],tcols[5:6]),ltys = c(1,1,2,2,3,3),lcex=0.75,ledgX = 0.35,ledgY = 0.36) # Supp 09_A and Supp 09_B #PlotROCPR(predlist = CurvePreds[1:3], lablist = CurveLabs[1:3], dir = PathToOutput, plotnames = c("Supp09_A","Supp09_B"), main = "",col=c(tcols[5:6],"red"),ltys = c(1,1,1),lcex=0.8,ledgX = 0.45,ledgY = 0.25) # ------ # ------ # Quantitative Prediction Results # for Protein Model require(lattice) require(latticeExtra) require(hexbin) tmpy<-yVecList_All_Quant[["PEP"]] tmpx<-RF_List_All_Quant[["PEP"]]$predicted tmpMn<-"" # Supp 04_D pdf(file=paste(PathToOutput,"Supp04_D.pdf",sep=""),width = 5,height = 5) p<-hexbinplot(tmpy ~ tmpx,colramp=colPal,xbins=30,xlab="Predicted Log2(Normalized Spectral Counts)",ylab="Observed Log2(Normalized Spectral Counts)",main=tmpMn, aspect=1) p+layer(panel.text(p$x.limits[2]*0.4, p$y.limits[2]*0.95, paste("R^2 =",round(cor(tmpx,tmpy)^2,3)), col="grey50", cex = 1.2, font = 2) ) dev.off() # Supp 04_B keepInds<-which(tmpy>min(tmpy)) pdf(file=paste(PathToOutput,"Supp04_B.pdf",sep=""),width = 5,height = 5) p<-hexbinplot(tmpy[keepInds] ~ tmpx[keepInds],colramp=colPal,xbins=30,xlab="Predicted Log2(Normalized Spectral Counts)",ylab="Observed Log2(Normalized Spectral Counts)",main=tmpMn,aspect=1) p+layer(panel.text(p$x.limits[2]*0.4, p$y.limits[2]*0.95, paste("R^2 =",round(cor(tmpx[keepInds],tmpy[keepInds])^2,3)), col="grey50", cex = 1.2, font = 2) ) dev.off() # for mRNA Models tmpy<-yVecList_All_Quant[["REP"]] tmpx<-RF_List_All_Quant[["REP"]]$predicted tmpMn<-"" # Supp 04_C pdf(file=paste(PathToOutput,"Supp04_C.pdf",sep=""),width = 5,height = 5) p<-hexbinplot(tmpy ~ tmpx,colramp=colPal,xbins=30,xlab="Predicted Log2(FPKM)",ylab="Observed Log2(FPKM)",main=tmpMn, aspect=1) p+layer(panel.text(p$x.limits[2]*0.4, p$y.limits[2]*0.95, paste("R^2 =",round(cor(tmpx,tmpy)^2,3)), col="grey50", cex = 1.2, font = 2) ) dev.off() # Fig 02_B keepInds<-which(tmpy>min(tmpy)) pdf(file=paste(PathToOutput,"Fig02_B.pdf",sep=""),width = 5,height = 5) p<-hexbinplot(tmpy[keepInds] ~ tmpx[keepInds],colramp=colPal,xbins=30,xlab="Predicted Log2(FPKM)",ylab="Observed Log2(FPKM)",main=tmpMn,aspect=1) p+layer(panel.text(p$x.limits[2]*0.4, p$y.limits[2]*0.95, paste("R^2 =",round(cor(tmpx[keepInds],tmpy[keepInds])^2,3)), col="grey50", cex = 1.2, font = 2) ) dev.off() # ------ # ------ # Regression Analysis, Plotting Feature Importance RF_List_All_Imp<-apply(RF_List_All_Imp,2,function(x) x/max(x)) r<-row.names(RF_List_All_Imp) RF_List_All_Imp<- RF_List_All_Imp[order(r),] FeatureSigns<- FeatureStat[order(r),] # Use the signed feature importance RF_List_All_Imp_Signed<- RF_List_All_Imp*sign(FeatureSigns) # Fig 02_C-E ******(Also used for Supp 09_C-E & Supp 13_G & H)****** # ***** For Supp 09_C-E, the above analysis must be run after all TE-containing gene models are removed (un-comment line 90 "yVecList_All<-lapply(yVecList_All,function(x) x[setdiff(names(x),AllTEGenes)])") ****** # ***** For Supp 13_G and Supp 13_H, the above analysis must be run using histone modification data in place of DNA methylation (un-comment line 37 "MethMatrix<-MethScoreMatrix5BinsHistones") PlotSignedImportance(ImpMat = RF_List_All_Imp_Signed, colm=1, colors = c(tcols[5],rgb(1,1,0.5,0.5)), filenm = paste(PathToOutput,"Fig02_C.pdf",sep="")) PlotSignedImportance(ImpMat = RF_List_All_Imp_Signed, colm=2, colors = c(tcols[6],"skyblue"), filenm = paste(PathToOutput,"Fig02_D.pdf",sep="")) PlotSignedImportance(ImpMat = RF_List_All_Imp_Signed, colm=3, colors = c(rgb(0.75,0,0),rgb(1,0.75,0.75)),ylimt = c(-1.3,0.7), filenm = paste(PathToOutput,"Fig02_E.pdf",sep="")) # Supp 09_C-E #PlotSignedImportance(ImpMat = RF_List_All_Imp_Signed, colm=1, colors = c(tcols[5],rgb(1,1,0.5,0.5)), filenm = paste(PathToOutput,"Supp09_C.pdf",sep="")) #PlotSignedImportance(ImpMat = RF_List_All_Imp_Signed, colm=2, colors = c(tcols[6],"skyblue"), filenm = paste(PathToOutput,"Supp09_D.pdf",sep="")) #PlotSignedImportance(ImpMat = RF_List_All_Imp_Signed, colm=3, colors = c(rgb(0.75,0,0),rgb(1,0.75,0.75)),ylimt = c(-1.2,0.7), filenm = paste(PathToOutput,"Supp09_E.pdf",sep="")) # ------ # Feature Sign Boxplots # EPC Model cols<-tcols[rep(c(5,2),30)] tl<-unlist(lapply(1:length(PosFeats[[1]]),function(x) list(PosFeats[[1]][[x]],NegFeats[[1]][[x]])),recursive = F) tAt<-1:(ncol(MethMatrix)*3) tAt<-tAt[-(seq(3,length(tAt),3))] # Supp 05_A (Requires models to be run with only DNA methylation data (Lines 35, 37 and 39 must be commented out)) PlotClassBoxes(filenm = paste(PathToOutput,"Supp05_A.pdf",sep=""), datalst = tl, datalabs = gsub("CpG","CG",names(PosFeats[[1]])) ,Cols = cols, BrdCol = "black", ylab = "Percent Methylation", tAt = tAt, ledgeNms = c("Observed Protein","No Protein")) # ERC Model cols<-tcols[rep(c(6,1),30)] tl<-unlist(lapply(1:length(PosFeats[[1]]),function(x) list(PosFeats[[2]][[x]],NegFeats[[2]][[x]])),recursive = F) # Supp 05_B (Requires models to be run with only DNA methylation data (Lines 35, 37 and 39 must be commented out)) PlotClassBoxes(filenm = paste(PathToOutput,"Supp05_B.pdf",sep=""), datalst = tl, datalabs = gsub("CpG","CG",names(PosFeats[[1]])) ,Cols = cols, ylab = "Percent Methylation", tAt = tAt, ledgeNms = c("High mRNA","No mRNA")) # PFI Model cols<-tcols[rep(c(5,6),30)] tl<-unlist(lapply(1:length(PosFeats[[1]]),function(x) list(PosFeats[[3]][[x]],NegFeats[[3]][[x]])),recursive = F) # Supp 05_C (Requires models to be run with only DNA methylation data (Lines 35, 37 and 39 must be commented out)) PlotClassBoxes(filenm = paste(PathToOutput,"Supp05_C.pdf",sep=""), datalst = tl, datalabs = gsub("CpG","CG",names(PosFeats[[1]])) ,Cols = cols, BrdCol = "grey40", ylab = "Percent Methylation", tAt = tAt, ledgeNms = c("Observed Protein","High mRNA & No Protein")) # ------ # ------ # Intron Number Distribution SetLst<-list("All Genes"=names(AllGenesIntronNum),"Filtered Set"=FilteredGenes, "gbM Genes"=GBMgenes, "Protein Coding"=BiotypeLst[["protein_coding"]], "Transposable Element"=BiotypeLst[["transposable_element"]], "Pseudogene"=BiotypeLst[["pseudogene"]], "Observed Protein"=UniquePr, "High RNA / No Protein"=setdiff(RNAClassList[["High"]],UniquePr), "Low mRNA / No Protein"=setdiff(RNAClassList[["Low"]],UniquePr), "Non-Observed"=setdiff(RNAClassList[["NonDetected"]],UniquePr)) lcols1<-c("black","white","orange","green","red","blue") lcols2<-c(tcols[c(5,6,4,2)]) lcols<-c(lcols1,lcols2) tempIntTF<-rep(F,length(AllGenes)) names(tempIntTF)<-AllGenes IntGenes<-names(AllGenesIntronNum)[which(AllGenesIntronNum>0)] IntGenes<-intersect(IntGenes,names(tempIntTF)) tempIntTF[IntGenes]<-T SetB<-SetLst SetB<-lapply(SetB,function(x) intersect(x,AllGenes)) tpvals<-sapply(lapply(SetB,function(y) intersect(y,SetB[[2]])),function(x) GetOverLapStats(set1 = x, set2 = intersect(IntGenes,SetB[[2]]), back = SetB[[2]], FoldEnrich = F, nlog10=F)[["Enrichment_Pvalue"]]) names(SetB)<-paste(names(SetB),"\n(N=",sapply(SetB,length),", Pval=",format(tpvals,digits = 2),")",sep="") iBin<-sapply(SetB,function(x) (table(tempIntTF[x])/length(x))*100) # Supp 12_A pdf(file=paste(PathToOutput,"Supp12_A.pdf",sep=""),width = 10,height = 5) par(mar=c(10,5,3,1)) tx<-barplot(iBin[c(2:1),], col=tcols[c(5,1)], names.arg = rep("",ncol(iBin))) title(ylab = "Percentage", line = 4) text(x = tx, par("usr")[3]-6, srt = 45, adj= 1, xpd = TRUE, labels = colnames(iBin), cex=1.2, font=2) legend(x = 4.5,y=125,xpd=T,legend = c("No Introns","One or More Introns"), col = tcols[c(1,5)], pch=15, pt.cex = 3,bty="n") dev.off() bmat<-BinList[["CpG_Gene"]] hr<-RNAClassList[["High"]] tlst<-list("No Introns"=setdiff(hr,IntGenes),"One or More Introns"=intersect(hr,IntGenes)) names(tlst)<-paste(names(tlst),"\n(N=",sapply(tlst,length),")",sep="") # Supp 12_B pdf(file=paste(PathToOutput,"Supp12_B-C.pdf",sep=""),width = 10,height = 3) par(mfrow=c(1,2),mar=c(3,5,4,1)) t<-lapply(names(tlst),function(x) boxplot(bmat[tlst[[x]],],main=x,notch = T,ylim=c(0,1), ylab="CG Methylation Level")) dev.off() # ------ # ------ # GBM Genes Separated by classification Pie Charts # Observed Genes lo<-list("Observed Protein"=UniquePr,"High mRNA / No Protein"=setdiff(RNAClassList[["High"]],UniquePr), "Low mRNA / No Protein"=setdiff(RNAClassList[["Low"]],UniquePr), "Non-Observed"=setdiff(RNAClassList[["NonDetected"]],UniquePr)) FEnObs<-sapply(lo,function(x) GetOverLapStats(set1 = x,set2 = GBMgenes,back = AllGenes))[1,] names(lo)<-paste(names(FEnObs),"\n(Fold Enrich.:",round(FEnObs,1),")",sep="") # Supp 11_A pdf(file=paste(PathToOutput,"Supp11_A.pdf",sep=""),width = 6,height = 5) ObPie<-sapply(lo,function(x) length(intersect(x,GBMgenes))) par(mar=c(0,0,5,5)) pie(ObPie,radius = 1, labels = "",init.angle = ,col = tcols[c(5,6,4,2)],main="Observed Classifications") legend(x = 0.75, y = 1.6,legend = names(ObPie),col = tcols[c(5,6,4,2)], pch=15,xpd = T,text.font = 2,bty="n", y.intersp = 1.5,cex=0.9, pt.cex = 3) dev.off() # Predicted Genes lp<-lapply(RF_List_Predictions,function(x) names(x)[x==2]) lp<-list("EPC & ERC"=intersect(lp[[1]],lp[[2]]),"ERC Only"=setdiff(lp[[2]],lp[[1]]),"Non Express-able"=setdiff(AllGenes,union(lp[[1]],lp[[2]]))) FEnPred<-sapply(lp,function(x) GetOverLapStats(set1 = x,set2 = GBMgenes,back = AllGenes))[1,] names(lp)<-paste(names(FEnPred),"\n(Fold Enrich.:",round(FEnPred,1),")",sep="") # Supp 11_B pdf(file=paste(PathToOutput,"Supp11_B.pdf",sep=""),width = 6,height = 5) par(mar=c(0,0,5,5)) ObPie<-sapply(lp,function(x) length(intersect(x,GBMgenes))) pie(ObPie,radius = 1,labels = "",col = tcols[c(5,6,2)], main="Express-able Classifications") legend(x = 0.85, y = 1.55,legend = names(ObPie),col = tcols[c(5,6,2)], pch=15,xpd = T,text.font = 2,bty="n", y.intersp = 1.5,cex=0.9, pt.cex = 3) dev.off() # ------ # ------ # Biomodal Distribuitions of Gene Classifications lpreds<-list("All Genes"=AllGenes,"Classified as Express-able Proteins"=RF_Sets[[1]], "Classified as Silenced Protein"=setdiff(AllGenes,RF_Sets[[1]]), "Classified as Express-able mRNA"=RF_Sets[[2]], "Classified as Silenced mRNA"=setdiff(AllGenes,RF_Sets[[2]]), "FGSv2"=RF_Sets[[3]], "Rejected Set v2"=setdiff(AllGenes,RF_Sets[[3]]),"FGSv4"=RF_Sets[[4]],"Rejected Set v4"=setdiff(AllGenes,RF_Sets[[4]])) lpreds<-lapply(lpreds,function(x) RNAAbund[x]) lcols<-c("black",tcols[c(5,1,6,2)]) # Supp 07_B pdf(file=paste(PathToOutput,"Supp07_B.pdf",sep=""),width = 8,height = 6) PlotMultiDensities(denslist = lpreds[1:3], cols = lcols[1:3], xlab = "mRNA Abundance Log2(FPKM)", main="EPC Model", ylab = "Relative Frequency") dev.off() # Supp 07_C pdf(file=paste(PathToOutput,"Supp07_C.pdf",sep=""),width = 8,height = 6) PlotMultiDensities(denslist = lpreds[c(1,4,5)], cols = lcols[c(1:3)], xlab = "mRNA Abundance Log2(FPKM)", main="ERC Model", ylab = "Relative frequency") dev.off() # Supp 07_D pdf(file=paste(PathToOutput,"Supp07_D.pdf",sep=""),width = 8,height = 6) PlotMultiDensities(denslist = lpreds[c(1,6,7)], cols = lcols[c(1,4,5)], xlab = "mRNA Abundance Log2(FPKM)", main="RefGen_v2 FGS", ylab = "Relative frequency") dev.off() # Supp 07_E pdf(file=paste(PathToOutput,"Supp07_E.pdf",sep=""),width = 8,height = 6) PlotMultiDensities(denslist = lpreds[c(1,8,9)], cols = lcols[c(1,4,5)], xlab = "mRNA Abundance Log2(FPKM)", main="RegGen_v4 FGS", ylab = "Relative frequency") dev.off() # ------ # ------ # Venndiagrams of Gene sets classified as Express-Able # Fig S07_A v<-Plot3WayVenn(RF_Sets,FourWay = T) pdf(file=paste(PathToOutput,"Supp07_A.pdf",sep=""),width = 6,height = 6) C3 <- compute.Venn(v, doWeights = F) gp <- VennThemes(C3, colourAlgorithm = "binary") for(s in 1:length(gp[[1]])) { if(gp[[1]][[s]]$fill=="blue") gp[[1]][[s]]$fill<-"black" if(gp[[2]][[s]]$col=="blue") gp[[2]][[s]]$col<-"black" } for(s in 1:length(gp[[3]])) gp[[3]][[s]]$col<-"black" plot(v,gpList = gp,doWeight=F,type="ellipses",show = list(SetLabels = F)) dev.off() # Supp 06_A v<-Plot3WayVenn(c(RF_Sets[c(1,3,4)],list("EPC Training Set Pos."=names(yVecList_All[[1]])[which(yVecList_All[[1]]==T)])),FourWay = T) pdf(file=paste(PathToOutput,"Supp06_A.pdf",sep=""),width = 6,height = 6) plot(v,doWeight=F) dev.off() # Supp 06_B v<-Plot3WayVenn(c(RF_Sets[c(2,3,4)],list("ERC Training Set Pos."=names(yVecList_All[[2]])[which(yVecList_All[[2]]==T)])),FourWay = T) pdf(file=paste(PathToOutput,"Supp06_B.pdf",sep=""),width = 6,height = 6) plot(v,doWeight=F) dev.off() # Supp 06_C v<-Plot3WayVenn(c(RF_Sets[c(1,3,4)],list("EPC Training Set Neg."=names(yVecList_All[[1]])[which(yVecList_All[[1]]==F)])),FourWay = T) pdf(file=paste(PathToOutput,"Supp06_C.pdf",sep=""),width = 6,height = 6) plot(v,doWeight=F) dev.off() # Supp 06_D v<-Plot3WayVenn(c(RF_Sets[c(2,3,4)],list("ERC Training Set Neg."=names(yVecList_All[[2]])[which(yVecList_All[[2]]==F)])),FourWay = T) pdf(file=paste(PathToOutput,"Supp06_D.pdf",sep=""),width = 6,height = 6) plot(v,doWeight=F) dev.off() # ------ # ------ # Barplot of predictions broken down by annotations Gbl<-list("All Genes"=AllGenes) Gbl<-c(Gbl,lapply(BiotypeLst,function(x) intersect(x,AllGenes))) Gbl[[5]]<-intersect(FilteredGenes,AllGenes) names(Gbl)<-c("All Genes","Protein Coding","Pseudogenes","Transposable Elements","Filtered Set") Gblex<-sapply(Gbl[c(1,2,5,4,3)],function(x) length(intersect(x,RF_Sets[[1]]))) Gblne<-sapply(Gbl[c(1,2,5,4,3)],function(x) length(setdiff(x,RF_Sets[[1]]))) Gbl<-rbind(Gblex,Gblne) # Supp 08 pdf(file=paste(PathToOutput,"Supp08.pdf",sep=""),width = 6,height = 6) par(mar=c(8,5,1,1)) tx<-barplot(Gbl, col=tcols[c(5,1)], names.arg = rep("",ncol(Gbl))) title(ylab = "Number of Genes", line = 4) text(x = tx, y = 6000,labels = Gbl[1,],col=c("white","white","white",tcols[c(5,5)]),las=2,pos = 1, font=2) text(x = tx, y = apply(Gbl,2,sum)-2000,labels = Gbl[2,],col="grey80",las=2,pos = 1, font=2) text(x = tx, par("usr")[3]-1000, srt = 45, adj= 1, xpd = TRUE, labels = colnames(Gbl), cex=1.2, font=2) legend("topright",legend = c("Express-Able Protein","Silenced"), col = tcols[c(5,1)], pch=15, pt.cex = 3,bty="n") dev.off() # ------ # ------ # Histone Modification Data tAt<-1:(ncol(MethScoreMatrix5BinsHistones)*3) tAt<-tAt[-(seq(3,length(tAt),3))] # Supp 13_A cols<-tcols[rep(c(5,2),30)] tl<-list(RF_Sets[[1]],RF_Silenced[[1]]) tl<-unlist(apply(MethScoreMatrix5BinsHistones, 2,function(x) lapply(tl,function(y) x[y])),recursive = F) PlotClassBoxes(filenm = paste(PathToOutput,"Supp13_A.pdf",sep=""), datalst = tl, datalabs = colnames(MethScoreMatrix5BinsHistones) ,Cols = cols, BrdCol = "black", ylab = "Histone Modification Level", tAt = tAt, ledgeNms = c("EPC: Expressable","EPC: Silenced")) # Supp 13_B cols<-tcols[rep(c(6,1),30)] tl<-list(RF_Sets[[2]],RF_Silenced[[2]]) tl<-unlist(apply(MethScoreMatrix5BinsHistones, 2,function(x) lapply(tl,function(y) x[y])),recursive = F) PlotClassBoxes(filenm = paste(PathToOutput,"Supp13_B.pdf",sep=""), datalst = tl, datalabs = colnames(MethScoreMatrix5BinsHistones) ,Cols = cols, ylab = "Histone Modification Level", tAt = tAt, ledgeNms = c("ERC: Expressable","ERC: Silenced")) # Supp 13_C and Supp 13_D # ***** For Supp 13_C and Supp 13_D, the above models must be constructed using histone modification data in place of DNA methylation (un-comment line 37 "MethMatrix<-MethScoreMatrix5BinsHistones") #PlotROCPR(predlist = lapply(RF_List_All[1:2], function(x) x$votes[,"TRUE"]), lablist = yVecList_All[1:2], dir = PathToOutput, plotnames = c("Supp13_C","Supp13_D"), main = "",col=c(tcols[5:6],"red"),lcex=0.8,ledgX = 0.45,ledgY = 0.3) # Supp 13_E and Supp 13_F # ***** For Supp 13_E and Supp 13_F, the above models must be constructed using histone modification data in place of DNA methylation (un-comment line 37 "MethMatrix<-MethScoreMatrix5BinsHistones") require(lattice) require(latticeExtra) require(hexbin) # Supp 13_E tmpy<-yVecList_All_Quant[["PEP"]] tmpx<-RF_List_All_Quant[["PEP"]]$predicted tmpMn<-"" keepInds<-which(tmpy>min(tmpy)) pdf(file=paste(PathToOutput,"Supp13_E.pdf",sep=""),width = 5,height = 5) p<-hexbinplot(tmpy[keepInds] ~ tmpx[keepInds],colramp=colPal,xbins=30,xlab="Predicted Log2(Normalized Spectral Counts)",ylab="Observed Log2(Normalized Spectral Counts)",main=tmpMn,aspect=1) p+layer(panel.text(p$x.limits[2]*0.7, p$y.limits[2]*0.95, paste("R^2 =",round(cor(tmpx[keepInds],tmpy[keepInds])^2,3)), col="grey50", cex = 1.2, font = 2) ) dev.off() # Supp 13_F tmpy<-yVecList_All_Quant[["REP"]] tmpx<-RF_List_All_Quant[["REP"]]$predicted keepInds<-which(tmpy>min(tmpy)) pdf(file=paste(PathToOutput,"Supp13_F.pdf",sep=""),width = 5,height = 5) p<-hexbinplot(tmpy[keepInds] ~ tmpx[keepInds],colramp=colPal,xbins=30,xlab="Predicted Log2(FPKM)",ylab="Observed Log2(FPKM)",main=tmpMn,aspect=1) p+layer(panel.text(p$x.limits[2]*0.4, p$y.limits[2]*0.95, paste("R^2 =",round(cor(tmpx[keepInds],tmpy[keepInds])^2,3)), col="grey50", cex = 1.2, font = 2) ) dev.off() # Supp 13_G & H # ***** For Supp 13_G and Supp 13_H, the above models must be constructed using histone modification data in place of DNA methylation (un-comment line 37 "MethMatrix<-MethScoreMatrix5BinsHistones") #PlotSignedImportance(ImpMat = RF_List_All_Imp_Signed, colm=1, colors = tcols[5], filenm = paste(PathToOutput,"Supp13_G.pdf",sep=""), ledgex = "topright", pmar=c(7,4,2,1), ledgeLabs = NULL) #PlotSignedImportance(ImpMat = RF_List_All_Imp_Signed, colm=2, colors = tcols[6], filenm = paste(PathToOutput,"Supp13_H.pdf",sep=""), ledgex = "topright", pmar=c(7,4,2,1), ledgeLabs = NULL) # ------ # ************************************************** # Evaluation of Different Genotypes # Variables # MethMat4GenotypesAllContextAllRegions- a matrix of methylation data for different Genotypes MethMat5GenotypesAllContextAllRegionsWithCov<-InputCSV(file=paste(PathToData,"Supplemental_Table13.csv",sep = ""),OutType="matrix") MethMat5Geno<- MethMat5GenotypesAllContextAllRegionsWithCov # RnaSeqFPKMAvgGenotypeData - a matrix of mRNA abundance corresponding to the same genotypes that have methyation data RnaSeqFPMGenotypeData<-InputCSV(file=paste(PathToData,"Supplemental_Table15.csv",sep = ""),OutType="matrix") RnaMatGeno<-RnaSeqFPMGenotypeData # MethMat5GenoCov - a matrix of WGBS coverage values of all non-B73 genotypes for genomic regions and all Bins CovInd<-grep(".cov.",colnames(MethMat5Geno)) MethMat5GenoCov<- MethMat5Geno[,CovInd] MethMat5Geno<-MethMat5Geno[,-CovInd] # only use CpG and CHG methylation Context i<-grep("CHH",colnames(MethMat5Geno)) MethMat5Geno<-MethMat5Geno[,-i] # Make a list containing a matrix for each inbred MethMatLst<-lapply(seq(1,150,30),function(x) MethMat5Geno[,x:(x+29)]) names(MethMatLst)<-c("B73","CML322","Mo17","Oh43","Tx303") # Filter out Genes with no/low coverage in any of non-B73 genotypes Int<-intersect(row.names(RnaMatGeno),row.names(MethMat5Geno)) CovMat<- MethMat5GenoCov[Int,] CovVec<-apply(CovMat,1,function(x) length(which(!is.na(x) & x>0))) Int<-names(CovVec)[which(CovVec>25)] MethMatLst<-lapply(MethMatLst,function(x) x[Int,]) RnaMatGeno<- RnaMatGeno[Int,] # Define the test data as all Non-B73 Genotypes testMatLst<-MethMatLst # Replace NAs with non-informative methylation level of 0.5 testMatLst<-lapply(testMatLst,function(x) apply(x,2,SubNAs)) # make all colnames the same so prediction() can be run c<-colnames(testMatLst[[1]]) c<-gsub("B73.","",c,fixed = T) for(x in 1:length(testMatLst)) { colnames(testMatLst[[x]])<-c } # Generate New yVec yVec<-yVecList_All[["ERC"]] yVec<-yVec[intersect(names(yVec),row.names(testMatLst[[1]]))] # Generate the RF Model We are using the exact same model that was created for the different tissues using a B73 whole seedling sample so we do not need to build a new model require(randomForest) TrainCol<-"B73" RF_B73Inbreds<-randomForest(x = testMatLst[[TrainCol]][names(yVec),], y = as.factor(yVec), ntree = 1000, importance = T, keep.forest = T) xvals<-RF_B73Inbreds$votes[,"TRUE"] # Predict the test data using the B73 model PredLst<-lapply(testMatLst,function(x) predict(RF_B73Inbreds,x,type = "vote")[,"TRUE"]) # Insert the existing predictions for B73 from above PredLst[[TrainCol]][names(xvals)]<-xvals # calculate average Inbred expression RnaMatGenoAvg<-data.matrix(RnaMatGeno) colnames(RnaMatGenoAvg)<-gsub("\\.[[:digit:]]","",colnames(RnaMatGenoAvg)) RnaMatGenoAvg<-t(apply(RnaMatGenoAvg,1,function(x) tapply(x,INDEX = names(x),function(y) mean(y,na.rm=T)))) RnaMatLog<-log2(RnaMatGenoAvg+0.5) # ------ # ************************************************** # Evaluation of Different Tissues # Variables # MethMatTissueAllContextAllRegions - a matrix of methylation data for different tissues MethMatTissueAllContextAllRegions<-InputCSV(file=paste(PathToData,"Supplemental_Table14.csv",sep = ""),OutType="matrix") # RnaSeqFPKMAvgTissuesBriggsData - a matrix of mRNA abundance corresponding to the same tissue that have methyation data RnaSeqFPMTissues<-InputCSV(file=paste(PathToData,"Supplemental_Table16.csv",sep = ""),OutType="matrix") MethMatTissue<- MethMatTissueAllContextAllRegions RnaMatTissue<- RnaSeqFPMTissues # only use CpG and CHG methylation Context i<-grep("CHH",colnames(MethMatTissue)) MethMatTissue<-MethMatTissue[,-i] MethMatLst<-lapply(seq(1,90,30),function(x) MethMatTissue[,x:(x+29)]) names(MethMatLst)<-c("B73_Anther","B73_Earshoot","B73_SAM") # Filter out any genes without DNA Methylation Data keeps<-intersect(row.names(RnaMatTissue),row.names(MethMatTissue)) RnaMatTissue<-RnaMatTissue[keeps,] MethMatLst<-lapply(MethMatLst,function(x) x[keeps,]) # Define the test data sets as all tissue types testMatLst<-MethMatLst testMatLst<-lapply(testMatLst,function(x) apply(x,2,SubNAs)) # make all colnames the same so prediction() can be run c<-colnames(testMatLst[[1]]) c<-gsub("B73_Anther.","",c) for(x in 1:length(testMatLst)) { colnames(testMatLst[[x]])<-c } # Use the B73 leaf model built above require(randomForest) PredLstTissue<-lapply(testMatLst,function(x) predict(RF_B73Inbreds,x,type = "vote")[,"TRUE"]) # calculate Avg. rep expression RnaMatTissueAvg<-data.matrix(RnaMatTissue) colnames(RnaMatTissueAvg)<-gsub("\\.[[:digit:]]","",colnames(RnaMatTissueAvg)) RnaMatTissueAvg<-t(apply(RnaMatTissueAvg,1,function(x) tapply(x,INDEX = names(x),function(y) mean(y,na.rm=T)))) RnaMatTissueLog<-log2(RnaMatTissueAvg+0.5) # Figure Plotting # ------ # Results of Testing the EPC-2 Classification model on Different Inbreds and Tissues # Plotting Fig 03 # Fig 03 pdf(paste(PathToOutput,"Fig03.pdf",sep=""),width = 9,height = 6) layout(mat = matrix(c(1,5,4,3,2,6),nrow = 2)) RnaList<-lapply(names(PredLst),function(x) RnaMatLog[,x]) ExLst<-lapply(RnaList,function(x) x[which(x>0)]) NoExLst<-lapply(RnaList,function(x) x[which(x==log2(0.5))]) yVecLst<-lapply(1:length(ExLst),function(x) BuildYvec(posvec = names(ExLst[[x]]), negvec = names(NoExLst[[x]]))) names(yVecLst)<-names(PredLst) PredLstVal<-lapply(names(yVecLst),function(x) PredLst[[x]][names(yVecLst[[x]])]) names(PredLstVal)<-names(yVecLst) names(PredLstVal)[1]="B73 [X-Validated]" names(yVecLst)[1]<-"B73 [X-Validated]" par(mar=c(4,4,1,1), mgp=c(2,1,0)) PlotROCPR(predlist = PredLstVal, yVecLst, dir = "~/Downloads/", plotnames = "Genotypes5",cols = c("black","orange","purple","yellow","grey50"),ltys = c(1,1,1,1,1),lwds=3,SavePlot = F,RocPl=T,PrPl=F,ledgX = 0.3,ledgY = 0.3,lcex = 0.8) # Colored Scatter Plot IntNm<-intersect(row.names(RnaMatLog),names(PredLst[[TrainCol]])) RnaMatLog<-RnaMatLog[IntNm,names(PredLst)] CompInds<-c(2:5) CompGrid<-combn(x = CompInds, m = 2, simplify = F) names(CompGrid)<-sapply(CompGrid,function(x) paste(colnames(RnaMatLog)[x[1]],colnames(RnaMatLog)[x[2]],sep=" vs. ")) pDiff<-lapply(CompGrid,function(x) ReturnMaxMin(PredLst[[x[1]]],PredLst[[x[2]]])) pDiffMat<-do.call(rbind,pDiff) pDs<-lapply(pDiff, function(x) (x[,1]-x[,2])>0.6) pDsVec<-unlist(pDs) tscols<-c(adjustcolor("grey40",alpha.f = 0.005),adjustcolor(c("blue"),alpha.f = 0.005)) pdCols<-rep(tscols[1],length(pDsVec)) pdCols[which(pDsVec)]<-tscols[2] # Scatter plot for Fig4 C plot(pDiffMat[,2],pDiffMat[,1],xlab="Prediction from Lower Inbred",ylab="Prediction from Higher Inbred",col=pdCols,pch=20,cex=0.7,font=2) text(x = 0.05,y = 0.95,labels = length(which(pDsVec)),pos = 4,col="midnightblue",font=2) legend("bottomright",legend = c("Inbred-Variable Predictions"), col = rgb(0,0,1,0.5), pch=16, pt.cex = 1.3, bty="n") # Now Filter out genes that have disagreeing technical reps FPMcut<-RnaMatGeno>1 FPMcut<-lapply(1:5, function(x) FPMcut[,((x-1)*3+1):(x*3)]) names(FPMcut)<-sapply(1:5, function(x) colnames(RnaMatGeno)[(x-1)*3+1]) cntDiffs<-lapply(CompGrid,function(x) cbind(FPMcut[[x[1]]],FPMcut[[x[2]]])) QualRes<-lapply(cntDiffs,FindSwitch) QualRes<-lapply(QualRes,function(x) x[which(x!=(-1))]) # make a list of predicted genes predData<-do.call(cbind,PredLst[colnames(RnaMatGenoAvg)]) compNms<-names(CompGrid) names(compNms)<-compNms keepLst<-lapply(QualRes,function(x) intersect(names(x),row.names(predData))) #PredHigh1<-lapply(compNms,function(x) (predData[keepLst[[x]],CompGrid[[x]][2]]<0.2 & predData[keepLst[[x]],CompGrid[[x]][1]]>0.8)) #PredHigh2<-lapply(compNms,function(x) (predData[keepLst[[x]],CompGrid[[x]][1]]<0.2 & predData[keepLst[[x]],CompGrid[[x]][2]]>0.8)) PredHigh1<-lapply(compNms,function(x) (predData[keepLst[[x]],CompGrid[[x]][1]]-predData[keepLst[[x]],CompGrid[[x]][2]])>0.6) PredHigh2<-lapply(compNms,function(x) (predData[keepLst[[x]],CompGrid[[x]][2]]-predData[keepLst[[x]],CompGrid[[x]][1]])>0.6) for(n in compNms) { names(PredHigh1[[n]])<-keepLst[[n]] names(PredHigh2[[n]])<-keepLst[[n]] } exph1<-lapply(compNms,function(x) RnaMatGenoAvg[names(PredHigh1[[x]])[which(PredHigh1[[x]])],CompGrid[[x]][1]]) exph2<-lapply(compNms,function(x) RnaMatGenoAvg[names(PredHigh2[[x]])[which(PredHigh2[[x]])],CompGrid[[x]][2]]) expl1<-lapply(compNms,function(x) RnaMatGenoAvg[names(PredHigh1[[x]])[which(PredHigh1[[x]])],CompGrid[[x]][2]]) expl2<-lapply(compNms,function(x) RnaMatGenoAvg[names(PredHigh2[[x]])[which(PredHigh2[[x]])],CompGrid[[x]][1]]) yvec<-log2(c(unlist(exph1),unlist(exph2))+0.5) xvec<-log2(c(unlist(expl1),unlist(expl2))+0.5) f<-colorRampPalette(colors = c("blue","red","green")) tcols<-f(6) pcols<-c(rep(tcols,times=sapply(exph1,length)),rep(tcols,times=sapply(exph2,length))) pcols<-adjustcolor(col = pcols,alpha.f = 0.6) rnd<-sample(1:length(xvec),length(xvec),replace = F) plot(xvec[rnd],yvec[rnd],pch=16,col=pcols[rnd],ylim=c(-1.5,10),xlim=c(-1.5,10),xlab="Silenced Inbred Log2(FPM)",ylab="Expressable Inbred Log2(FPM)",font=2) abline(h=0,v=0,col=adjustcolor("red",alpha.f = 0.3),lty=2,lwd=3) lines(x = c(0,9), y = c(0,9), lwd=3, lty=1, col=adjustcolor("grey30",alpha.f = 0.1)) tab1<-table(factor(yvec>0,levels = c(T,F)),factor(xvec>0,levels = c(T,F))) text(x = c(9,9,-2.2,-2.2),y = c(9.5,-1.5,9.5,-1.5),labels = as.numeric(tab1), pos = 4, font=2, col = c("red","red","darkgreen","black")) HLdiff<-yvec-xvec UpIn2<-which(yvec>0 & xvec>0) tab2<-table(factor(HLdiff[UpIn2]>0,levels=c(T,F))) text(x=c(7,9), y=c(9.5,8), labels=as.numeric(tab2), col= "black", pos = 4) legend(x = 6,y = 4.5, legend = names(exph1), col = adjustcolor(tcols,alpha.f = 0.6), pch = 16, pt.cex = 1.2, bty="r", xpd=T, cex = 0.8, bg = "white") # Calculate the accuracy (# correct calls / # of calls evaluated) #tab1[1,2]/(tab1[1,2]+sum(tab1[,1])) #sum(tab1[1,2],tab2[1])/(tab1[1,2]+sum(tab1[,1])) RnaList<-lapply(names(PredLstTissue),function(x) RnaMatTissueLog[,x]) ExLst<-lapply(RnaList,function(x) x[which(x>0)]) NoExLst<-lapply(RnaList,function(x) x[which(x==(log2(0.5)))]) yVecLst<-lapply(1:length(ExLst),function(x) BuildYvec(posvec = names(ExLst[[x]]), negvec = names(NoExLst[[x]]))) names(yVecLst)<-names(PredLstTissue) PredLstVal<-lapply(names(yVecLst),function(x) PredLstTissue[[x]][names(yVecLst[[x]])]) names(PredLstVal)<-names(yVecLst) PlotROCPR(predlist = PredLstVal, yVecLst, dir = "~/Downloads/", plotnames = "Tissues3",cols = c("black","orange","purple"),ltys = c(1,1,1),lwds=3,SavePlot = F,RocPl=T,PrPl=F,ledgX = 0.3,ledgY = 0.3,lcex = 0.8) # Colored Scatter Plot IntNm<-intersect(row.names(RnaMatTissueLog),names(PredLstTissue[[1]])) RnaMatTissueLog<-RnaMatTissueLog[IntNm,names(PredLstTissue)] CompInds<-c(1:3) CompGrid<-combn(x = CompInds, m = 2, simplify = F) names(CompGrid)<-sapply(CompGrid,function(x) paste(colnames(RnaMatTissueLog)[x[1]],colnames(RnaMatTissueLog)[x[2]],sep=" vs. ")) pDiff<-lapply(CompGrid,function(x) ReturnMaxMin(PredLstTissue[[x[1]]],PredLstTissue[[x[2]]])) pDiffMat<-do.call(rbind,pDiff) pDs<-lapply(pDiff, function(x) (x[,1]-x[,2])>0.6) pDsVec<-unlist(pDs) tscols<-c(adjustcolor("grey40",alpha.f = 0.005),adjustcolor(c("blue"),alpha.f = 0.005)) pdCols<-rep(tscols[1],length(pDsVec)) pdCols[which(pDsVec)]<-tscols[2] # Scatter plot for Fig4 D plot(pDiffMat[,2],pDiffMat[,1],xlab="Prediction from Lower Inbred",ylab="Prediction from Higher Inbred",col=pdCols,pch=20,cex=0.7,font=2) text(x = 0.05,y = 0.95,labels = length(which(pDsVec)),pos = 4,col="midnightblue",font=2) legend("bottomright",legend = c("Tissue-Variable Predictions"), col = rgb(0,0,1,0.5), pch=16, pt.cex = 1.3, bty = "n") # Now Filter out genes that have disagreeing technical reps FPMcut<-RnaMatTissue>1 FPMcut<-lapply(1:3, function(x) FPMcut[,((x-1)*3+1):(x*3)]) names(FPMcut)<-sapply(1:3, function(x) colnames(RnaMatTissue)[(x-1)*3+1]) cntDiffs<-lapply(CompGrid,function(x) cbind(FPMcut[[x[1]]],FPMcut[[x[2]]])) QualRes<-lapply(cntDiffs,FindSwitch) QualRes<-lapply(QualRes,function(x) x[which(x!=(-1))]) # make a list of predicted genes predData<-do.call(cbind,PredLstTissue[colnames(RnaMatTissueAvg)]) compNms<-names(CompGrid) names(compNms)<-compNms keepLst<-lapply(QualRes,function(x) intersect(names(x),row.names(predData))) #PredHigh1<-lapply(compNms,function(x) (predData[keepLst[[x]],CompGrid[[x]][2]]<0.20 & predData[keepLst[[x]],CompGrid[[x]][1]]>0.8)) #PredHigh2<-lapply(compNms,function(x) (predData[keepLst[[x]],CompGrid[[x]][1]]<0.20 & predData[keepLst[[x]],CompGrid[[x]][2]]>0.8)) PredHigh1<-lapply(compNms,function(x) (predData[keepLst[[x]],CompGrid[[x]][1]]-predData[keepLst[[x]],CompGrid[[x]][2]])>0.6) PredHigh2<-lapply(compNms,function(x) (predData[keepLst[[x]],CompGrid[[x]][2]]-predData[keepLst[[x]],CompGrid[[x]][1]])>0.6) for(n in compNms) { names(PredHigh1[[n]])<-keepLst[[n]] names(PredHigh2[[n]])<-keepLst[[n]] } exph1<-lapply(compNms,function(x) RnaMatTissueAvg[names(PredHigh1[[x]])[which(PredHigh1[[x]])],CompGrid[[x]][1]]) exph2<-lapply(compNms,function(x) RnaMatTissueAvg[names(PredHigh2[[x]])[which(PredHigh2[[x]])],CompGrid[[x]][2]]) expl1<-lapply(compNms,function(x) RnaMatTissueAvg[names(PredHigh1[[x]])[which(PredHigh1[[x]])],CompGrid[[x]][2]]) expl2<-lapply(compNms,function(x) RnaMatTissueAvg[names(PredHigh2[[x]])[which(PredHigh2[[x]])],CompGrid[[x]][1]]) yvec<-log2(c(unlist(exph1),unlist(exph2))+0.5) xvec<-log2(c(unlist(expl1),unlist(expl2))+0.5) f<-colorRampPalette(colors = c("blue","red","green")) tcols<-f(3) pcols<-c(rep(tcols,times=sapply(exph1,length)),rep(tcols,times=sapply(exph2,length))) pcols<-adjustcolor(col = pcols,alpha.f = 0.6) rnd<-sample(1:length(xvec),length(xvec),replace = F) plot(xvec[rnd],yvec[rnd],pch=16,col=pcols[rnd],ylim=c(-1.5,10),xlim=c(-1.5,10),xlab="Silenced Inbred Log2(FPM)",ylab="Expressable Inbred Log2(FPM)",font=2) abline(h=0,v=0,col=adjustcolor("red",alpha.f = 0.3),lty=2,lwd=3) lines(x = c(0,9), y = c(0,9), lwd=3, lty=1, col=adjustcolor("grey30",alpha.f = 0.1)) tab1<-table(factor(yvec>0,levels = c(T,F)),factor(xvec>0,levels = c(T,F))) text(x = c(9,9,-2.2,-2.2),y = c(9.5,-1.5,9.5,-1.5),labels = as.numeric(tab1), pos = 4, font=2, col = c("red","red","darkgreen","black")) HLdiff<-yvec-xvec UpIn2<-which(yvec>0 & xvec>0) tab2<-table(factor(HLdiff[UpIn2]>0,levels=c(T,F))) text(x=c(7,9), y=c(9.5,8), labels=as.numeric(tab2), col= "black", pos = 4) legend(x = 4,y = 3.5, legend = names(exph1), col = adjustcolor(tcols,alpha.f = 0.6), pch = 16, pt.cex = 1.2, bty="r", xpd=T, cex = 0.8, bg = "white") # Calculate the accuracy (# correct calls / # of calls evaluated) #tab1[1,2]/(tab1[1,2]+sum(tab1[,1])) #sum(tab1[1,2],tab2[1])/(tab1[1,2]+sum(tab1[,1])) dev.off() # ------ # Overlap of predicted expressomes for 4 inbred lines # Supp 10_A #require(Vennerable) #predData<-do.call(cbind,PredLst[colnames(RnaMatGenoAvg)]) #UnqExp<-lapply(1:4,function(c) apply(predData[,-1],1,function(x) max((x[c]-x)))) #UnqExp<-lapply(UnqExp,function(x) names(x)[x>0.6]) #names(UnqExp)<-colnames(predData[,-1]) #InbVenn<-Plot3WayVenn(SetLst = UnqExp,FourWay = T) #pdf(paste(PathToOutput,"Supp10.pdf",sep=""),width = 6,height = 6) #plot(InbVenn, doWeights=F) #dev.off() # Supp Figure 10 - Features of the Expressable vs. Silent Genes from 5 inbred ERCs # Number of additional expressable genes per inbred line added PredMat<-do.call(cbind,PredLst) PredMatLst<-lapply(1:100,function(x) PredMat[,sample(colnames(PredMat),5,replace = F)]) PredMatCnts<-sapply(PredMatLst,function(p) sapply(1:ncol(p),function(x) length(which(apply(p[,1:x,drop=F],1,function(y) any(y>=0.5)))))) ERC2Features<-read.table(file = "~/Google Drive/MethylationPaper/Manuscript/CurrentManuscript/Supplemental_Table18.csv", sep=",",header=T, row.names = 1, stringsAsFactors = F) ERC2Groups<-tapply(1:nrow(ERC2Features), INDEX = factor(ERC2Features$Group,levels = c("All_Inbreds_Syntenic","All_Inbreds_NonSyntenic","Any_Inbred","No_Inbreds")), function(x) x) colnames(ERC2Features)<-gsub(".."," (",colnames(ERC2Features),fixed=T) colnames(ERC2Features)<-gsub(".",")",colnames(ERC2Features),fixed=T) ERC2Genes<-lapply(ERC2Groups,function(x) rownames(ERC2Features)[x]) # determine proportion of TEs ERC2numTEoverlap<-sapply(ERC2Genes,function(x) c("Non-TE Gene"=length(setdiff(x,AllTEGenes)),"Probable TE"=length(intersect(x,AllTEGenes)))) # Blast Clusters ClstERC2_NoTE<-tapply(row.names(ERC2Features),INDEX = ERC2Features$Blast_Cluster,function(x) x) # Determine Connectivity of each Gene clstLen<-sapply(ClstERC2_NoTE,length) geneConn<-setNames(rep(clstLen,times=clstLen),nm = unlist(ClstERC2_NoTE)) ERC2numClst<-sapply(ERC2Genes,function(x) length(intersect(names(geneConn),x))) ERC2Conn<-lapply(ERC2Genes,function(x) geneConn[intersect(names(geneConn),x)]) pdf(paste(PathToOutput,"Supp10_A-J.pdf",sep=""),width = 15,height = 9) par(mfrow=c(2,5),mar=c(10,5,2,2)) b<-boxplot(t(PredMatCnts), ylab="", outline=T, notch=T, col="skyblue", varwidth=F, cex.axis=1.3) mtext(text = "# of Expressable Genes" ,side = 2,cex = 1.1, line = 3) mtext(text = "# of Inbreds Examined" ,side = 1,cex = 1.1, line = 3) b<-barplot(ERC2numTEoverlap, ylab="", col=c(tcols16[5],"grey30"), names=rep("",ncol(ERC2numTEoverlap)), main = "Number of TEs", cex.axis=1.3) legend("top",legend = rownames(ERC2numTEoverlap)[2:1], fill = c("grey30",tcols16[5]), xpd=T, bty="n") mtext(text = "Number of Genes",side = 2,cex = 1.1, line = 3) text(x = b, par("usr")[3]*2.5, srt = 45, adj= 1, xpd = TRUE, labels = names(ERC2Groups), cex=1.3) # remove TEs ERC2Genes<-lapply(ERC2Genes,function(x) setdiff(x,AllTEGenes)) # Boxplots Supp Figure 14 ERC2cols<-c(tcols16[c(3,7)],"grey80","grey50") for(x in colnames(ERC2Features)[2:5]) { b<-boxplot(lapply(ERC2Genes,function(y) ERC2Features[y,x]), ylab="", outline=F, notch=T, varwidth=T, col=ERC2cols, names=rep("",length(ERC2Groups)), cex.axis=1.3) mtext(text = gsub("_"," ",x,fixed=T),side = 2,cex = 1.1, line = 3) text(x = 1:length(ERC2Genes), par("usr")[3]*2.5, srt = 45, adj= 1, xpd = TRUE, labels = names(ERC2Groups), cex=1.3) } boxplot(ERC2Conn,ylab="",outline=F, notch=T, varwidth=T, col=ERC2cols, names=rep("",length(ERC2Groups)), cex.axis=1.3) mtext(text = "Connectivity (Number of Paralogs)",side = 2,cex = 1.1, line = 3) text(x = 1:length(ERC2Genes), par("usr")[3]*2.5, srt = 45, adj= 1, xpd = TRUE, labels = names(ERC2Groups), cex=1.3) ExpressableList<-list("No Expression"=setdiff(unlist(ERC2Genes),union(names(AllRNAAbund),names(AllProtAbund))),"mRNA Only"=setdiff(names(AllRNAAbund),names(AllProtAbund)),"Protein & mRNA"=intersect(names(AllRNAAbund),names(AllProtAbund))) ERC2observed<-sapply(ERC2Genes,function(x) sapply(ExpressableList,function(y) length(intersect(x,y)))) b<-barplot(ERC2observed, ylab="", col=c("grey80",tcols16[c(16,1)]), names=rep("",length(ERC2Groups)), legend.text = row.names(ERC2observed), args.legend = list(x="top",bty="n"), main="Observed Expression (B73)", cex.axis=1.3) mtext(text = "Number of Genes",side = 2,cex = 1.1, line = 3) text(x = b, par("usr")[3]*2.5, srt = 45, adj= 1, xpd = TRUE, labels = names(ERC2Groups), cex=1.3) ERC2mRNAexp<-sapply(ERC2Genes,function(x) AllRNAAbund[intersect(x,names(AllRNAAbund))]) ERC2protExp<-sapply(ERC2Genes,function(x) AllProtAbund[intersect(x,names(AllProtAbund))]) boxplot(ERC2mRNAexp,ylab="",outline=F, notch=T, varwidth=T, col=ERC2cols, names=rep("",length(ERC2Groups)), cex.axis=1.3) mtext(text = "mRNA Abundance log2(FPMK)",side = 2,cex = 1.1, line = 3) text(x = 1:length(ERC2Genes), -10, srt = 45, adj= 1, xpd = TRUE, labels = names(ERC2Groups), cex=1.3) boxplot(ERC2protExp,ylab="",outline=F, notch=T, varwidth=T, col=ERC2cols, names=rep("",length(ERC2Groups)), cex.axis=1.3) mtext(text = "Protein Abundance (Spectral Counts)",side = 2,cex = 1.1, line = 3) text(x = 1:length(ERC2Genes), -2, srt = 45, adj= 1, xpd = TRUE, labels = names(ERC2Groups), cex=1.3) dev.off() # Show the breakdown of the 15 largest gene clusters ClstERC2_NoTE<-ClstERC2_NoTE[order(sapply(ClstERC2_NoTE,length),decreasing = T)] ClstSetMat<-sapply(ClstERC2_NoTE[1:15],function(x) sapply(ERC2Genes,function(y) length(intersect(x,y)))) famNames = c("Kinase","Map Kinase","Pentatricopeptide Repeat","Unknown 01","Cytochrome p450 (CYP)","Ubiquitin E3","Unknown 02","AP2/EREB Transcription Factor","MYB Transcription Factor","UDP Glucosyl Transferase","Defense/PR Proteins","Unknown 03","RNA Binding","Peroxidase","Unknown 04") pdf(paste(PathToOutput,"Supp10_K.pdf",sep=""),width = 12,height = 6) par(mar=c(9,5,1,1)) tsp<-barplot(ClstSetMat[,1:15], ylab ="",legend.text=row.names(ClstSetMat), col=ERC2cols, names.arg = rep("",15)) mtext(text = "Number of Genes",side = 2,cex = 1.4, line = 3) text(x = tsp, par("usr")[3]*2.5, srt = 30, adj= 1, xpd = TRUE, labels = famNames, cex=1.4) dev.off() ttabMat<-sapply(ClstERC2_NoTE,function(x) sapply(ERC2Genes, function(y) length(intersect(x,y)))) pdf(paste(PathToOutput,"Supp10_L.pdf",sep=""),width = 12,height = 5) par(mar=c(2,8,3,1),mfrow=c(1,4)) for(i in 1:4) { t<-BarCompare(ERC2Genes, clstLst = ClstERC2_NoTE, setInd = i, cols = ERC2cols, tabMatrix = ttabMat) } dev.off()