# This file contains the R code used to generate gene signatures, tables and
# figures.
# The TCGA and ArrayExpress data sets were downloaded, as indicated in the
# paper, and assembled into lists containing the expression data and
# clinicopathological variables.
# The lists can contain the following variables (depending on data set):
#   $data   (expression data)
#   $key    (key containing microarray probe ID and gene symbol)
#   $devent (OS, deceased=1, alive=0), $dtime (follow-up time for OS)
#   $revent (recurrence or NTE=new tumor event, yes=1, no=0)
#   $rtime  (follow-up time recurrence)
#   $age    (age at diagnosis), $stage (stage at diagnosis)
#   $ER     (positive/negative=1/0 for estrogen receptor)
#   $node   (lymph nodes yes=1, no=0), $tum2 (tumor size >2cm=1)
# The data sets (lists) are not included as supplemental files.
# xclus.RData contains the genes in the signatures identified in
# Winslow et al, Breast Cancer Res, 2015;10:e0130300

# insert path of datasets
path <- ""

# load signatures from Breast Cancer Res paper
load(file = paste0(path, "xclus.RData"))

# name TCGA cancer set files
TBRCA <- "TBRCA150918set.RData"
TCOAD <- "TCOAD150917set.RData"
THNSC <- "THNSC150918set.RData"
TKIRC <- "TKIRC150918set.RData"
TLUAD <- "TLUAD150917set.RData"
TLUSC <- "TLUSC150917set.RData"

# name TCGA normal breast tissue data file
TBRCAnorm <- "mRNAnormdata.RData"

# name set files for the microarray sets analyzed
GEOD21653 <- "GEOD21653set.RData"
EMTAB365 <- "EMTAB365set.RData"

# name directory where the GSE66744 data are stored; a forward slash is a
# valid separator on every platform (the former "\\" only worked on Windows)
GSE66744 <- "GSE66744/"

# load TCGA BRCA set (creates the list `set`)
load(file = paste0(path, TBRCA))

# name TCGA datasets
sets <- c(TBRCA, TCOAD, THNSC, TKIRC, TLUAD, TLUSC)
setnames <- c("BRCA", "COAD", "HNSC", "KIRC", "LUAD", "LUSC")

# get signatures from previous paper
endo <- c(xclus[[4]], xclus[[5]])
ecmg <- c(xclus[[1]], xclus[[2]])

# remove genes with low variance prior to expansion
mRNAtumvarselect <- apply(set$data, 1, var) > 0.2
mRNAtumvar <- set$data[mRNAtumvarselect, ]

# Expand a seed signature with every gene whose expression correlates above
# `cutoff` with at least one seed gene.
#   seed   : row names of `expr` (all must be present in `expr`)
#   expr   : numeric matrix, genes in rows, samples in columns
#   cutoff : Pearson correlation threshold (strictly greater than)
# Returns the unique gene names, ordered seed by seed and then by row of
# `expr`. All correlations are computed in a single cor() call instead of one
# call per gene pair.
expand_signature <- function(seed, expr, cutoff = 0.84) {
  cors <- cor(t(expr[seed, , drop = FALSE]), t(expr))
  # transpose so that which() walks through the hits seed by seed
  hits <- which(t(cors > cutoff), arr.ind = TRUE)
  unique(rownames(expr)[hits[, "row"]])
}

# expand potential ECM genes (ecmset = Table S3)
ecmset <- expand_signature(ecmg, mRNAtumvar)

# expand potential endo genes (endoset = Table S4)
endoset <- expand_signature(endo, mRNAtumvar)

# functions for obtaining compact clusters

# Mean correlation of each gene with all other genes, given a square
# correlation matrix `x` (the diagonal 1 is excluded from the mean).
cormean <- function(x) {
  apply(x, 1, function(y) (sum(y) - 1) / (length(y) - 1))
}

# Mean within-signature correlation of genes `x` in four sample subsets.
# NOTE(review): tumind1..tumind4 are not defined anywhere in this file and the
# function is never called here; they must exist in the workspace before use.
findcors <- function(x) {
  cbind(
    cormean(cor(t(set$data[x, tumind1]))),
    cormean(cor(t(set$data[x, tumind2]))),
    cormean(cor(t(set$data[x, tumind3]))),
    cormean(cor(t(set$data[x, tumind4])))
  )
}

stp <- 0.85

# Iteratively prune gene set `x` to a compact cluster in the currently loaded
# global `set`: for thresholds 0.5, 0.55, ..., `stp`, genes whose mean
# correlation with the remaining genes is not above the threshold are dropped
# (two pruning passes per threshold). Returns the surviving gene names.
findsetsim <- function(x, stp = 0.85) {
  lims <- seq(0.5, stp, 0.05)
  for (lim in lims) {
    for (pass in 1:2) {
      keep <- cormean(cor(t(set$data[x, ]))) > lim
      x <- x[keep]
    }
  }
  x
}

# obtain ECM and endo clusters; every data set is loaded once
# ecmlist / endolist contain the clusters for the different cancer forms
# (Table 1 and Table 2)
ecmlist <- vector("list", length(sets))
endolist <- vector("list", length(sets))
for (i in seq_along(sets)) {
  load(paste0(path, sets[i]))
  ecmlist[[i]] <- findsetsim(ecmset, 0.85)
  endolist[[i]] <- findsetsim(endoset, 0.85)
}

# necm / nend contain genes that are present in the cluster in all cancer forms
ecm_counts <- table(unlist(ecmlist))
necm <- names(ecm_counts)[ecm_counts == length(sets)]
endo_counts <- table(unlist(endolist))
nend <- names(endo_counts)[endo_counts == length(sets)]

# make scatterplots ECM-endo (Fig 1)
subnames <- c("A", "B", "C", "D", "E", "F")
tiff(file = paste0(path, "fig1AE.tif"), width = 2800, height = 2000,
     res = 300, compression = "lzw")
par(mfrow = c(2, 3), mar = c(5, 4, 4, 1))
for (i in seq_along(sets)) {
  load(paste0(path, sets[i]))
  ecm_meta <- colMeans(set$data[necm, ])
  endo_meta <- colMeans(set$data[nend, ])
  plot(ecm_meta, endo_meta, main = setnames[i],
       xlab = "ECM metagene", ylab = "Endothelial metagene", yaxt = "n",
       cex.lab = 1.3, cex.axis = 1.2)
  axis(2, at = seq(4, 12, 2), labels = seq(4, 12, 2), las = 1, cex.axis = 1.2)
  legend("topleft", legend = signif(cor(ecm_meta, endo_meta), 3),
         bty = "n", cex = 1.2)
  mtext(subnames[i], side = 3, line = 1, adj = 0, cex = 2, outer = FALSE)
}
dev.off()

# survival function, right-censored
library(survival)

# Build a Surv object in which follow-up is administratively censored at `rc`:
# observations with time > rc get time = rc and event = 0.
survrc <- function(time, event, rc) {
  late <- time > rc
  event[late] <- 0
  time[late] <- rc
  Surv(time, event)
}

# Cox models of the ECM and endo metagenes in the currently loaded `set`, all
# stratified for age and stage and censored at 1825 days.
# Returns a 2 x 4 matrix: row 1 = each metagene modelled alone, row 2 = both
# metagenes in one model; columns = ECM HR, ECM p, endo HR, endo p.
cox_signature_rows <- function(time, event) {
  resp <- survrc(time, event, 1825)
  ecm <- scale(colMeans(set$data[necm, ]))
  endo_score <- scale(colMeans(set$data[nend, ]))
  ecuv <- summary(coxph(resp ~ ecm + strata(set$age) +
                          strata(set$stage)))$coefficients[, c(2, 5)]
  enuv <- summary(coxph(resp ~ endo_score + strata(set$age) +
                          strata(set$stage)))$coefficients[, c(2, 5)]
  # the joint model is fitted once and both coefficient rows are read from it
  both <- summary(coxph(resp ~ ecm + endo_score + strata(set$age) +
                          strata(set$stage)))$coefficients
  rbind(c(ecuv, enuv), c(both[1, c(2, 5)], both[2, c(2, 5)]))
}

# uni- and multivariate Cox of ECM and endo signatures, data stored in
# variable y (Table 3A-B); recurrence is used for BRCA (first set), overall
# survival for the other sets
options(scipen = 10)
y <- NULL
for (i in seq_along(sets)) {
  load(paste0(path, sets[i]))
  if (i == 1) {
    y <- rbind(y, cox_signature_rows(set$rtime, set$revent))
  } else {
    y <- rbind(y, cox_signature_rows(set$dtime, set$devent))
  }
}
rownames(y) <- paste(rep(setnames, each = 2), c("univariate", "multivariate"))
colnames(y) <- c("ECM HR", "ECM p-val", "Endo HR", "Endo p-val")

# generate three decimals from data in y; `row_in_set` is 1 (univariate) or
# 2 (multivariate), `cols` selects the ECM (1:2) or endo (3:4) columns
round_table <- function(row_in_set, cols) {
  out <- NULL
  for (nm in setnames) {
    out <- rbind(out, y[grep(nm, rownames(y)), ][row_in_set, cols])
  }
  round(out, 3)
}
hr_ecm_uni <- round_table(1, 1:2)
hr_ecm_multi <- round_table(2, 1:2)
hr_endo_uni <- round_table(1, 3:4)
hr_endo_multi <- round_table(2, 3:4)
yy <- hr_endo_multi

# multivariate Cox, ECM, endo, ER, node, size (TCGA BRCA) Table 3C
load(file = paste0(path, TBRCA))
n <- set$stage < 4
n[is.na(n)] <- FALSE
# print() so that the table is also shown when the file is source()d
print(coxph(survrc(set$rtime, set$revent, 1825) ~
              scale(colMeans(set$data[necm, ])) +
              scale(colMeans(set$data[nend, ])) +
              set$ER + set$tum2 + set$node, subset = n))

# boxplot ECM sign i +/-node, tum size, +/- ER (Figure 2)
load(file = paste0(path, TBRCA))
tiff(file = paste0(path, "fig2.tif"), width = 2200, height = 2200,
     res = 300, compression = "lzw")
par(fig = c(0, 0.33, 0.5, 1))
par(mar = c(4, 4, 2, 1))
par(cex = 0.8)
y <- colMeans(set$data[necm, ])
boxplot(y ~ set$ER, col = c(2, 4), las = 1, ylab = "ECM metagene",
        xaxt = "n", ylim = c(min(y) * 0.95, max(y) * 1.1))
axis(1, at = 1:2, labels = c("ER neg", "ER pos"), tick = FALSE)
text(0.5, 1.09 * max(y),
     labels = paste0("p=", signif(t.test(y ~ set$ER)$p.value, 3)), adj = 0)
mtext("A", side = 2, line = 1, at = 22.5, cex = 2, las = 1, outer = FALSE)
par(fig = c(0.33, 0.67, 0.5, 1), new = TRUE)
boxplot(y ~ set$node, col = c(4, 2), las = 1, ylab = "ECM metagene",
        xaxt = "n", ylim = c(min(y) * 0.95, max(y) * 1.1))
axis(1, at = 1:2, labels = c("node neg", "node pos"), tick = FALSE)
# Remaining panels of Figure 2; `y` holds the metagene of the current row of
# panels (ECM metagene on entry, endothelial metagene from panel D on).

# p-value label of a two-group Welch t-test, formatted as in the figure
p_label <- function(values, group) {
  paste0("p=", signif(t.test(values ~ group)$p.value, 3))
}

text(0.5, 1.09 * max(y), labels = p_label(y, set$node), adj = 0)
mtext("B", side = 2, line = 1, at = 22.5, cex = 2, las = 1, outer = FALSE)
par(fig = c(0.67, 1, 0.5, 1), new = TRUE)
boxplot(y ~ set$tum2, col = c(4, 2), las = 1, ylab = "ECM metagene",
        xaxt = "n", ylim = c(min(y) * 0.95, max(y) * 1.1))
axis(1, at = 1:2, labels = c("size<2cm", "size>2cm"), tick = FALSE)
text(0.5, 1.09 * max(y), labels = p_label(y, set$tum2), adj = 0)
mtext("C", side = 2, line = 1, at = 22.5, cex = 2, las = 1, outer = FALSE)

# boxplot endo sign i +/-node, tum size, +/- ER (Figure)
par(fig = c(0, 0.33, 0, 0.5), new = TRUE)
par(mar = c(4, 4, 1, 1))
par(cex = 0.8)
y <- colMeans(set$data[nend, ])
boxplot(y ~ set$ER, col = c(2, 4), las = 1, ylab = "Endothelial metagene",
        xaxt = "n", ylim = c(min(y) * 0.95, max(y) * 1.1))
axis(1, at = 1:2, labels = c("ER neg", "ER pos"), tick = FALSE)
text(0.5, 1.09 * max(y), labels = p_label(y, set$ER), adj = 0)
mtext("D", side = 2, line = 1, at = 13.2, cex = 2, las = 1, outer = FALSE)
par(fig = c(0.33, 0.67, 0, 0.5), new = TRUE)
boxplot(y ~ set$node, col = c(4, 2), las = 1, ylab = "Endothelial metagene",
        xaxt = "n", ylim = c(min(y) * 0.95, max(y) * 1.1))
axis(1, at = 1:2, labels = c("node neg", "node pos"), tick = FALSE)
text(0.5, 1.09 * max(y), labels = p_label(y, set$node), adj = 0)
mtext("E", side = 2, line = 1, at = 13.2, cex = 2, las = 1, outer = FALSE)
par(fig = c(0.67, 1, 0, 0.5), new = TRUE)
boxplot(y ~ set$tum2, col = c(4, 2), las = 1, ylab = "Endothelial metagene",
        xaxt = "n", ylim = c(min(y) * 0.95, max(y) * 1.1))
axis(1, at = 1:2, labels = c("size<2cm", "size>2cm"), tick = FALSE)
text(0.5, 1.09 * max(y), labels = p_label(y, set$tum2), adj = 0)
mtext("F", side = 2, line = 1, at = 13.2, cex = 2, las = 1, outer = FALSE)
dev.off()

# read scores of TCGA BRCA tumors (stroma histology type)
load(file = paste0(path, "stromtype.RData"))
load(file = paste0(path, TBRCA))

# generate figure of ECM signature vs stroma scores (Figure)
tiff(file = paste0(path, "fig3.tif"), width = 2200, height = 1500,
     res = 300, compression = "lzw")
ecm_score <- colMeans(set$data[necm, TCGAscore$ID])
par(fig = c(0, 0.22, 0, 1))
par(mar = c(2.5, 4, 1, 1))
boxplot(ecm_score ~ stromtype, las = 1, xaxt = "n", col = c(4, 2),
        ylab = "ECM metagene")
axis(1, at = c(1:2), tick = FALSE, labels = c("S", "M"), line = -1)
title(xlab = "stroma type", line = 1)
mtext("A", side = 2, line = 2.5, at = 20.7, cex = 2, las = 1, outer = FALSE)
par(fig = c(0.22, 0.5, 0, 1), new = TRUE)
boxplot(ecm_score ~ TCGAscore$stroma, las = 1, xaxt = "n",
        col = c("gray80", "gray70", "gray60"), ylab = "ECM metagene")
axis(1, at = c(1:3), tick = FALSE, labels = 1:3, line = -1)
title(xlab = "stroma score", line = 1)
mtext("B", side = 2, line = 2.5, at = 20.7, cex = 2, las = 1, outer = FALSE)
par(fig = c(0.5, 1, 0, 1), new = TRUE)
boxplot(ecm_score ~ TCGAscore$stroma + stromtype, las = 1, xaxt = "n",
        col = c("blue2", "blue3", "blue4", "red2", "red3", "red4"),
        ylab = "ECM metagene")
axis(1, at = c(1:6), tick = FALSE,
     labels = c("S1", "S2", "S3", "M1", "M2", "M3"), line = -1)
title(xlab = "stroma type+score", line = 1)
mtext("C", side = 2, line = 2.5, at = 20.7, cex = 2, las = 1, outer = FALSE)
dev.off()

# identify genes that are prognostic in multivariate model (stratify for node
# and ER); top pairs end up in topres
load(file = paste0(path, TBRCA))
ng <- ecmset
n <- set$stage < 4
n[is.na(n)] <- FALSE

# all gene pairs (i < j), in the same order as a nested i/j loop
gene_pairs <- combn(ng, 2)
gi <- gene_pairs[1, ]
gj <- gene_pairs[2, ]

# one row per pair: HR and p of gene 1, HR and p of gene 2, likelihood-ratio p;
# the matrix is allocated once instead of being grown with rbind()
res <- matrix(NA_real_, nrow = ncol(gene_pairs), ncol = 5)
for (k in seq_len(ncol(gene_pairs))) {
  cx <- summary(coxph(survrc(set$rtime, set$revent, 1825) ~
                        set$data[gi[k], ] + set$data[gj[k], ] +
                        strata(set$node) + strata(set$ER) + strata(set$tum2),
                      subset = n))
  res[k, ] <- c(cx$coefficients[1, c(2, 5)], cx$coefficients[2, c(2, 5)],
                cx$logtest[3])
}

# format for the table: HRs with 2, p-values with 6 and likelihood p-value
# with 5 decimals
options(scipen = 10)
res_txt <- cbind(
  formatC(round(res[, 1], 2), format = "f", digits = 2),
  formatC(round(res[, 2], 6), format = "f", digits = 6),
  formatC(round(res[, 3], 2), format = "f", digits = 2),
  formatC(round(res[, 4], 6), format = "f", digits = 6),
  formatC(round(res[, 5], 5), format = "f", digits = 5)
)
resfin <- data.frame(cbind(gi, gj, res_txt), stringsAsFactors = FALSE)
# rank on the unrounded likelihood-ratio p-value: after rounding to five
# decimals the most significant pairs all read "0.00000" and would be ranked
# by loop order rather than by significance
resfin <- resfin[order(res[, 5]), ]
# orient every pair so that gene 1 is the one with HR <= 1; the HR columns
# hold formatted text, so they are converted before comparing (comparing the
# strings themselves depends on the collation order)
swap <- as.numeric(resfin[, 3]) > 1
swap[is.na(swap)] <- FALSE
resfin[swap, ] <- resfin[swap, c(2, 1, 5, 6, 3, 4, 7)]
colnames(resfin) <- c("gene 1", "gene 2", "HR gene 1", "p-val gene 1",
                      "HR gene 2", "p-val gene 2", "p-val likelihood")
# head() does not pad with NA rows when fewer than 100 pairs exist
topres <- head(resfin, 100)
hr_gene1 <- as.numeric(topres[, 3])
hr_gene2 <- as.numeric(topres[, 5])

# genes that cause a lower HR in ecmn
gcox <- c(topres[which(hr_gene1 < 1), 1], topres[which(hr_gene2 < 1), 2])
print(table(gcox))
ecmn <- names(table(gcox))[table(gcox) > 5]

# genes that cause a higher HR in ecmp
gcox <- c(topres[which(hr_gene1 > 1), 1], topres[which(hr_gene2 > 1), 2])
print(table(gcox))
ecmp <- names(table(gcox))[table(gcox) > 5]

# generate HR plot of different cancer datasets (Fig 4)
sets <- c(TBRCA, TCOAD, THNSC, TKIRC, TLUAD, TLUSC, GEOD21653, EMTAB365)
setnames <- c("BRCA", "COAD", "HNSC", "KIRC", "LUAD", "LUSC",
              "GEOD21653", "EMTAB365")
# columns: HR, lower, upper of ecmn metagene, then of ecmp metagene
y <- matrix(NA_real_, nrow = length(sets), ncol = 6)
yuni <- matrix(NA_real_, nrow = length(sets), ncol = 6)
for (i in seq_along(sets)) {
  load(paste0(path, sets[i]))
  x12 <- unlist(lapply(ecmn, function(x) which(set$key[, 2] == x)))
  x13 <- unlist(lapply(ecmp, function(x) which(set$key[, 2] == x)))
  # drop = FALSE keeps a matrix when a signature maps to a single row
  x12 <- scale(colMeans(set$data[set$key[x12, 1], , drop = FALSE]))
  x13 <- scale(colMeans(set$data[set$key[x13, 1], , drop = FALSE]))
  if (i == 1) {
    # TCGA BRCA: recurrence, stage < 4, adjusted for node and ER
    n <- set$stage < 4
    n[is.na(n)] <- FALSE
    cx <- summary(coxph(Surv(set$rtime, set$revent) ~
                          x12 + x13 + set$node + set$ER, subset = n))
    cx1 <- summary(coxph(Surv(set$rtime, set$revent) ~
                           x12 + set$node + set$ER, subset = n))
    cx2 <- summary(coxph(Surv(set$rtime, set$revent) ~
                           x13 + set$node + set$ER, subset = n))
  } else if (i == 7 || i == 8) {
    # breast cancer microarray sets
    cx <- summary(coxph(Surv(set$time, set$event) ~
                          x12 + x13 + set$node + set$ER))
    cx1 <- summary(coxph(Surv(set$time, set$event) ~
                           x12 + set$node + set$ER))
    cx2 <- summary(coxph(Surv(set$time, set$event) ~
                           x13 + set$node + set$ER))
  } else {
    # other TCGA sets: overall survival, stratified for stage and age
    cx <- summary(coxph(Surv(set$dtime, set$devent) ~
                          x12 + x13 + strata(set$stage) + strata(set$age)))
    cx1 <- summary(coxph(Surv(set$dtime, set$devent) ~
                           x12 + strata(set$stage) + strata(set$age)))
    cx2 <- summary(coxph(Surv(set$dtime, set$devent) ~
                           x13 + strata(set$stage) + strata(set$age)))
  }
  y[i, ] <- c(cx$conf.int[1, c(1, 3, 4)], cx$conf.int[2, c(1, 3, 4)])
  yuni[i, ] <- c(cx1$conf.int[1, c(1, 3, 4)], cx2$conf.int[1, c(1, 3, 4)])
}
# reverse so that the first data set is drawn at the top
y <- y[nrow(y):1, ]
yuni <- yuni[nrow(yuni):1, ]
tiff(file = paste0(path, "fig4.tif"), width = 2000, height = 1600,
     res = 300, compression = "lzw")
plot(-1, -1, ylim = c(0, nrow(y) + 1), log = "x", xlim = c(0.1, 10),
     yaxt = "n", xlab = "Hazard ratio", ylab = "", cex = 0.7)
abline(v = 1)
for (i in seq_len(nrow(y))) {
  lines(y[i, 2:3], c(i - 5 / 6, i - 5 / 6), lty = 1, col = 4, lwd = 2)
  lines(y[i, 5:6], c(i - 4 / 6, i - 4 / 6), lty = 1, col = 2, lwd = 2)
  lines(yuni[i, 2:3], c(i - 2 / 6, i - 2 / 6), lty = 1, col = "blue4", lwd = 2)
  lines(yuni[i, 5:6], c(i - 1 / 6, i - 1 / 6), lty = 1, col = "red4", lwd = 2)
  abline(h = i, lty = 2)
  # rows were reversed above, so set i sits in band nrow(y) + 1 - i
  text(0.1, nrow(y) + 1 - i - 1 / 3, labels = setnames[i], cex = 1, adj = 0)
}
lines(c(0.1, 0.15), rep(nrow(y) + 0.8, 2), lty = 1, lwd = 2, col = "blue4")
lines(c(0.15, 0.225), rep(nrow(y) + 0.8, 2), lty = 1, lwd = 2, col = "red4")
lines(c(0.1, 0.15), rep(nrow(y) + 0.4, 2), lty = 1, lwd = 2, col = 4)
lines(c(0.15, 0.225), rep(nrow(y) + 0.4, 2), lty = 1, lwd = 2, col = 2)
text(0.24, nrow(y) + 0.85, labels = "univariate", pos = 4)
text(0.24, nrow(y) + 0.45, labels = "multivariate", pos = 4)
dev.off()

# plot HR of ecmn and ecmp genes in breast cancer microarray sets (Fig 5)
sets <- c(GEOD21653, EMTAB365)

# One probe per gene in the currently loaded `set`: when a gene has several
# probes the one with the highest mean expression is used; genes without a
# probe are skipped. The result is named by gene symbol.
pick_probes <- function(genes) {
  probes <- NULL
  for (g in genes) {
    x <- set$key[which(set$key[, 2] == g), 1]
    if (length(x) > 1) {
      x <- x[which.max(rowMeans(set$data[x, ]))]
    }
    if (length(x) == 1) {
      names(x) <- g
    }
    probes <- c(probes, x)
  }
  probes
}

# x1/x2: ecmp genes adjusted for the ecmn metagene / alone
# x3/x4: ecmn genes adjusted for the ecmp metagene / alone
# rows of the first data set come first, then those of the second
x1 <- NULL
x2 <- NULL
x3 <- NULL
x4 <- NULL
for (i in seq_along(sets)) {
  load(paste0(path, sets[i]))
  probesp <- pick_probes(ecmp)
  probesn <- pick_probes(ecmn)
  for (k in seq_along(probesp)) {
    cx <- summary(coxph(survrc(set$time, set$event, 1825) ~
                          scale(set$data[probesp[k], ]) +
                          scale(colMeans(set$data[probesn, ])) +
                          strata(set$ER) + strata(set$node)))
    x1 <- rbind(x1, cx$conf.int[1, c(1, 3, 4)])
    cx <- summary(coxph(survrc(set$time, set$event, 1825) ~
                          scale(set$data[probesp[k], ]) +
                          strata(set$ER) + strata(set$node)))
    x2 <- rbind(x2, cx$conf.int[1, c(1, 3, 4)])
  }
  for (k in seq_along(probesn)) {
    cx <- summary(coxph(survrc(set$time, set$event, 1825) ~
                          scale(set$data[probesn[k], ]) +
                          scale(colMeans(set$data[probesp, ])) +
                          strata(set$ER) + strata(set$node)))
    x3 <- rbind(x3, cx$conf.int[1, c(1, 3, 4)])
    cx <- summary(coxph(survrc(set$time, set$event, 1825) ~
                          scale(set$data[probesn[k], ]) +
                          strata(set$ER) + strata(set$node)))
    x4 <- rbind(x4, cx$conf.int[1, c(1, 3, 4)])
  }
}

# Draw one panel of Fig 5: confidence intervals of per-gene hazard ratios in
# the two microarray sets.
#   file  : output TIFF
#   multi : HR/lower/upper rows of the adjusted models
#   uni   : same for the single-gene models
#   genes : gene labels, one per gene; the first length(genes) rows of
#           multi/uni belong to GEOD21653 (blue), the rest to EMTAB365 (red)
#   panel : panel letter
# Gene i is drawn in the band that carries its label (first gene at the top),
# and the data-set key uses the colours that are actually plotted.
plot_gene_hr <- function(file, multi, uni, genes, panel) {
  n_genes <- length(genes)
  stopifnot(nrow(multi) == 2 * n_genes, nrow(uni) == 2 * n_genes)
  key_hi <- n_genes + 0.8
  key_lo <- n_genes + 0.4
  tiff(file = file, width = 2000, height = 1600, res = 300,
       compression = "lzw")
  plot(-1, -1, ylim = c(0, n_genes + 1), log = "x", xlim = c(0.3, 3.3),
       yaxt = "n", xlab = "Hazard ratio", ylab = "", cex = 0.7)
  abline(v = 1)
  for (i in seq_len(n_genes)) {
    band <- n_genes + 1 - i
    lines(multi[i, 2:3], rep(band - 5 / 6, 2), lty = 1, col = "blue", lwd = 2)
    lines(uni[i, 2:3], rep(band - 4 / 6, 2), lty = 1, col = "blue4", lwd = 2)
    lines(multi[i + n_genes, 2:3], rep(band - 2 / 6, 2), lty = 1,
          col = "red", lwd = 2)
    lines(uni[i + n_genes, 2:3], rep(band - 1 / 6, 2), lty = 1,
          col = "red4", lwd = 2)
    abline(h = i, lty = 2)
    text(0.3, band - 1 / 3, labels = genes[i], cex = 1, adj = 0)
  }
  # key: model type
  lines(c(0.3, 0.36), rep(key_hi, 2), lty = 1, lwd = 2, col = "red4")
  lines(c(0.36, 0.432), rep(key_hi, 2), lty = 1, lwd = 2, col = "blue4")
  lines(c(0.3, 0.36), rep(key_lo, 2), lty = 1, lwd = 2, col = "red")
  lines(c(0.36, 0.432), rep(key_lo, 2), lty = 1, lwd = 2, col = "blue")
  text(0.44, n_genes + 0.85, labels = "univariate", pos = 4)
  text(0.44, n_genes + 0.45, labels = "multivariate", pos = 4)
  # key: data set (first set is plotted in blue, second in red)
  lines(c(1.05, 1.25), rep(key_hi, 2), lty = 1, lwd = 2, col = "blue4")
  lines(c(1.25, 1.5), rep(key_hi, 2), lty = 1, lwd = 2, col = "blue")
  lines(c(1.05, 1.25), rep(key_lo, 2), lty = 1, lwd = 2, col = "red4")
  lines(c(1.25, 1.5), rep(key_lo, 2), lty = 1, lwd = 2, col = "red")
  text(1.6, n_genes + 0.85, labels = "GEOD21653", pos = 4)
  text(1.6, n_genes + 0.45, labels = "EMTAB365", pos = 4)
  mtext(panel, side = 3, line = 0, cex = 2, adj = 0, outer = FALSE)
  dev.off()
}

# labels are the genes for which a probe was found (taken from the last set;
# plot_gene_hr() stops if the two sets did not yield the same number of genes)
plot_gene_hr(paste0(path, "fig5A.tif"), x3, x4, names(probesn), "A")
plot_gene_hr(paste0(path, "fig5B.tif"), x1, x2, names(probesp), "B")

# plot expression tumors in normal tissue and in tumor tissue (Fig 6)
load(paste0(path, TBRCAnorm))
mod <- as.matrix(log2(mRNAnormdata + 1))
rownames(mod) <- mRNAID[, 1]
load(file = paste0(path, TBRCA))
# keep the normal samples whose column name also occurs among the tumors
x <- match(colnames(mod), colnames(set$data))
np <- mod[, !is.na(x)]
x <- x[!is.na(x)]
tp <- set$data[, x]
tiff(file = paste0(path, "fig6.tif"), width = 2700, height = 1600,
     res = 300, compression = "lzw")
options(scipen = 3)
gns <- c(ecmn, ecmp)
par(mfrow = c(3, 6))
par(mar = c(1, 3, 2, 1))
par(oma = c(0, 2, 0, 0))
par(font.main = 4)
for (i in seq_along(gns)) {
  tum <- tp[gns[i], ]
  nrm <- np[gns[i], ]
  ymin <- min(c(tum, nrm))
  ymax <- max(c(tum, nrm))
  y1 <- ymin - 0.1 * (ymax - ymin)
  y2 <- ymax + 0.1 * (ymax - ymin)
  boxplot(tum, nrm, main = gns[i], ylim = c(y1, y2), xaxt = "n",
          col = c(2, 4), las = 1)
  tt <- t.test(tum, nrm)
  text(1.3, ymax, labels = signif(tt$p.value, 2), pos = 3)
  if (i == 1) {
    mtext("A", side = 2, line = 0.8, cex = 1.6, at = 20, las = 1,
          outer = FALSE)
  }
  if (i == 7) {
    mtext("B", side = 2, line = 0.8, cex = 1.6, at = 17.5, las = 1,
          outer = FALSE)
  }
  if (i == 11) {
    # empty panel that only carries the legend
    plot(0, 0, type = "n", xlim = c(0, 10), ylim = c(0, 10), xaxt = "n",
         yaxt = "n", xlab = "", ylab = "")  # type="n" hides the points
    box("plot", col = "white")
    legend(-1.5, 8, bty = "n", pch = 15, col = c("red", "blue"),
           legend = c("cancer", "normal"), cex = 1.6)
  }
}
# ecmp genes relative to the ECM metagene of the same sample
gns <- c(ecmp)
tp_meta <- colMeans(tp[necm, ])
np_meta <- colMeans(np[necm, ])
for (i in seq_along(gns)) {
  tum <- tp[gns[i], ] - tp_meta
  nrm <- np[gns[i], ] - np_meta
  ymin <- min(c(tum, nrm))
  ymax <- max(c(tum, nrm))
  y1 <- ymin - 0.1 * (ymax - ymin)
  y2 <- ymax + 0.1 * (ymax - ymin)
  boxplot(tum, nrm, main = gns[i], ylim = c(y1, y2), xaxt = "n",
          col = c(2, 4), las = 1)
  tt <- t.test(tum, nrm)
  text(1.3, y2, labels = signif(tt$p.value, 2))
  if (i == 1) {
    mtext("C", side = 2, line = 0.8, cex = 1.6, at = (-0.8), las = 1,
          outer = FALSE)
  }
}
mtext("log2 expression", side = 2, line = 0, cex = 1, outer = TRUE)
mtext("log2 expression", side = 2, line = 0, cex = 1, outer = TRUE, adj = 0.93)
mtext("log2 ratio", side = 2, line = 0, cex = 1, outer = TRUE, adj = 0.12)
dev.off()

# find and plot expression levels in mouse and human cells (stroma vs tumor)
# - xenografts
pathx <- paste0(path, GSE66744)
x <- dir(pathx)
samples <- x[grep("Mammary", x)]
samples <- c(samples, x[grep("Mouse", x)])
samples <- samples[3:18]
fact <- c(1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4)

# mouse-style symbol of a human gene symbol: first character kept, rest in
# lower case
mouse_symbol <- function(x) {
  paste0(substr(x, 1, 1), tolower(substr(x, 2, nchar(x))))
}

# log2(per-million scaled value + 1) of `genes` in every file of `samples`;
# the third column of each file is scaled to a total of one million. Files
# whose name contains "human" are matched on the symbols as given, files whose
# name contains "mouse" on the mouse-style symbols. One row per file, one
# column per gene; NA where a symbol is not found.
xeno_expression <- function(genes) {
  genes_mouse <- mouse_symbol(genes)
  rows <- list()
  for (s in samples) {
    x <- read.table(paste0(pathx, s), stringsAsFactors = FALSE,
                    header = TRUE, sep = "\t")
    xl <- log2(1e6 * x[, 3] / sum(x[, 3]) + 1)
    if (grepl("human", s)) {
      rows[[length(rows) + 1]] <- xl[match(genes, x[, 1])]
    }
    if (grepl("mouse", s)) {
      rows[[length(rows) + 1]] <- xl[match(genes_mouse, x[, 1])]
    }
  }
  do.call(rbind, rows)
}

ecmn_mouse <- mouse_symbol(ecmn)
ecmp_mouse <- mouse_symbol(ecmp)
contr <- c("ESR1", "YWHAZ", "SDHA", "UBC")
contr_mouse <- mouse_symbol(contr)
gdata <- xeno_expression(c(ecmn, ecmp, contr))

tiff(file = paste0(path, "fig7.tif"), width = 2500, height = 1600,
     res = 300, compression = "lzw")
par(mfrow = c(3, 6))
par(mar = c(1, 2, 2, 0.5))
par(oma = c(0, 2, 0, 0))
for (i in seq_len(ncol(gdata))) {
  # na.rm: a symbol that is missing in one species must not break the panel
  ymax <- max(gdata[, i], na.rm = TRUE)
  plot(jitter(fact), gdata[, i], col = c(2, 4, 2, 4), xaxt = "n", las = 1,
       ylab = "log2 RPKM", pch = 19, xlab = "",
       main = c(ecmn, ecmp, contr)[i], font.main = 4,
       ylim = c(0, ymax * 1.3), xlim = c(0.5, 4.5))
  lines(c(0.7, 2.3), c(ymax * 1.02, ymax * 1.02))
  lines(c(2.7, 4.3), c(ymax * 1.02, ymax * 1.02))
  lines(c(2.5, 2.5), c(0, ymax * 1.05), lty = 2)
  text(1.5, ymax * 1.01, pos = 3, labels = "MCF7")
  text(3.5, ymax * 1.01, pos = 3, labels = " MDA-\nMB-231")
  if (i == 11) {
    frame()
  }
  if (i == 15) {
    plot(0, 0, type = "n", xlim = c(0, 10), ylim = c(0, 10), xaxt = "n",
         yaxt = "n", xlab = "", ylab = "")  # type="n" hides the points
    box("plot", col = "white")
    legend(-1, 8, bty = "n", pch = 16, col = c("red", "blue"),
           legend = c("cancer \ncells", "stroma"), cex = 1.6)
    frame()
  }
}
mtext("log2 expression", side = 2, line = 0, cex = 1, adj = 0.48, outer = TRUE)
mtext("log2 expression", side = 2, line = 0, cex = 1, outer = TRUE, adj = 0.91)
mtext("log2 expression", side = 2, line = 0, cex = 1, outer = TRUE, adj = 0.06)
mtext("A", side = 2, line = 0, cex = 1.6, outer = TRUE, at = 0.98, las = 1)
mtext("B", side = 2, line = 0, cex = 1.6, outer = TRUE, at = 0.64, las = 1)
mtext("C", side = 2, line = 0, cex = 1.6, outer = TRUE, at = 0.30, las = 1)
dev.off()

# boxplot of ECM and endothelial clusters in TCGA tumors (Figure S3)
sets <- c(TBRCA, TCOAD, THNSC, TKIRC, TLUAD, TLUSC)
setnames <- c("BRCA", "COAD", "HNSC", "KIRC", "LUAD", "LUSC")
x <- NULL
y <- NULL
ind <- NULL
for (i in seq_along(sets)) {
  load(paste0(path, sets[i]))
  x <- c(x, colMeans(set$data[necm, ]))
  y <- c(y, colMeans(set$data[nend, ]))
  ind <- c(ind, rep(i, ncol(set$data)))
}
tiff(file = paste0(path, "Fig S3.tif"), width = 1600, height = 3200,
     res = 300, compression = "lzw")
par(mfrow = c(2, 1))
boxplot(x ~ ind, xaxt = "n", main = "ECM metagene",
        ylab = "mean log2 expression")
axis(1, at = seq_along(setnames), tick = FALSE, labels = setnames)
boxplot(y ~ ind, xaxt = "n", main = "Endothelial metagene",
        ylab = "mean log2 expression")
axis(1, at = seq_along(setnames), tick = FALSE, labels = setnames)
dev.off()

# find and plot expression levels of P4HAs in mouse and human cells (stroma vs
# tumor) - Fig 8A; same sample files as for the xenograft figure above
genes <- c("P4HA1", "P4HA2", "P4HA3")
genes_mouse <- mouse_symbol(genes)
gdata <- xeno_expression(genes)
tiff(file = paste0(path, "fig8A.tif"), width = 1600, height = 800,
     res = 300, compression = "lzw")
par(oma = c(0, 2, 0, 0))
par(mfrow = c(1, 4))
par(mar = c(1, 2, 2, 0.5))
par(font.main = 4)
for (i in seq_len(ncol(gdata))) {
  ymax <- max(gdata[, i], na.rm = TRUE)
  plot(jitter(fact), gdata[, i], col = c(2, 4, 2, 4), xaxt = "n", las = 1,
       ylab = "log2 RPKM", pch = 19, xlab = "", main = c(genes)[i],
       ylim = c(0, ymax * 1.3), xlim = c(0.5, 4.5))
  lines(c(0.7, 2.3), c(ymax * 1.02, ymax * 1.02))
  lines(c(2.7, 4.3), c(ymax * 1.02, ymax * 1.02))
  lines(c(2.5, 2.5), c(0, ymax * 1.05), lty = 2)
  text(1.5, ymax * 1.01, pos = 3, labels = "MCF7")
  text(3.5, ymax * 1.01, pos = 3, labels = " MDA-\nMB-231")
}
mtext("log2 expression", side = 2, line = 0, cex = 1, adj = 0.48, outer = TRUE)
mtext("A", side = 2, line = 0, cex = 1.6, outer = TRUE, at = 0.95, las = 1)
plot(0, 0, type = "n", xlim = c(0, 10), ylim = c(0, 10), xaxt = "n",
     yaxt = "n", xlab = "", ylab = "")  # type="n" hides the points
box("plot", col = "white")
legend(-1, 8, bty = "n", pch = 16, col = c("red", "blue"),
       legend = c("cancer \ncells", "stroma"), cex = 1.2)
dev.off()

# plot expression in tumors and in normal (normalized to ECM metagene) tissue
# of P4HAs - Fig 8B
load(paste0(path, TBRCAnorm))
mod <- as.matrix(log2(mRNAnormdata + 1))
rownames(mod) <- mRNAID[, 1]
load(file = paste0(path, TBRCA))
# Fig 8B (continued): keep the normal samples whose column name also occurs
# among the tumors, and the tumor columns they match
x <- match(colnames(mod), colnames(set$data))
np <- mod[, !is.na(x)]
x <- x[!is.na(x)]
tp <- set$data[, x]
options(scipen = 3)
gns <- c("P4HA1", "P4HA2", "P4HA3")
tiff(file = paste0(path, "fig8B.tif"), width = 1600, height = 800,
     res = 300, compression = "lzw")
par(oma = c(0, 2, 0, 0))
par(mfrow = c(1, 4))
par(mar = c(1, 2, 2, 0.5))
par(font.main = 4)
# ECM metagene per sample, computed once for all genes
tp_meta <- colMeans(tp[necm, ])
np_meta <- colMeans(np[necm, ])
for (i in seq_along(gns)) {
  tum <- tp[gns[i], ] - tp_meta
  nrm <- np[gns[i], ] - np_meta
  ymin <- min(c(tum, nrm))
  ymax <- max(c(tum, nrm))
  y1 <- ymin - 0.1 * (ymax - ymin)
  y2 <- ymax + 0.1 * (ymax - ymin)
  boxplot(tum, nrm, main = gns[i], ylim = c(y1, y2), xaxt = "n",
          col = c(2, 4), las = 1)
  tt <- t.test(tum, nrm)
  text(1.3, y2, labels = signif(tt$p.value, 2))
}
mtext("log2 ratio", side = 2, line = 0, cex = 1, adj = 0.48, outer = TRUE)
mtext("B", side = 2, line = 0, cex = 1.6, outer = TRUE, at = 0.95, las = 1)
# empty fourth panel that only carries the legend
plot(0, 0, type = "n", xlim = c(0, 10), ylim = c(0, 10), xaxt = "n",
     yaxt = "n", xlab = "", ylab = "")  # type="n" hides the points
box("plot", col = "white")
legend(-1, 8, bty = "n", pch = 15, col = c("red", "blue"),
       legend = c("cancer", "normal"), cex = 1.2)
dev.off()

# find cor coefficients for P4HAs and ECM signature in TCGA data Fig 8D
sets <- c(TBRCA, TCOAD, THNSC, TKIRC, TLUAD, TLUSC)
setnames <- c("BRCA", "COAD", "HNSC", "KIRC", "LUAD", "LUSC")
x <- NULL
for (i in seq_along(sets)) {
  load(paste0(path, sets[i]))
  ecm_meta <- colMeans(set$data[necm, ])
  x <- cbind(x, c(cor(set$data["P4HA1", ], ecm_meta),
                  cor(set$data["P4HA2", ], ecm_meta),
                  cor(set$data["P4HA3", ], ecm_meta)))
}
colnames(x) <- setnames
rownames(x) <- c("P4HA1", "P4HA2", "P4HA3")
# reverse so that P4HA1 ends up at the top of the plot
x <- x[nrow(x):1, ]
cols <- c("red1", "orange1", "green1", "cyan1", "blue1", "purple1")
tiff(file = paste0(path, "fig8D.tif"), width = 1600, height = 1400,
     res = 300, compression = "lzw")
par(mar = c(5, 6, 4, 2))
# the point at (1000, 1000) lies outside the limits; it only sets up the axes
plot(1000, 1000, ylim = c(0, 3), xlim = c(0, 1), yaxt = "n",
     xlab = "correlation coefficient with ECM metagene", ylab = "")
for (i in nrow(x):1) {
  points(x[i, ], rep(i * 0.5, length(sets)), col = cols, pch = 19)
}
axis(2, at = c(0.5 * (1:3)), labels = rownames(x), tick = FALSE, las = 1,
     font = 3)
legend(0.2, 3, pch = 19, col = cols[1:3], legend = setnames[1:3], bty = "n")
legend(0.5, 3, pch = 19, col = cols[4:6], legend = setnames[4:6], bty = "n")
mtext("D", side = 2, line = 2, cex = 1.6, at = 3, las = 1)
dev.off()

# plot HRs of P4HAs in breast cancer microarray sets and TCGA BRCA - Fig 8E
# yy: one column per data set; per gene six rows:
#     HR, lower, upper of the gene alone, then the same adjusted for the ECM
#     metagene (all models stratified for ER and node, censored at 1825 days)
gns <- c("P4HA1", "P4HA2", "P4HA3")
sets <- c(GEOD21653, EMTAB365)
yy <- NULL
for (i in seq_along(sets)) {
  load(paste0(path, sets[i]))
  # probes of the ECM metagene genes; identical for every gene of this set
  x13 <- set$key[unlist(lapply(necm, function(x) which(set$key[, 2] == x))), 1]
  ecm_meta <- scale(colMeans(set$data[x13, ]))
  y <- NULL
  for (j in seq_along(gns)) {
    x12 <- which(set$key[, 2] == gns[j])
    # drop = FALSE: rowMeans() needs a matrix even if the gene has one probe
    probe_means <- rowMeans(set$data[set$key[x12, 1], , drop = FALSE])
    x12 <- names(probe_means)[which.max(probe_means)]
    y <- c(y, summary(coxph(survrc(set$time, set$event, 1825) ~
                              scale(set$data[x12, ]) +
                              strata(set$ER) +
                              strata(set$node)))$conf.int[1, c(1, 3:4)])
    y <- c(y, summary(coxph(survrc(set$time, set$event, 1825) ~
                              scale(set$data[x12, ]) + ecm_meta +
                              strata(set$ER) +
                              strata(set$node)))$conf.int[1, c(1, 3:4)])
  }
  yy <- cbind(yy, y)
}
# TCGA BRCA: recurrence, rows of set$data are addressed by gene symbol
y <- NULL
load(file = paste0(path, TBRCA))
ecm_meta <- scale(colMeans(set$data[necm, ]))
for (j in seq_along(gns)) {
  y <- c(y, summary(coxph(survrc(set$rtime, set$revent, 1825) ~
                            scale(set$data[gns[j], ]) +
                            strata(set$ER) +
                            strata(set$node)))$conf.int[1, c(1, 3:4)])
  y <- c(y, summary(coxph(survrc(set$rtime, set$revent, 1825) ~
                            scale(set$data[gns[j], ]) + ecm_meta +
                            strata(set$ER) +
                            strata(set$node)))$conf.int[1, c(1, 3:4)])
}
yy <- cbind(yy, y)
rn <- rep(gns, each = 6)
rn <- paste(rn, rep(c("uni HR", "l", "h", "multi HR", "l", "h"), length(gns)))
rownames(yy) <- rn

# NOTE(review): nothing from RColorBrewer is used in the code below; the call
# is kept because other code may rely on the package being attached
library(RColorBrewer)
setnames <- c("GEOD21653", "EMTAB365", "TCGA BRCA")
# per data set: colour of the single-gene and of the adjusted model
cols <- c("red4", "red", "blue4", "blue", "black", "gray50")
tiff(file = paste0(path, "fig8E.tif"), width = 1800, height = 1400,
     res = 300, compression = "lzw")
plot(1000, 1000, ylim = c(-28, 0), xlim = c(0.6, 6), log = "x", xaxt = "n",
     yaxt = "n", xlab = "HR", ylab = "")
axis(1, at = c(0.7, 1, 1.5, 2.5, 4))
abline(v = 1)
for (i in seq_along(gns)) {
  for (j in seq_along(setnames)) {
    # each gene occupies 11 units; a data set takes two adjacent positions
    pos_uni <- -((i - 1) * 11 + 2 * j - 1)
    pos_multi <- -((i - 1) * 11 + 2 * j)
    points(yy[i * 6 - 5, j], pos_uni, col = cols[2 * j - 1])
    points(yy[i * 6 - 2, j], pos_multi, col = cols[2 * j])
    lines(c(yy[i * 6 - 4, j], yy[i * 6 - 3, j]), c(pos_uni, pos_uni),
          col = cols[2 * j - 1], lty = 1, lwd = 2)
    lines(c(yy[i * 6 - 1, j], yy[i * 6, j]), c(pos_multi, pos_multi),
          col = cols[2 * j], lty = 1, lwd = 2)
  }
  text(0.63, -(i * 11) + 8, labels = gns[i], font = 3)
}
# one legend entry per data set and model, in the order of `cols`
lgds <- paste(rep(setnames, each = 2),
              rep(c("uni", "multi"), length(setnames)))
legend(2.2, -1.1, legend = lgds, bty = "n", lty = 1, lwd = 2, col = cols)
mtext("E", side = 2, line = 0.4, cex = 1.6, at = 0, las = 1)
dev.off()