# R code used to generate figures for Wertheim et al MELP manuscript # # final version by S. Master, UPenn, 7/15/13 (update from prior stable version, 12/17/12) # also run load_sampdata.R source("load_sampdata.R") # prep for Figure 3a # GSE18700 files obtained from GEO data <- read.delim("GSE18700_series_matrix-1.txt",comment.char="!") data2 <- read.delim("GSE18700_series_matrix-2.txt",comment.char="!") data2b <- data2[,-1] data3 <- cbind(data, data2b) row.names(data3) <- data3[,1] data3 <- data3[,-1] d3count <- apply(data3,1,function(x) { sum(x>2)}) d3min <- apply(data3,1,min) d3mean <- apply(data3,1,mean) d3ymax <- apply(data3,1,function(x) {max(density(x)$y)}) d3rows <- row.names(data3) # prep for Figure 3b normfactor <- (data3["MSPI0406S00318682",] + data3["MSPI0406S00653944",] + data3["MSPI0406S00890278",]) / 3 norm_mat <- matrix(rep(normfactor,length(data3[,1])),nrow=length(data3[,1]),byrow=T) data4 <- t(apply(data3,1,'-',unlist(normfactor))) row.names(data4) <- row.names(data3) tlist <- rn names(tlist) <- samp # prep for Figure 4a melp_input <- read.delim("alldata.120411.txt",header=T,row.names=1) oslist <- row.names(melp_input) locuslist <- names(melp_input) kf_train <- read.delim("ktrain.txt",header=F) kf_test <- read.delim("ktest.txt",header=F) kf_valid <- read.delim("kvalid.txt",header=F) # fix a naming issue oslist[grep("6357-1",oslist,fixed=TRUE,value=FALSE)] <- "6357" s_orig <- grep("_",oslist,fixed=TRUE,value=FALSE,invert=TRUE) s_pcr20 <- grep("_20",oslist,fixed=TRUE,value=FALSE) s_pcr11 <- grep("_11",oslist,fixed=TRUE,value=FALSE) melp_mat <- melp_input[s_orig,] melp_mat20 <- melp_input[s_pcr20,] melp_mat11 <- melp_input[s_pcr11,] samplist <- tlist[oslist[s_orig]] whelp <- !is.na(samplist) melp_mat <- melp_mat[whelp,] samplist <- samplist[whelp] row.names(melp_mat) <- samplist r20 <- sub("_..","",row.names(melp_mat20)) s20 <- tlist[r20] w20 <- !is.na(s20) s20 <- s20[w20] melp_mat20 <- melp_mat20[w20,] row.names(melp_mat20) <- s20 r11 <- sub("_..","",row.names(melp_mat11)) s11 <- tlist[r11] w11 <- !is.na(s11) s11 <- s11[w11] melp_mat11<- melp_mat11[w11,] row.names(melp_mat11) <- s11 sub_list <- oslist[] hyb_mat <- melp_input[s_orig] train_sublist <- samplist %in% as.character(kf_train$V3) test_sublist <- samplist %in% as.character(kf_test$V3) valid_sublist <- samplist %in% as.character(kf_valid$V3) tv_sublist <- test_sublist | valid_sublist # Figueroa et al Cancer Cell 2010, Supplementary table 5 figueroaloci <- c("MSPI0406S00783415","MSPI0406S00196536","MSPI0406S00920592","MSPI0406S00710190","MSPI0406S00163833","MSPI0406S00600078","MSPI0406S00914183","MSPI0406S00765490","MSPI0406S00914182","MSPI0406S00914181","MSPI0406S00997890","MSPI0406S00838340","MSPI0406S00136939","MSPI0406S00669709","MSPI0406S00027418","MSPI0406S00589152","MSPI0406S00910305","MSPI0406S00698115") comp_mat_1 <- data3[figueroaloci,samplist] # HELP, unnormalized comp_mat_2 <- data4[figueroaloci,samplist] # HELP, normalized comp_mat_3 <- melp_mat[samplist, figueroaloci] # MELP, unnormalized norm_melp_mat3 <- apply(melp_mat[samplist,c("MSPI0406S00318682","MSPI0406S00653944","MSPI0406S00890278")],1,sum) norm_melp_mat <- norm_melp_mat3 / 3 comp_mat_4 <- melp_mat[samplist, figueroaloci] - norm_melp_mat norm20_melp_mat3 <- apply(melp_mat20[s20,c("MSPI0406S00318682","MSPI0406S00653944","MSPI0406S00890278")],1,sum) norm20_melp_mat <- norm20_melp_mat3 / 3 comp20_mat_4 <- melp_mat20[s20,locuslist] - norm20_melp_mat norm11_melp_mat3 <- apply(melp_mat11[s11,c("MSPI0406S00318682","MSPI0406S00653944","MSPI0406S00890278")],1,sum) norm11_melp_mat <- norm11_melp_mat3 / 3 comp11_mat_4 <- melp_mat11[s11,locuslist] - norm11_melp_mat # ------------- # Figure 3a: Loci chosen for normalization # ------------- par(mfrow=c(1,1)) plot(density(as.numeric(data3[,1])),col=1,ylim=c(0,0.7),xlab="Log Ratio Score",ylab="Density",lwd=3,lty=2,main="Density Plot of Log Ratio Distribution") leg.text <- c("All loci, Normal CD34+ cells","Selected control loci, all cell lines") legend(list(x=-4,y=0.65),legend=leg.text,lwd=3,lty=c(2,1)) for (i in c("MSPI0406S00318682","MSPI0406S00653944","MSPI0406S00890278")) { td <- density(as.numeric(data3[i,])) lines(density(as.numeric(data3[i,])),col=1,lwd=3,lty=1) } # -------- # Figure 3b: correlation with MassArray # -------- malists <- c("5362","6376","2191","2216","7148","2549","7119","2767","2253","1316","2686","3483","2545","2218") # MassArray data obtained from authors of Figueroa et al. massarraydata <- read.delim("MassArray2_no3101.txt",row.names=1,header=F,col.names=c("MSP",tlist[malists])) marows <- row.names(massarraydata) par(mfrow=c(1,2)) plot(as.double(-data3[marows[1],tlist[malists]]),massarraydata[marows[1],tlist[malists]],xlim=c(-max(data3[marows[1:23],tlist[malists]]),-min(data3[marows[1:23],tlist[malists]])),ylim=c(min(massarraydata[marows[1:23],tlist[malists]],na.rm=T),max(massarraydata[marows[1:23],tlist[malists]],na.rm=T)),xlab="(-) HELP, unnormalized",ylab="MassArray") for(i in 1:23) { points(as.double(-data3[marows[i],tlist[malists]]),massarraydata[marows[i],tlist[malists]]) } plot(as.double(-data4[marows[1],tlist[malists]]),massarraydata[marows[1],tlist[malists]],xlim=c(-max(data4[marows[1:23],tlist[malists]]),-min(data4[marows[1:23],tlist[malists]])),ylim=c(min(massarraydata[marows[1:23],tlist[malists]],na.rm=T),max(massarraydata[marows[1:23],tlist[malists]],na.rm=T)),xlab="(-) HELP, normalized",ylab="MassArray") for(i in 1:23) { points(as.double(-data4[marows[i],tlist[malists]]),massarraydata[marows[i],tlist[malists]]) } subset <- !is.na(c(as.matrix(massarraydata[marows[1:23],tlist[malists]]))) mavect1 <- c(as.matrix(-data3[marows[1:23],tlist[malists]])) mavect2 <- c(as.matrix(massarraydata[marows[1:23],tlist[malists]])) cor(mavect1[subset],mavect2[subset]) mavect1 <- c(as.matrix(-data4[marows[1:23],tlist[malists]])) cor(mavect1[subset],mavect2[subset]) # -------- # Figure 4a: HELP vs MELP # -------- # par(mfrow=c(3,6)) ccs <- c(1:18) process_samps <- samplist[train_sublist] distance_matrix <- matrix(NA,nrow=length(process_samps),ncol=length(figueroaloci),dimnames=list(process_samps,figueroaloci)) # not actual figure 4a; this is a alternate that uses only training samples for(i in 1:18) { plot(as.numeric(comp_mat_1[figueroaloci[i],process_samps]),comp_mat_4[process_samps,figueroaloci[i]],xlab="HELP (global norm)",ylab="MELP (local norm)",main=figueroaloci[i]) ccs[i] <- cor(as.numeric(comp_mat_1[figueroaloci[i],process_samps]),comp_mat_4[process_samps,figueroaloci[i]]) helpV <- as.numeric(comp_mat_1[figueroaloci[i],process_samps]) melpV <- comp_mat_4[process_samps,figueroaloci[i]] hmreg <- lm(melpV~helpV) m <- hmreg$coefficients[2] b <- hmreg$coefficients[1] for (j in 1:length(process_samps)) { distance_matrix[j,i] = comp_mat_4[j,i] - ((m * comp_mat_1[i,j]) + b) } abline(hmreg) } # actual figure 4a par(mfrow=c(3,6)) srepeat <- samplist %in% s20 for(i in 1:18) { plot(comp_mat_2[figueroaloci[i],!srepeat],comp_mat_4[!srepeat,figueroaloci[i]],xlab="HELP, normalized",ylab="MELP, normalized",main=figueroaloci[i]) points(comp_mat_2[figueroaloci[i],s20],comp20_mat_4[s20,figueroaloci[i]],col=1) # next 2 lines are extra code to highlight repeat samples, if desired # points(comp_mat_2[figueroaloci[i],srepeat],comp_mat_4[srepeat,figueroaloci[i]],col=2,pch=19) # points(comp_mat_2[figueroaloci[i],s11],comp11_mat_4[s11,figueroaloci[i]],col=4,pch=19) ccs[i] <- cor(comp_mat_2[figueroaloci[i],process_samps],comp_mat_4[process_samps,figueroaloci[i]]) helpV <- comp_mat_2[figueroaloci[i],process_samps] melpV <- comp_mat_4[process_samps,figueroaloci[i]] hmreg <- lm(melpV~helpV) m <- hmreg$coefficients[2] b <- hmreg$coefficients[1] abline(hmreg,col=2,lwd=3) # supplementary code for plotting residuals, not used in paper figs for (j in 1:length(process_samps)) { distance_matrix[j,i] = comp_mat_4[j,i] - ((m * comp_mat_2[i,j]) + b) } } # -------- # Figure 4b: Survival curves # -------- comp_mat_4_comp <- comp_mat_4 # use the 20-cycle repeat data comp_mat_4_comp[s20,] <- comp20_mat_4[s20,] # now train with HELP and test with MELP # 5/21/12 train_samps <- samplist[train_sublist] test_samps <- samplist[test_sublist] validate_samps <- samplist[valid_sublist] train_help <- as.character(kf_train$V3) test_help <- as.character(kf_test$V3) library(MethComp) # contains Deming regression code slopes <- as.vector(length(figueroaloci)) intercepts <- as.vector(length(figueroaloci)) for (i in 1:length(figueroaloci)) { dr <- Deming(data4[figueroaloci[i],train_samps],comp_mat_4_comp[train_samps,figueroaloci[i]]) intercepts[i] <- dr[1] slopes[i] <- dr[2] } sub_data4 <- data4[figueroaloci,] csub_data4 <- sub_data4 * slopes csub_data4 <- csub_data4 + intercepts library(superpc) set.seed(122868) spcx <- csub_data4[figueroaloci,train_help] spcy <- aml_data[train_help,"os"] spcn <- aml_data[train_help,"osec"] spcdata <- list(x=spcx,y=spcy,censoring.status=spcn,featurenames=row.names(spcx)) spcxtest <- csub_data4[figueroaloci,test_help] spcytest <- aml_data[test_help,"os"] spcntest <- aml_data[test_help,"osec"] testdata <- list(x=spcxtest,y=spcytest,censoring.status=spcntest,featurenames=row.names(spcxtest)) spcxvalid <- t(comp_mat_4_comp[c(validate_samps,test_samps),figueroaloci]) hspcxvalid <- data4[figueroaloci,c(validate_samps,test_samps)] spcyvalid <- aml_data[c(validate_samps,test_samps),"os"] spcnvalid <- aml_data[c(validate_samps,test_samps),"osec"] validdata <- list(x=spcxvalid,y=spcyvalid,censoring.status=spcnvalid,featurenames=row.names(spcxvalid)) h_validdata <- list(x=hspcxvalid,y=spcyvalid,censoring.status=spcnvalid,featurenames=row.names(spcxvalid)) spctrain <- superpc.train(spcdata,type="survival") spctr <- superpc.predict.red(spctrain,spcdata,testdata,0.0,prediction.type="continuous",n.components=1) spclf <- superpc.listfeatures(spcdata,spctrain,spctr) # ...and validate on MELP data ttscores <- apply(comp_mat_4_comp[train_samps,spclf[,3]],1,function(x) {sum(x * as.double(spclf[,2]))}) indivscores <- apply(comp_mat_4_comp[c(validate_samps,test_samps),spclf[,3]],1,function(x) {sum(x * as.double(spclf[,2]))}) # plus compare to HELP data h_ttscores <- apply(csub_data4[spclf[,3],train_samps],2,function(x) {sum(x * as.double(spclf[,2]))}) h_indivscores <- apply(csub_data4[spclf[,3],c(validate_samps,test_samps)],2,function(x) {sum(x * as.double(spclf[,2]))}) par(mfrow=c(1,2)) cutbins <- 3 cutpoints <- (0:cutbins) / cutbins ttquant <- quantile(ttscores, cutpoints) h_ttquant <- quantile(h_ttscores,cutpoints) xmax = max(validdata$y) * 1.2 h_totsurv <- survfit(Surv(validdata$y, validdata$censoring.status) ~ as.integer(cut(h_indivscores,h_ttquant))) plot(h_totsurv, col=c("red","blue","orange"), xlab="time (months)", ylab="Prob survival", xlim=c(0,xmax),main="HELP training, HELP validation (all)",lwd=3) m_totsurv <- survfit(Surv(validdata$y, validdata$censoring.status) ~ as.integer(cut(indivscores,ttquant))) plot(m_totsurv, col=c("red","blue","orange"), xlab="time (months)", ylab="Prob survival", xlim=c(0,xmax),main="HELP training, MELP validation (all)",lwd=3) survdiff(Surv(validdata$y, validdata$censoring.status) ~ as.integer(cut(indivscores,ttquant)))