#### ## Code for Eric Melter ## Differential expression and unsupvervised learning ## Started 7-03-09 ## Last modified 03-11 for new folder structure ## WTB #### rm(list=ls()) ## 0) Directory Functions/packages setwd("~/Projects/Meltzer/") .libPaths("~/Scripts/R/ifrit.packages/") source("~/Scripts/SAFE2.0/prune.Cmatrix.R") source("~/Scripts/BinRegInR/binreg.101309.R") source("~/Scripts/R/myfunctions09.R") ## Or load PCA3D() source("~/Scripts/R/functions.0809.R") ## Or load compute.ICC() source("~/Scripts/R/plotfns.R") ## Or load matlab.colors() library(gplots); library(scatterplot3d); library(hgu133plus2.db) library(ROCR);library(limma) library(safe) ################################## ## 0) Load data ################################## DATA.ALL <- read.delim(file="Data/complete_dataset.txt",as.is=T,row.names=1) #dim(DATA.ALL);sum(is.na(DATA.ALL));range(DATA.ALL) names <- gsub("HealthyDonor","Normal",colnames(DATA.ALL)) names <- gsub("_b","_B",names); names <- gsub("_e","_E",names) names[grep("X",names)] <- paste(substr(names[grep("X",names)],7,100), substr(names[grep("X",names)],2,5),sep="_") colnames(DATA.ALL) <- names gene <- unlist(mget(rownames(DATA.ALL),hgu133plus2SYMBOL)) desc <- unlist(mget(rownames(DATA.ALL),hgu133plus2GENENAME)) ################################## ## 1) Analysis of Upper versus Lower ## 7-3-09 ################################## #### a)Read data, create phenotype vector, get annotation keep <- c(grep("U",colnames(DATA.ALL)),grep("L",colnames(DATA.ALL))) keep <- keep[c(-1:-4,-11)] ## Drop singletons and normals data <- DATA.ALL[,keep] pheno <- c(1:6,-1:-6) #### b) Filter genes: (1) mean > 4, and (2) unannotated means <- apply(data,1,mean) ## Density plot suggests using means = 4 as cutoff keep <- means >= 4 & !is.na(gene) data2 <- data[keep,] #### c) Paired t-test / limma Subj <- factor(abs(pheno)) Lobe <- factor(sign(pheno)) design <- model.matrix(~Subj+Lobe) fit <- lmFit(data2,design) fit <- eBayes(fit) tab <- topTable(fit, coef="Lobe1",adjust="BH", number = nrow(data2)) tab <- data.frame(tab, Gene = unlist(mget(tab[,1],hgu133plus2SYMBOL)), Desc = unlist(mget(tab[,1],hgu133plus2GENENAME))) write.table(tab,"Results/UvL.limma.genelist.xls",sep="\t",quote=F,row.names=F) pdf("Results/UvL.HistLimma.pdf",paper="USr",width=0,height=0) par(cex.lab=2.5,cex.axis=2,mar=c(10,10,10,10)) hist(tab$P.Value,breaks = seq(0,1,0.05),ylab="",xlab="Nominal p-value", col=8,main="",bty="o") dev.off() #### d) Unsupervised clustering - Top 10% by C.of.V cov <- apply(data,1,sd)/apply(data,1,mean) keep <- cov > quantile(cov,probs=0.9) data2 <- data[keep,] get.roworder = FALSE if(get.roworder){ dc <- dist(data2,method="euclidean") hc <- hclust(dc,method="average") save(hc,file = "Data/UvL.hc.Rdata") } else attach("Data/UvL.hc.Rdata") dr <- dist(t(data2),method="euclidean") ## columns ordered by top 10% hr <- hclust(dr,method="average") pdf("Results/UvL.heatmap.0210.pdf",paper="USr",width=0,height=0) heatmap.2(as.matrix(t(apply(data2,1,scale))),Rowv=as.dendrogram(hc),Colv=as.dendrogram(hr), dendrogram=c("column"),scale="row",trace="none", key=TRUE,keysize=1.2,symkey=FALSE,density.info="none", labCol=colnames(data),labRow=" ",col=matlab.colors(100),margins = c(8,5), lmat=cbind( c(3, 1,1 ), c(2,2,4 ) ), lwid=c(2, 4), lhei = c(1,4,2)) dev.off() x <- prcomp(t(data)); dim(data) print((x$sdev[1:3])^2 / sum(x$sdev^2)) compute.ICC(abs(pheno),x$x[,1]) corr1 <- 0; corr2 <- 0; for(i in 1:6){ corr1 <- corr1 + cor(data2[,abs(pheno)==i])[1,2]/6 corr2 <- corr2 + sum(cor(data2[,pheno==i],data2[,abs(pheno)!=i & pheno < 0]))/60 corr2 <- corr2 + sum(cor(data2[,-pheno==i],data2[,abs(pheno)!=i & pheno > 0]))/60 } ################################## ## 2) Analysis of Biopsy versus Explant ## 9-2-09 ################################## #### a)Read data, create phenotype vector, get annotation keep <- sapply(c("140U","142U","144U","145U","149U","159U","146L","152U", "157U","158U","160U"),grep,colnames(DATA.ALL)) data <- DATA.ALL[,keep] pheno <- substr(colnames(data),1,1) #### b) Filter genes: (1) mean > 4, and (2) unannotated means <- apply(data,1,mean) keep <- means >= 4 & !is.na(gene) data2 <- data[keep,] #### c) 2-sample t-test / limma design <- cbind(B = 1, EvB = (pheno=="E")) fit <- lmFit(data2,design) fit <- eBayes(fit) tabL = topTable(fit, coef="EvB", number = nrow(data2),adjust="BH") tabL <- data.frame(tabL, Gene = unlist(mget(tabL[,1],hgu133plus2SYMBOL)), Desc = unlist(mget(tabL[,1],hgu133plus2GENENAME))) write.table(tabL,"Results/BvE.limma.genelist.xls",sep="\t",quote=F,row.names=F) pdf("Results/BvE.HistLimma.pdf",paper="USr",width=0,height=0) par(cex.lab=2.5,cex.axis=2,mar=c(10,10,10,10)) hist(tabL$P.Value,breaks = seq(0,1,0.05),ylab="",xlab="Nominal p-value", col=8,main="",bty="o") dev.off() #### d) Unsupervised clustering - Top 10% by C.of.V (all genes) cov <- apply(data,1,sd)/apply(data,1,mean) keep <- cov > quantile(cov,probs=0.9) data2 <- data[keep,] get.roworder <- FALSE if(get.roworder){ dc <- dist(data2,method="euclidean") hc <- hclust(dc,method="average") save(hc,file = "Data/BvE.hc.Rdata") } else attach("Data/BvE.hc.Rdata") dr <- dist(t(data2),method="euclidean") ## DRawn from 10% only hr <- hclust(dr,method="average") pdf("Results/BvE.heatmap.0210.pdf",paper="USr",width=0,height=0) heatmap.2(as.matrix(t(apply(data2,1,scale))),Rowv=as.dendrogram(hc),Colv=as.dendrogram(hr), dendrogram=c("column"),scale="row",trace="none", key=TRUE,keysize=1.2,symkey=FALSE,density.info="none", labCol=colnames(data),labRow=" ",col=matlab.colors(100),margins = c(8,5), lmat=cbind( c(3, 1,1 ), c(2,2,4 ) ), lwid=c(2, 4), lhei = c(1,4,2)) dev.off() if(!get.roworder) detach("file:Data/BvE.hc.Rdata") ################################## ## 3) Analysis of Control versus UIP ## 9-5-09 ################################## #### a)Read data, create phenotype vector keep <- unlist(sapply(c("Norm","140U","142U","144U","145U","149U","159U"),grep,colnames(DATA.ALL))) data <- DATA.ALL[,keep] pheno <- substr(colnames(data),1,1) #### d) Unsupervised clustering - Top 10% by C.of.V (all genes) cov <- apply(data,1,sd)/apply(data,1,mean) keep <- cov > quantile(cov,probs=0.9) data2 <- data[keep,] get.roworder <- FALSE if(get.roworder){ dc <- dist(data2,method="euclidean") hc <- hclust(dc,method="average") save(hc,file = "Data/CvU.hc.Rdata") } else attach("Data/CvU.hc.Rdata") dr <- dist(t(data2),method="euclidean") ## Drawn from 10% only hr <- hclust(dr,method="average") pdf("Results/CvU.heatmap.0210.pdf",paper="USr",width=0,height=0) heatmap.2(as.matrix(t(apply(data2,1,scale))),Rowv=as.dendrogram(hc),Colv=as.dendrogram(hr), dendrogram=c("column"),scale="row",trace="none", key=TRUE,keysize=1.2,symkey=FALSE,density.info="none", labCol=colnames(data),labRow=" ",col=matlab.colors(100), margins = c(8,5), lmat=cbind( c(3, 1,1 ), c(2,2,4 ) ), lwid=c(2, 4), lhei = c(1,4,2)) dev.off() x <- prcomp(t(data)); dim(data) print((x$sdev[1:3])^2 / sum(x$sdev^2)) #### b) Filter genes: (1) mean > 4, and (2) unannotated means <- apply(data,1,mean) keep <- means >= 4 & !is.na(gene) data2 <- data[keep,] #### c) Student t-test funct <- local.t.Student(data2,pheno) df <- ncol(data2)-2 ## Degrees of freedom equals Num.samples - 2 t.vec <- funct(data2) p.vec <- 2*(pt(abs(t.vec),df=df,lower.tail=F)) adj.p <- error.FDR.BH(t(p.vec)) ## FDR adjusted p values (Benjamini Hochberg) tab <- data.frame(Probeset = rownames(data2), T.stat = t.vec, P.val = p.vec, Adj.p = adj.p, Gene = gene[keep], Desc = desc[keep]) tab <- tab[order(tab$P.val),] write.table(tab,"Results/CvU.studentT.genelist.xls",sep="\t",quote=F,row.names=F) #### e) Iterate Models with BinReg if(TRUE){ N.vec <- seq(20,250,1) Miss.f <- Sum.f <- rep(0,length(N.vec)) Miss.c <- Sum.c <- rep(0,length(N.vec)) for(i in 1:length(N.vec)){ N <- N.vec[i] fit <- binreg.WTB(data2,pheno=="B",N,2,type=2, n.burn=500,n.iter=1000,do.cv=T,print.it=F) temp <- abs(fit$fitted[,2]-fit$fitted[,3]) Sum.f[i] <- sum(temp); Miss.f[i] <- mean(temp>0.5) temp <- abs(fit$crossval[,2]-fit$crossval[,3]) Sum.c[i] <- sum(temp); Miss.c[i] <- mean(temp>0.5) print(i) } write.table(cbind(N.vec,Sum.f,Miss.f,Sum.c,Miss.c), "Data/BRP.CvU.txt",sep="\t") } ################################## ## 4) Validation attempt ## 10-23-09 ################################## file = "Data/GSE10667_series_matrix.txt" dataV <- read.table(file,sep="\t",comment.char="",skip=65, header=T,row.names=1) dataV <- log2(dataV) dataV <- quant.norm(dataV) #dim(dataV);sum(is.na(dataV));range(dataV) phenoV <- read.table(file,sep="\t",comment.char="",skip=37,nrows=1, header=T,row.names=1,as.is=T) phenoV <- substr(as.character(phenoV),1,3) get.anno <- FALSE if(get.anno){ file <- "Data/GPL4133-26578-WTB.txt" annoV <- read.delim(file,sep="\t",comment.char="",header=T) probes <- rownames(data) tabALL <- data.frame(Probeset = probes, Gene = unlist(mget(probes,hgu133plus2SYMBOL)), Desc = unlist(mget(probes,hgu133plus2GENENAME))) tabALL <- tabALL[!is.na(tabALL$Gene),] probes <- as.character(tabALL[,1]) tabALL <- data.frame(tabALL, EntrexID = unlist(mget(probes,hgu133plus2ENTREZID)), EnsembleIDs = sapply(mget(probes,hgu133plus2ENSEMBL),paste,collapse=";"), UnigeneIDs = sapply(mget(probes,hgu133plus2UNIGENE),paste,collapse=";"), UniprotIDs = sapply(mget(probes,hgu133plus2UNIPROT),paste,collapse=";"), RefSeqIDs = sapply(mget(probes,hgu133plus2REFSEQ),paste,collapse=";"), ACCNUM = unlist(mget(probes,hgu133plus2ACCNUM))) tabALL <- data.frame(tabALL, Mapped.by.Hs = NA, Mapped.by.NM = NA) for(i in 1:nrow(tabALL)){ IDs1 <- unlist(strsplit(as.character(tabALL$UnigeneIDs[[i]]),";")) maps <- NULL for(id in IDs1) maps <- c(maps,annoV$ID[which(annoV$UNIGENE_ID==id)]) tabALL$Mapped.by.Hs[i] <- paste(maps,collapse=";") IDs2 <- unlist(strsplit(as.character(tabALL$RefSeqIDs[[i]]),";")) maps <- NULL for(id in IDs2) maps <- c(maps,annoV$ID[which(annoV$REFSEQ==id)]) tabALL$Mapped.by.NM[i] <- paste(maps,collapse=";") print(c(i,IDs1,IDs2)) } save(tabALL,file="Data/Map.AffyAgilent.data") } else attach("Data/Map.AffyAgilent.data") probes153 <- tab$Probeset[1:153] tab153 <- tabALL[match(probes153,tabALL$Probeset),] write.table(cbind(tab[1:153,],tab153[,-1:-3]),"Data/Map.CvU.n153.Agilent.xls", sep="\t",quote=F,row.names=F) data2 <- cbind(data[match(tab153$Probeset,rownames(data)),], matrix(0,153,ncol(dataV))) colnames(data2) <- c(colnames(data),colnames(dataV)) keep <- rep(0,nrow(data2)) for(i in 1:nrow(data2)){ rows <- unique(unlist(strsplit(as.character(tab153[i,10:11]),";"))) if(length(rows)){ keep[i] <- T temp <- dataV[rownames(dataV) %in% rows,,drop=F] top <- order(-apply(temp,1,median)) data2[i,-1:-ncol(data)] <- temp[top[1],] } } data2 <- data2[keep==1,] data2[,-1:-ncol(data)] <- hard.norm(data2[,-1:-ncol(data)], apply(data2[,1:ncol(data)],1,mean), apply(data2[,1:ncol(data)],1,sd)) fit2 <- binreg.WTB(data2,c(pheno=="B",rep(2,ncol(dataV))),151,2,type=2, n.burn=1000,n.iter=5000,do.cv=F,alpha=0.1) prob2E <- fit2$predict wilcox.test(prob2E$PROB[phenoV=="Lun"],prob2E$PROB[phenoV!="Lun"]) pred <- prediction(prob2E$PROB,phenoV=="Lun") cut <- pred@cutoffs[[1]][order(pred@fn[[1]] / pred@n.pos[[1]] + pred@fp[[1]]/pred@n.neg[[1]])[1]] agree <- (prob2E$PROB >= cut) == (phenoV=="Lun") table((prob2E$PROB >= cut),(phenoV=="Lun")) sum(table((prob2E$PROB >= cut),(phenoV=="Lun")) * diag(2)) / length(phenoV) pdf("Results/Validate.CvU.pdf",paper="USr",height=0,width=0) par(pty="s",cex.lab=1.5,cex.axis=1.5) plot(performance(pred,"tpr","fpr"),colorize=F,lwd=2,main="Normal versus Biopsy,Type 2") performance(pred,'auc')@y.values lines(0:1,0:1,lty=2) par(pty="m",cex.lab=1.5,cex.axis=1.5) plot(1:length(phenoV),prob2E$PROB,col=c("blue","red")[1+(phenoV=="Lun")],pch=c(7,15)[1+agree],ylim=0:1, main="Normal versus Biopsy",ylab="Predicted Prob",xlab="Sample Number",cex=2) for(i in 1:length(phenoV)) lines(rep(i,2),c(prob2E$CI.UP[i],prob2E$CI.LOW[i]), col=c("blue","red")[1+(phenoV=="Lun")][i]) lines(c(0,100),rep(cut,2),lty=2,lwd=2) dev.off() ################################ ## 5) Make cluster of all Duke samples ## 12-03-09 ################################ data <- DATA.ALL cov <- apply(data,1,sd)/apply(data,1,mean) keep <- cov > quantile(cov,probs=0.9) data2 <- data[keep,] get.roworder = FALSE if(get.roworder){ dc <- dist(data2,method="euclidean") hc <- hclust(dc,method="average") save(hc,file = "Data/ALL.hc.Rdata") } else attach("Data/ALL.hc.Rdata") dr <- dist(t(data2),method="euclidean") ## Drawn from Heatmap only hr <- hclust(dr,method="average") pdf("Results/ALL.heatmap.0210.pdf",paper="USr",width=0,height=0) heatmap.2(as.matrix(t(apply(data2,1,scale))),Rowv=as.dendrogram(hc),Colv=as.dendrogram(hr), dendrogram=c("column"),scale="row",trace="none", key=TRUE,keysize=1.2,symkey=FALSE,density.info="none", labCol=names,labRow=" ",col=matlab.colors(100),margins=c(8,5), lmat=cbind( c(3, 1,1 ), c(2,2,4 ) ), lwid=c(2, 4), lhei = c(1,4,2)) dev.off() x <- prcomp(t(data)); dim(data) print((x$sdev[1:3])^2 / sum(x$sdev^2)) ################################## ## 6) Analysis of Normal versus Late ## 10-12-10 ################################## keep <- unlist(sapply(c("Norm","146L", "152U","157U" ,"158U" ,"160U"),grep,colnames(DATA.ALL))) data <- DATA.ALL[,keep] cov <- apply(data,1,sd)/apply(data,1,mean) keep <- cov > quantile(cov,probs=0.9) data2 <- data[keep,] get.roworder = FALSE if(get.roworder){ dc <- dist(data2,method="euclidean") hc <- hclust(dc,method="average") save(hc,file = "Data/NvL.hc.Rdata") } else attach("Data/NvL.hc.Rdata") dr <- dist(t(data2),method="euclidean") ## Drawn from Heatmap only hr <- hclust(dr,method="average") pdf("Results/NvL.heatmap.0210.pdf",paper="USr",width=0,height=0) heatmap.2(as.matrix(t(apply(data2,1,scale))),Rowv=as.dendrogram(hc),Colv=as.dendrogram(hr), dendrogram=c("column"),scale="row",trace="none", key=TRUE,keysize=1.2,symkey=FALSE,density.info="none", labCol=colnames(data),labRow=" ",col=matlab.colors(100),margins=c(8,5), lmat=cbind( c(3, 1,1 ), c(2,2,4 ) ), lwid=c(2, 4), lhei = c(1,4,2)) dev.off() ## 1) Get top differentially expressed probesets means <- apply(data,1,mean) keep <- means >= 4 & !is.na(gene) data2 <- data[keep,] dim(data2) pheno <- substr(colnames(data),1,1) == "E" funct <- local.t.Student(data2,pheno) df <- ncol(data2)-2 ## Degrees of freedom equals Num.samples - 2 t.vec <- funct(data2) p.vec <- 2*(pt(abs(t.vec),df=df,lower.tail=F)) adj.p <- error.FDR.BH(t(p.vec)) ## FDR adjusted p values (Benjamini Hochberg) tab <- data.frame(Probeset = rownames(data2), T.stat = t.vec, P.val = p.vec, Adj.p = adj.p, Gene = gene[keep], Desc = desc[keep]) tab <- tab[order(tab$P.val),] write.table(tab,"Results/NvL.studentT.genelist.xls",sep="\t",quote=F,row.names=F) ## 1a) Iterate Binreg models if(TRUE){ N.vec <- seq(20,250,1) Miss.f <- Sum.f <- rep(0,length(N.vec)) Miss.c <- Sum.c <- rep(0,length(N.vec)) for(i in 1:length(N.vec)){ N <- N.vec[i] fit <- binreg.WTB(data2,pheno,N,2,type=2, n.burn=500,n.iter=1000,do.cv=T,print.it=F) temp <- abs(fit$fitted[,2]-fit$fitted[,3]) Sum.f[i] <- sum(temp); Miss.f[i] <- mean(temp>0.5) temp <- abs(fit$crossval[,2]-fit$crossval[,3]) Sum.c[i] <- sum(temp); Miss.c[i] <- mean(temp>0.5) print(i) } write.table(cbind(N.vec,Sum.f,Miss.f,Sum.c,Miss.c), "Data/BRP.NvL.txt",sep="\t") } probes70 <- tab$Probeset[1:70] ## 2) Merge datasets tab70 <- tabALL[match(probes70,tabALL$Probeset),] write.table(cbind(tab[1:70,],tab70[,-1:-3]),"Data/Map.NvLate.n70.Agilent.xls", sep="\t",quote=F,row.names=F) data2 <- cbind(data[match(probes70,rownames(data)),], matrix(0,70,ncol(dataV))) colnames(data2) <- c(colnames(data),colnames(dataV)) keep <- rep(0,nrow(data2)) for(i in 1:nrow(data2)){ rows <- unique(unlist(strsplit(as.character(tab70[i,10:11]),";"))) if(length(rows)){ keep[i] <- T temp <- dataV[rownames(dataV) %in% rows,,drop=F] top <- order(-apply(temp,1,median)) data2[i,-1:-ncol(data)] <- temp[top[1],] } } data2 <- data2[keep==1,] num.gene <- nrow(data2) data2[,-1:-ncol(data)] <- hard.norm(data2[,-1:-ncol(data)], apply(data2[,1:ncol(data)],1,mean), apply(data2[,1:ncol(data)],1,sd)) dim(data2);sum(is.na(data2));range(data2) fit2 <- binreg.WTB(data2,c(pheno,rep(2,ncol(dataV))),num.gene,2,type=2, n.burn=1000,n.iter=5000,do.cv=F,alpha=0.1) prob2L <- fit2$predict wilcox.test(prob2L$PROB[phenoV=="Lun"],prob2L$PROB[phenoV!="Lun"]) pred <- prediction(prob2L$PROB,phenoV=="Lun") cut <- pred@cutoffs[[1]][order(pred@fn[[1]] / pred@n.pos[[1]] + pred@fp[[1]]/pred@n.neg[[1]])[1]] agree <- (prob2L$PROB >= cut) == (phenoV=="Lun") table((prob2L$PROB >= cut),(phenoV=="Lun")) sum(table((prob2L$PROB >= cut),(phenoV=="Lun")) * diag(2)) / length(phenoV) pdf("Results/Validate.NvL.n70.pdf",paper="USr",height=0,width=0) par(pty="s",cex.lab=1.5,cex.axis=1.5) plot(performance(pred,"tpr","fpr"),colorize=F,lwd=2,main="Normal versus Biopsy,Type 2") performance(pred,'auc')@y.values lines(0:1,0:1,lty=2) par(pty="m",cex.lab=1.5,cex.axis=1.5) plot(1:length(phenoV),prob2L$PROB,col=c("blue","red")[1+(phenoV=="Lun")],pch=c(7,15)[1+agree],ylim=0:1, main="Normal versus Biopsy",ylab="Predicted Prob",xlab="Sample Number",cex=2) for(i in 1:length(phenoV)) lines(rep(i,2),c(prob2L$CI.UP[i],prob2L$CI.LOW[i]), col=c("blue","red")[1+(phenoV=="Lun")][i]) lines(c(0,100),rep(cut,2),lty=2,lwd=2) dev.off() ######################### ## 9) Validate Normal Versus All signature ## 1-12-11 ################################ drop <- unlist(sapply(c("149L","152L","157L","158L","159L","160L"),grep,colnames(DATA.ALL))) data <- DATA.ALL[,-drop] cov <- apply(data,1,sd)/apply(data,1,mean) keep <- cov > quantile(cov,probs=0.9) data2 <- data[keep,] get.roworder <- TRUE if(get.roworder){ dc <- dist(data2,method="euclidean") hc <- hclust(dc,method="average") save(hc,file = "Data/NvA.hc.Rdata") } else attach("Data/NvA.hc.Rdata") dr <- dist(t(data2),method="euclidean") ## columns ordered by top 10% hr <- hclust(dr,method="average") pdf("Results/ALL.heatmap.0411.pdf",paper="USr",width=0,height=0) heatmap.2(as.matrix(t(apply(data2,1,scale))),Rowv=as.dendrogram(hc),Colv=as.dendrogram(hr), dendrogram=c("column"),scale="row",trace="none", key=TRUE,keysize=1.2,symkey=FALSE,density.info="none", labCol=colnames(data),labRow=" ",col=matlab.colors(100),margins = c(8,5), lmat=cbind( c(3, 1,1 ), c(2,2,4 ) ), lwid=c(2, 4), lhei = c(1,4,2)) dev.off() means <- apply(data,1,mean) keep <- means >= 4 & !is.na(gene) data2 <- data[keep,] ## 1) Get top differentially expressed probesets pheno <- substr(colnames(data),1,1) != "N" funct <- local.t.Student(data2,pheno) df <- ncol(data2)-2 ## Degrees of freedom equals Num.samples - 2 t.vec <- funct(data2) p.vec <- 2*(pt(abs(t.vec),df=df,lower.tail=F)) adj.p <- error.FDR.BH(t(p.vec)) ## FDR adjusted p values (Benjamini Hochberg) tab <- data.frame(Probeset = rownames(data2), T.stat = t.vec, P.val = p.vec, Adj.p = adj.p, Gene = gene[keep], Desc = desc[keep]) tab <- tab[order(tab$P.val),] write.table(tab,"Results/NvA.StudentT.genelist.xls",sep="\t",quote=F,row.names=F) ## Iterate through models if(TRUE){ N.vec <- seq(20,250,1) Miss.f <- Sum.f <- rep(0,length(N.vec)) Miss.c <- Sum.c <- rep(0,length(N.vec)) for(i in 1:length(N.vec)){ N <- N.vec[i] fit <- binreg.WTB(data2,pheno,N,2,type=2, n.burn=500,n.iter=1000,do.cv=T,print.it=F) temp <- abs(fit$fitted[,2]-fit$fitted[,3]) Sum.f[i] <- sum(temp); Miss.f[i] <- mean(temp>0.5) temp <- abs(fit$crossval[,2]-fit$crossval[,3]) Sum.c[i] <- sum(temp); Miss.c[i] <- mean(temp>0.5) print(i) } write.table(cbind(N.vec,Sum.f,Miss.f,Sum.c,Miss.c), "Data/BRP.NvA.txt",sep="\t") } probes151 <- tab$Probeset[1:151] ## 2) Merge datasets tab151 <- tabALL[match(probes151,tabALL$Probeset),] write.table(cbind(tab[1:151,],tab151[,-1:-3]),"Data/Map.NvA.n151.Agilent.xls",sep="\t",quote=F,row.names=F) data2 <- cbind(data[match(probes151,rownames(data)),], matrix(0,151,ncol(dataV))) colnames(data2) <- c(colnames(data),colnames(dataV)) keep <- rep(0,nrow(data2)) for(i in 1:nrow(data2)){ rows <- unique(unlist(strsplit(as.character(tab151[i,10:11]),";"))) if(length(rows)){ keep[i] <- T temp <- dataV[rownames(dataV) %in% rows,,drop=F] top <- order(-apply(temp,1,median)) data2[i,-1:-ncol(data)] <- temp[top[1],] } } data2 <- data2[keep==1,] num.gene <- nrow(data2) data2[,-1:-ncol(data)] <- hard.norm(data2[,-1:-ncol(data)], apply(data2[,1:ncol(data)],1,mean), apply(data2[,1:ncol(data)],1,sd)) dim(data2);sum(is.na(data2));range(data2) fit2 <- binreg.WTB(data2,c(pheno,rep(2,ncol(dataV))),num.gene,2,type=2, n.burn=1000,n.iter=5000,do.cv=F,alpha=0.1) prob2A <- fit2$predict wilcox.test(prob2A$PROB[phenoV=="Lun"],prob2A$PROB[phenoV!="Lun"]) pred <- prediction(prob2A$PROB,phenoV=="Lun") cut <- pred@cutoffs[[1]][order(pred@fn[[1]] / pred@n.pos[[1]] + pred@fp[[1]]/pred@n.neg[[1]])[1]] agree <- (prob2A$PROB >= cut) == (phenoV=="Lun") table((prob2A$PROB >= cut),(phenoV=="Lun")) sum(table((prob2A$PROB >= cut),(phenoV=="Lun")) * diag(2)) / length(phenoV) pdf("Results/Validate.NvA.n151.pdf",paper="USr",height=0,width=0) par(pty="s",cex.lab=1.5,cex.axis=1.5) plot(performance(pred,"tpr","fpr"),colorize=F,lwd=2,main="Normal versus Biopsy,Type 2") performance(pred,'auc')@y.values lines(0:1,0:1,lty=2) par(pty="m",cex.lab=1.5,cex.axis=1.5) plot(1:length(phenoV),prob2A$PROB,col=c("blue","red")[1+(phenoV=="Lun")],pch=c(7,15)[1+agree],ylim=0:1, main="Normal versus Biopsy",ylab="Predicted Prob",xlab="Sample Number",cex=2) for(i in 1:length(phenoV)) lines(rep(i,2),c(prob2A$CI.UP[i],prob2A$CI.LOW[i]), col=c("blue","red")[1+(phenoV=="Lun")][i]) lines(c(0,100),rep(cut,2),lty=2,lwd=2) dev.off() pdf("Results/ROC.pdf",paper="USr",height=0,width=0) pred <- prediction(prob2A$PROB,phenoV=="Lun") spot <- order(pred@fn[[1]] / pred@n.pos[[1]] + pred@fp[[1]]/pred@n.neg[[1]])[1] x <- pred@fp[[1]][spot] / pred@n.neg[[1]] y <- 1 - pred@fn[[1]][spot] / pred@n.pos[[1]] par(pty="s",cex.lab=1.5,cex.axis=1.5) plot(performance(pred,"tpr","fpr"),colorize=F,lwd=4,col="brown",main="") points(x,y,cex=3,col="brown") lines(0:1,0:1,lty=2,lwd=2) pred <- prediction(prob2E$PROB,phenoV=="Lun") spot <- order(pred@fn[[1]] / pred@n.pos[[1]] + pred@fp[[1]]/pred@n.neg[[1]])[1] x <- pred@fp[[1]][spot] / pred@n.neg[[1]] y <- 1 - pred@fn[[1]][spot] / pred@n.pos[[1]] plot(performance(pred,"tpr","fpr"),add=T,lwd=4,col="red") points(x,y,cex=3,col="red") pred <- prediction(prob2L$PROB,phenoV=="Lun") spot <- order(pred@fn[[1]] / pred@n.pos[[1]] + pred@fp[[1]]/pred@n.neg[[1]])[1] x <- pred@fp[[1]][spot] / pred@n.neg[[1]] y <- 1 - pred@fn[[1]][spot] / pred@n.pos[[1]] plot(performance(pred,"tpr","fpr"),add=T,lwd=4,col="green") points(x,y,cex=3,col="green") dev.off() ######################### ## 9) Venn Diagram of signatures ## 1-12-11 ################################ tab.CvU <- read.table("Results/CvU.studentT.genelist.xls",sep="\t",header=T) tab.NvA <- read.table("Results/NvA.StudentT.genelist.xls",sep="\t",header=T) tab.NvL <- read.table("Results/NvL.studentT.genelist.xls",sep="\t",header=T) sum(tab.CvU$Probeset[1:153] %in% tab.NvA$Probeset[1:151]) sum(tab.NvL$Probeset[1:70] %in% tab.NvA$Probeset[1:151]) sum(tab.CvU$Probeset[1:153] %in% tab.NvL$Probeset[1:70]) sum(tab.CvU$Probeset[1:153] %in% tab.NvA$Probeset[1:151] & tab.CvU$Probeset[1:153] %in% tab.NvL$Probeset[1:70]) ##http://chart.apis.google.com/chart?cht=v&chd=t:151,153,70,40,27,21,20&chs=500x500&chdl=Norm v All|Norm v Biopsy|Norm v Explant ######################### ## 9) BRP Model fitting ## 4-19-11 ################################ temp.CvU <- read.table("Data/BRP.CvU.txt",sep="\t",header=T,as.is=T) temp.NvL <- read.table("Data/BRP.NvL.txt",sep="\t",header=T,as.is=T) keep <- temp.CvU$N.vec >= 50 keep1 <- temp.CvU$N.vec == 153 keep2 <- temp.CvU$N.vec == 70 keep3 <- temp.CvU$N.vec == 100 keep4 <- temp.CvU$N.vec == 230 pdf("Results/GeneSize.BRP.pdf",paper="USr",height=0,width=0) par(mfrow=c(2,2),cex.lab=1.25,cex.main=1.5) plot(temp.CvU$N.vec[keep],temp.CvU$Sum.f[keep],type="l",col=1,lwd=2, xlab="Number of Genes",ylab="Sum of Deviances",main = "IPF Biopsy - fitted") points(temp.CvU$N.vec[keep1],temp.CvU$Sum.f[keep1],col="blue",pch=1,cex=3) plot(temp.CvU$N.vec[keep],temp.CvU$Miss.c[keep],ylim=c(0,0.25),col=1,lwd=2, type="l",xlab="Number of Genes",ylab="Missclassification",main="IPF Biopsy - Leave-one-out") plot(temp.NvL$N.vec[keep],temp.NvL$Sum.f[keep],type="l",col=1,lwd=2, xlab="Number of Genes",ylab="Sum of Deviances",main = "IPF Explant - fitted") points(temp.NvL$N.vec[keep2],temp.NvL$Sum.f[keep2],col="blue",pch=1,cex=3) points(temp.NvL$N.vec[keep3],temp.NvL$Sum.f[keep3],col="red",pch=1,cex=3) points(temp.NvL$N.vec[keep4],temp.NvL$Sum.f[keep4],col="red",pch=1,cex=3) plot(temp.NvL$N.vec[keep],temp.NvL$Miss.c[keep],ylim=c(0,0.25),col=1,lwd=2, type="l",xlab="Number of Genes",ylab="Missclassification",main="IPF Explant - Leave-one-out") dev.off()