# R-script to perform the evaluation of the real data set as described in
# "Assessment and optimisation of normalisation methods for dual-colour
# antibody microarrays"
# Author: Martin Sill
###############################################################################

library(Biobase)
library(limma)
library(statmod)
library(vegan)

Sys.setlocale("LC_ALL", "C")

# NOTE: the original script called rm(list = ls()) at this point. It was
# removed because it silently wipes the caller's workspace; run the script in
# a fresh R session if a clean environment is required.

# setwd("~/workspace") # path to realdata.RData

# load the data set
# `RG` is used below without being created in this script, so it has to be
# provided by realdata.RData (or by invariant.R).
load("realdata.RData")
# presumably defines invselTseng(), InvMod(), RDWGL() and
# procrustesNormalization(), which are called below -- TODO confirm
source("invariant.R")

# Feature classes that are kept for the downstream analysis.
analysis_status <- c("protein", "diffexpr")

# All feature classes that receive an explicit weight in modifyWeights().
status_values <- c(
  "protein", "background", "actin", "polyclonal", "IgM", "albumin", "GAPDH",
  "sp_CHTB", "sp_GDA0", "sp_Hamster", "sp_Mouse", "sp_DNP1", "sp_DNP2",
  "sp_GFP", "sp_FITC", "control both", "control Cy3", "control Cy5",
  "diffexpr"
)

# Keep only the features whose Status is in `status` and sort them by Name.
keep_and_sort <- function(ma, status) {
  ma <- ma[ma$genes$Status %in% status, ]
  ma[order(ma$genes[, "Name"]), ]
}

# same background correction
RG <- backgroundCorrect(
  RG,
  method = "normexp", offset = 50, normexp.method = "mle"
)

# no within normalization NN
MA <- MA.RG(RG, bc.method = "none")
MAnn <- normalizeBetweenArrays(MA)

# vsn normalization
MAvsn <- normalizeBetweenArrays(RG, method = "vsn")

# set weights for spike-ins, control and housekeeping features at zero
pr <- which(MA$genes$Status %in% analysis_status)
weights <- modifyWeights(
  RG$weights, RG$genes$Status, status_values,
  c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1)
)

# global loess
MAgl <- normalizeWithinArrays(
  RG,
  method = "loess", weights = weights, iterations = 10
)
MAgl <- normalizeBetweenArrays(MAgl)

# All-zero weights for the invariant-selection based methods below.
# Alternative weighting that was tried before:
# c(1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1)
weights <- modifyWeights(
  RG$weights, RG$genes$Status, status_values,
  c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
)

# Tseng invariant selection algorithm
MAinvTseng <- invselTseng(RG, pr, weights = weights, plot.chip = NULL)
MAinvTseng <- normalizeBetweenArrays(MAinvTseng)

# InvMod
MAinvMod <- InvMod(RG, pr, weights = weights, plot.chip = NULL)
MAinvMod <- normalizeBetweenArrays(MAinvMod)

# RDWGL
MArdwgl <- RDWGL(RG, pr, weights = weights, plot.chip = NULL)
MArdwgl <- normalizeBetweenArrays(MArdwgl)

# GPA
MAgpa <- procrustesNormalization(MA.RG(RG, bc.method = "none")[pr, ])
MAgpa <- normalizeBetweenArrays(MAgpa)

# reorder features and average replicated spots
# FIX: the original line read `MAnn <- MA[MAnn$genes$Status %in% ...]`, i.e. it
# subset the un-normalised `MA` and thereby discarded the between-array
# normalisation computed for the NN variant. Every other variant subsets
# itself, so MAnn now does the same.
MAnn <- keep_and_sort(MAnn, analysis_status)
MAvsn <- keep_and_sort(MAvsn, analysis_status)
MAgl <- keep_and_sort(MAgl, analysis_status)
MAinvTseng <- keep_and_sort(MAinvTseng, analysis_status)
MAinvMod <- keep_and_sort(MAinvMod, analysis_status)
MArdwgl <- keep_and_sort(MArdwgl, analysis_status)
MAgpa <- keep_and_sort(MAgpa, analysis_status)

# Round trip MA -> RG -> MA for the vsn result, as in the original script.
MAvsn <- MA.RG(RG.MA(MAvsn), bc.method = "none")

# Order matters: the classification code further down addresses the entries by
# position (NN, VSN, GL, InvTseng, InvMod, RDWGL, GPA).
normM <- list(MAnn, MAvsn, MAgl, MAinvTseng, MAinvMod, MArdwgl, MAgpa)

# average replicate spots
# Only "protein" features are kept here; adjacent pairs (ndups = 2,
# spacing = 1) of the Name-sorted features are averaged.
normM <- lapply(normM, function(ma) {
  ma <- keep_and_sort(ma, "protein")
  avedups(ma, ndups = 2, spacing = 1, weights = NULL)
})

# classification cancer vs.
# healthy via prediction analysis for microarrays
# (nearest shrunken centroid classifier)
library(pamr)

# Function to calculate a confusion table.
#
# predicted: predicted class labels
# true:      true class labels (same length as `predicted`)
# extra:     if TRUE, print the table together with per-class error rates and
#            the overall error rate, and return the overall error rate
#            (rounded to 3 digits); if FALSE, return the plain contingency
#            table without printing anything.
confusion.table <- function(predicted, true, extra = TRUE) {
  # Put rows and columns on one shared set of classes so that the diagonal
  # always holds the correctly classified counts, even when a class is missing
  # from `true` or from `predicted`.
  classes <- union(levels(as.factor(true)), levels(as.factor(predicted)))
  counts <- table(
    true = factor(true, levels = classes),
    predicted = factor(predicted, levels = classes)
  )
  if (!extra) {
    return(counts)
  }

  misclassified <- counts
  diag(misclassified) <- 0

  report <- cbind(counts, rowSums(misclassified) / rowSums(counts))
  colnames(report)[ncol(report)] <- "Class Error rate"
  print(report)

  # FIX: the original divided by the sum of the table *after* the class error
  # rate column had been appended, which inflated the denominator. The overall
  # error rate is misclassified / total number of observations.
  orate <- round(sum(misclassified) / sum(counts), 3)
  cat(c("Overall error rate=", orate), fill = TRUE)
  orate
}

# Choose the shrinkage threshold from a pamr.cv() result.
#
# Among the thresholds that keep at least one feature (size > 0), find those
# with minimal cross-validation error, take the smallest model size among
# them, and return the median of all thresholds that yield this model size.
#
# FIX: the original code did
#   app.tune$err <- app.tune$err[-which(app.tune$size == 0)]
# which creates a new, never-used list element `err` (reading `$err`
# partial-matches `error`, assigning does not), so thresholds with size 0 were
# never actually excluded. It would also have emptied the vector whenever no
# threshold had size 0.
select_cv_threshold <- function(cv_fit) {
  non_empty <- cv_fit$size > 0
  if (!any(non_empty)) {
    stop("no threshold keeps at least one feature", call. = FALSE)
  }
  best <- which(non_empty & cv_fit$error == min(cv_fit$error[non_empty]))
  smallest_size <- min(cv_fit$size[best])
  median(cv_fit$threshold[cv_fit$size == smallest_size])
}

# `normM` and `ystart` are not created in this part of the script; they must
# already exist in the workspace.
B <- 100 # number of bootstrap replicates per normalisation method
n_classes <- length(unique(ystart))

pamRES <- vector("list", length(normM))
set.seed(9092010) # same seed as the original literal 09092010

for (k in seq_along(normM)) {
  datstart <- normM[[k]]
  rownames(datstart$M) <- datstart$genes$Name
  colnames(datstart$M) <- ystart
  datstart <- datstart$M

  # apparent error: train, tune and predict on the full data set
  fulldata <- list(x = datstart, y = ystart, geneids = rownames(datstart))
  app.pam <- pamr.train(fulldata)
  app.tune <- pamr.cv(app.pam, fulldata, nfold = 10) # tuning
  cvthres <- select_cv_threshold(app.tune)
  app.pred <- pamr.predict(app.pam, as.matrix(fulldata$x), threshold = cvthres)
  app.err <- confusion.table(app.pred, ystart)
  cat(app.err)

  n_samples <- ncol(datstart)
  # sample() truncates a fractional size; floor() makes that explicit.
  boot_size <- floor(n_samples * 0.632)

  boot.err <- numeric(B)
  for (l in seq_len(B)) {
    # Redraw until every class is present with at least 4 samples.
    repeat {
      bootcols <- sample(seq_len(n_samples), boot_size, replace = TRUE)
      y <- ystart[bootcols]
      class_counts <- table(y)
      if (all(class_counts >= 4) && length(class_counts) >= n_classes) {
        break
      }
    }

    dat <- datstart[, bootcols]
    traindata <- list(x = dat, y = y, geneids = rownames(dat))
    mypam <- pamr.train(traindata)

    # The original re-ran pamr.train(fulldata) here on identical input; the
    # fit from above (`app.pam`) is reused instead. This assumes pamr.train()
    # draws no random numbers, so the random-number stream is unchanged.
    # NOTE(review): the threshold is tuned by cross-validation on the FULL
    # data, which includes the out-of-bag samples, and is then applied to the
    # bootstrap-trained classifier. Tuning on `traindata` would avoid this
    # leakage; left as is because it changes the reported error rates --
    # confirm the intent.
    app.tune <- pamr.cv(app.pam, fulldata, nfold = 10) # tuning
    cvthres <- select_cv_threshold(app.tune)

    outofbag.y <- ystart[-bootcols]
    testdata <- list(
      x = datstart[, -bootcols, drop = FALSE],
      y = outofbag.y
    )
    pred <- pamr.predict(mypam, as.matrix(testdata$x), threshold = cvthres)
    boot.err[l] <- confusion.table(pred, outofbag.y)
  }

  # bootstrap .632 misclassification error (see Efron 1983)
  boot632.err <- .368 * app.err + .632 * boot.err
  pamRES[[k]] <- cbind(boot.err, boot632.err)
}

# One column of .632 errors per normalisation method.
matr <- matrix(NA_real_, nrow = B, ncol = length(normM))
for (i in seq_along(pamRES)) {
  matr[, i] <- pamRES[[i]][, "boot632.err"]
}

# FIX: the labels must follow the order in which `normM` was assembled
# (MAnn, MAvsn, MAgl, MAinvTseng, MAinvMod, MArdwgl, MAgpa). The original
# labels had "GL" and "VSN" swapped, which mislabelled two boxes in the plot.
method_labels <- c("NN", "VSN", "GL", "InvTseng", "InvMod", "RDWGL", "GPA")
stopifnot(length(method_labels) == ncol(matr))
colnames(matr) <- method_labels

save(matr, file = "pamRESHCboot632.RData")
# Reloading right after saving is a no-op here; kept so the plotting part can
# be re-run on its own from the saved file.
load("pamRESHCboot632.RData")

pdf("error.pdf")
par(mfrow = c(1, 1))
boxplot(as.data.frame(matr), main = "Misclassification error", ylab = "error")
dev.off()