#R-script to perform the evaluation of the real data set as described in 
#"Assessment and optimisation of normalisation methods for dual-colour antibody microarrays"
#Author: Martin Sill
###############################################################################

library(Biobase)
library(limma)
library(statmod)
library(vegan)
# Session setup: fix the locale, start from an empty workspace, then load the
# data and the project-specific normalisation helpers.

# Switch every locale category to "C".
# NOTE(review): presumably so that the name-based ordering of features further
# down does not depend on the machine's locale -- confirm.
Sys.setlocale("LC_ALL","C")
# Remove all objects from the current workspace (attached packages stay loaded).
rm(list = ls())

#setwd("~/workspace") # path to realdata.RData
# load the data set
# The script uses `RG` and `ystart` below without creating them, so they are
# expected to come from this file (or from invariant.R -- TODO confirm).
load("realdata.RData")

# NOTE(review): presumably defines invselTseng(), InvMod(), RDWGL() and
# procrustesNormalization(), which are called below -- verify.
source("invariant.R")

# One common background correction for all normalisation methods:
# normexp with an offset of 50, fitted by maximum likelihood.
RG <- backgroundCorrect(RG, method = "normexp", offset = 50, normexp.method = "mle")

# NN: no within-array normalisation, only the between-array step
MA <- MA.RG(RG, bc.method = "none")
MAnn <- normalizeBetweenArrays(MA)

# VSN
MAvsn <- normalizeBetweenArrays(RG, method = "vsn")

# Row indices of the features under study ("protein" and "diffexpr")
pr <- which(MA$genes$Status %in% c("protein", "diffexpr"))

# Every Status value that gets a weight multiplier below
status.values <- c(
		"protein", "background", "actin", "polyclonal", "IgM", "albumin", "GAPDH",
		"sp_CHTB", "sp_GDA0", "sp_Hamster", "sp_Mouse", "sp_DNP1", "sp_DNP2",
		"sp_GFP", "sp_FITC", "control both", "control Cy3", "control Cy5", "diffexpr")

# Global loess: spike-ins, control and housekeeping features are multiplied
# by zero, "protein" and "diffexpr" features by one.
weights <- modifyWeights(
		RG$weights,
		RG$genes$Status,
		status.values,
		as.numeric(status.values %in% c("protein", "diffexpr")))
MAgl <- normalizeWithinArrays(RG, method = "loess", weights = weights, iterations = 10)
MAgl <- normalizeBetweenArrays(MAgl)

# Weights handed to the invariant-selection methods: multiplier zero for every
# listed Status value.
# (Alternative multipliers kept from the original script, currently unused:
#  c(1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1))
weights <- modifyWeights(
		RG$weights,
		RG$genes$Status,
		status.values,
		rep(0, length(status.values)))

# Tseng invariant selection algorithm
MAinvTseng <- invselTseng(RG, pr, weights = weights, plot.chip = NULL)
MAinvTseng <- normalizeBetweenArrays(MAinvTseng)

# InvMod
MAinvMod <- InvMod(RG, pr, weights = weights, plot.chip = NULL)
MAinvMod <- normalizeBetweenArrays(MAinvMod)

# RDWGL
MArdwgl <- RDWGL(RG, pr, weights = weights, plot.chip = NULL)
MArdwgl <- normalizeBetweenArrays(MArdwgl)

# GPA: Procrustes normalisation on the features of interest only
MAgpa <- MA.RG(RG, bc.method = "none")
MAgpa <- procrustesNormalization(MAgpa[pr, ])
MAgpa <- normalizeBetweenArrays(MAgpa)

#reorder features and average replicated spots

# Keep the rows of an MAList whose Status is one of `status`, then sort the
# remaining rows by feature name.
keepStatusSorted <- function(ma, status) {
	ma <- ma[ma$genes$Status %in% status, ]
	ma[order(ma$genes[, "Name"]), ]
}

analysed.status <- c("protein", "diffexpr")

# The original line subset the un-normalised `MA` here instead of `MAnn`,
# which silently threw away the between-array normalisation of the NN variant
# while every other method kept its own normalised values.
MAnn <- keepStatusSorted(MAnn, analysed.status)
MAvsn <- keepStatusSorted(MAvsn, analysed.status)
MAgl <- keepStatusSorted(MAgl, analysed.status)
MAinvTseng <- keepStatusSorted(MAinvTseng, analysed.status)
MAinvMod <- keepStatusSorted(MAinvMod, analysed.status)
MArdwgl <- keepStatusSorted(MArdwgl, analysed.status)
MAgpa <- keepStatusSorted(MAgpa, analysed.status)

# Round-trip the vsn result through RG.MA()/MA.RG().
# NOTE(review): presumably to put the vsn output on the same M/A
# representation as the other methods -- confirm.
MAvsn <- MA.RG(RG.MA(MAvsn), bc.method = "none")

# Order matters: later code addresses the methods by position in this list.
normM <- list(MAnn, MAvsn, MAgl, MAinvTseng, MAinvMod, MArdwgl, MAgpa)

# average replicate spots
# Only "protein" features are kept; duplicates are expected in adjacent rows
# (ndups = 2, spacing = 1) after sorting by name. lapply() is used so that the
# global `MA` is no longer overwritten by a loop variable of the same name.
normM <- lapply(normM, function(ma) {
	avedups(keepStatusSorted(ma, "protein"), ndups = 2, spacing = 1, weights = NULL)
})

#classification cancer vs. healthy via prediction analysis for microarrays 
#(nearest shrunken centroid classifier) 
library(pamr)
# Cross-tabulate true against predicted class labels.
#
# predicted: vector or factor of predicted class labels
# true:      vector or factor of true class labels, same length as `predicted`
# extra:     if TRUE (default) the table is printed together with a per-class
#            error rate column and the overall error rate (rounded to three
#            digits) is returned; if FALSE the plain contingency table is
#            returned and nothing is printed.
confusion.table <- function(predicted, true, extra = TRUE) {
	# Put both label vectors on one common set of levels so that the table is
	# square and its diagonal really holds the correct predictions, even when a
	# class is missing from one of the two vectors.
	lev <- union(levels(as.factor(true)), levels(as.factor(predicted)))
	tt <- table(true = factor(true, levels = lev),
			predicted = factor(predicted, levels = lev))
	if (!extra) {
		return(tt)
	}
	# Off-diagonal counts are the misclassified samples.
	errors <- tt
	diag(errors) <- 0
	# Take the sample count from the pure count table. The original divided by
	# the sum of the table AFTER the class-error column had been appended,
	# which inflated the denominator and underestimated the error rate.
	n.total <- sum(tt)
	out <- cbind(tt, rowSums(errors) / rowSums(tt))
	colnames(out)[ncol(out)] <- "Class Error rate"
	print(out)
	orate <- round(sum(errors) / n.total, 3)
	cat(c("Overall error rate=", orate), fill = TRUE)
	return(orate)
}

# Misclassification error of the nearest shrunken centroid classifier for
# every normalisation method in normM, estimated with the .632 bootstrap.
# pamRES[[k]] holds one row per bootstrap replicate with the columns
# boot.err (out-of-bag error) and boot632.err (.632 estimate).
pamRES <- list()

set.seed(09092010)

# Pick the shrinkage threshold from a pamr.cv() result: among the models with
# size > 0 find those with minimal CV error, take the smallest size among
# them and return the median threshold of all models of that size.
#
# The original code tried to drop the size-0 models with
# `app.tune$err <- app.tune$err[-which(app.tune$size==0)]`. Reading `$err`
# partially matches `error`, but the assignment creates a NEW element `err`,
# so `app.tune$error` was never filtered. Had it worked, the shortened error
# vector would also have been misaligned with `threshold` and `size`, and
# `-which()` would have emptied it whenever no size-0 model existed.
pamCvThreshold <- function(cv) {
	nonempty <- cv$size > 0
	stopifnot(any(nonempty))
	best <- which(nonempty & cv$error == min(cv$error[nonempty]))
	median(cv$threshold[cv$size == min(cv$size[best])])
}

B <- 100 # bootstrap replicates per normalisation method

for (k in seq_along(normM)) {
	datstart <- normM[[k]]
	rownames(datstart$M) <- datstart$genes$Name
	colnames(datstart$M) <- ystart
	datstart <- datstart$M # from here on: plain matrix of M-values

	#apparent error: train, tune and predict on the complete data
	fulldata <- list(x = datstart, y = ystart, geneids = rownames(datstart))
	app.pam <- pamr.train(fulldata)
	app.tune <- pamr.cv(app.pam, fulldata, nfold = 10) #tuning
	cvthres <- pamCvThreshold(app.tune)
	app.pred <- pamr.predict(app.pam, as.matrix(fulldata$x), threshold = cvthres)
	app.err <- confusion.table(app.pred, ystart)
	cat(app.err)

	n.samples <- ncol(datstart)
	n.classes <- length(unique(ystart))
	boot.err <- numeric(B)
	for (l in seq_len(B)) {
		# Draw a bootstrap sample of (a fractional) .632 * n columns; sample()
		# is left to coerce the size to an integer count. Redraw until every
		# class is present with at least 4 samples.
		repeat {
			bootcols <- sample(seq_len(n.samples), n.samples * .632, replace = TRUE)
			y <- ystart[bootcols]
			counts <- table(y)
			if (!any(counts < 4) && length(counts) >= n.classes) break
		}
		dat <- datstart[, bootcols]
		traindata <- list(x = dat, y = y, geneids = rownames(dat))
		mypam <- pamr.train(traindata)
		# The cross-validation is repeated for every replicate; the full-data
		# model fitted above is reused instead of being refitted each time.
		# NOTE(review): the threshold is tuned on the complete data, which
		# includes the out-of-bag samples used for testing below -- confirm
		# this is intended rather than tuning on `traindata`.
		app.tune <- pamr.cv(app.pam, fulldata, nfold = 10) #tuning
		cvthres <- pamCvThreshold(app.tune)
		outofbag.y <- ystart[-bootcols]
		testdata <- list(x = datstart[, -bootcols], y = outofbag.y)
		# as.matrix() restores a one-column matrix if only one sample is out of bag
		pred <- pamr.predict(mypam, as.matrix(testdata$x), threshold = cvthres)
		boot.err[l] <- confusion.table(pred, outofbag.y)
	}
	boot632.err <- .368*app.err + .632*boot.err #bootstrap .632 misclassification error (see Efron 1983)
	pamRES[[k]] <- cbind(boot.err, boot632.err)
}

 
# Collect the .632 bootstrap errors (second column of every pamRES element)
# into one matrix with a column per normalisation method.
matr <- matrix(nrow = nrow(pamRES[[1]]), ncol = length(pamRES))
for (i in seq_along(pamRES)) {
	matr[, i] <- pamRES[[i]][, 2]
}
# Labels in the order in which the methods were put into normM / pamRES
# (NN, VSN, GL, ...). The original labelled the columns NN, GL, VSN, which
# attributed the VSN results to GL and vice versa.
method.names <- c("NN", "VSN", "GL", "InvTseng", "InvMod", "RDWGL", "GPA")
stopifnot(ncol(matr) == length(method.names))
colnames(matr) <- method.names
# Keep the column order used in the saved file and the figure so far.
matr <- matr[, c("NN", "GL", "VSN", "InvTseng", "InvMod", "RDWGL", "GPA"), drop = FALSE]
save(matr, file = "pamRESHCboot632.RData")
# Reloading right after saving is redundant here; it lets the plotting part be
# re-run on its own from the stored results.
load("pamRESHCboot632.RData")

# Boxplot of the bootstrap error distribution per method
pdf("error.pdf")
par(mfrow = c(1, 1))
boxplot(as.data.frame(matr), main = "Misclassification error", ylab = "error")
dev.off()