Additional File 1 The R codes for the longitudinal SAMGSR method L-SAMGS <- function(DATA, tp, cl, nbPermutations=1000, silent=F, c=0.05){ genes <- dimnames(DATA)[[1]] genes <- genes[!is.na(genes)] nb.Samples <- ncol(DATA)/tp nb.GeneSets <-tp # nb of gene sets samT.ok<-NULL s0.time<-NULL for (j in 1:tp){ # Estimate the constant s0 for SAM-like test for each time point DATA.temp<-DATA[, which(((1:ncol(DATA))%%(-tp)+tp)==j)] tmp<- sam.TlikeStat(DATA.temp,cl=cl) s0 <- tmp$s0 if(!silent) print("s0 estimation : done.") samT.ok <-c(samT.ok, as.data.frame(tmp$TlikeStat)) # SAM statistic for each gene s0.time<-c(s0.time, s0) } samT.ok.temp<-matrix(unlist(samT.ok), tp, nrow(DATA), byrow = TRUE) sam.sumsquareT.ok <- apply( samT.ok.temp,2, function(z) sum(z^2)) # SAMGS stat for each gene set #now, fix on the permutation test to get q-values... C1.size<-table(cl)[1] C2.size<-table(cl)[2] nb.Samples<-length(cl) # stats obtained on 'permuted' data permut.C1 <- matrix(NA,nbPermutations,C1.size) sam.sumsquareT.permut <- matrix(NA,nbPermutations,nrow(DATA)) diperm<-matrix(NA,(dim(DATA)[1]*tp), (nbPermutations+1)) diperm[,1]<-unlist(samT.ok) for(i in 1:nbPermutations) { C1.permut <- permut.C1[i,] <- sample(nb.Samples,C1.size) C2.permut <- (1:nb.Samples)[-C1.permut] samT.permut<-NULL for (j in 1:tp){ DATA.temp<-DATA[, which(((1:ncol(DATA))%%(-tp)+tp)==j)] samT.permut <- c(samT.permut, data.frame(sam.TlikeStat(DATA.temp,C1=C1.permut,C2=C2.permut,s0=s0.time[j])$TlikeStat)) } samT.permut.temp<-matrix(unlist(samT.permut), tp, nrow(DATA), byrow = TRUE) sam.sumsquareT.permut[i,] <- apply( samT.permut.temp,2, function(z) sum(z^2)) diperm[,(i+1)] <-unlist(samT.permut) # SAMGS statistics for each gene set - for current permutation if(!silent & i%%100 == 0) print(paste(i," permutations done.")) } GeneSets.pval <- apply(t(sam.sumsquareT.permut) >= sam.sumsquareT.ok ,1,sum)/nbPermutations names(GeneSets.pval)<-rownames(DATA) ##Above is the SAMGS part, the following codes are for the reduction part qobj <- NULL try(qobj <- qvalue(GeneSets.pval)) GeneSets.qval <- rep(NA,nrow(DATA)) if(!is.null(attr(qobj,"class"))){ GeneSets.qval <- qobj$qvalues } samvsgsea<-GeneSets.pval samvsgsea<-samvsgsea[order(GeneSets.qval) ] simresults=list() redset=list() redset.name<-list() rt=length(GeneSets.qval[GeneSets.qval<0.05]) #no of rows in results table redsetsizec=matrix(NA,rt,length(c)) for (op in 1:rt){ gsi=samvsgsea[op] sel.temp<-NULL sel.temp[1]<-which(rownames(DATA)==(names(gsi))) for (k in 1:(tp-1)){ sel.temp[k+1]<-k*nrow(DATA)+sel.temp[1] } digsi=diperm[sel.temp,] digsisq=digsi**2 order(-digsisq[,1]) #order the genes according to their significance. odigsisq=digsisq[order(-digsisq[,1]),] #Reduce gene set #Calculate gene set reduction p-value ns=nrow(odigsisq) #set size gsredpval=rep(0,(ns-1)) #pbar gsredpvalR=rep(0,(ns-1)) samgs=rep(NA,ns) for(g in 1:(ns-1)){ for (i in 2:(nbPermutations+1)){ if (sum(odigsisq[(g+1):ns,i])>=sum(odigsisq[(g+1):ns,1])) gsredpval[g]=gsredpval[g]+1 #pbar if (sum(odigsisq[1:g,i])>=sum(odigsisq[1:g,1])) gsredpvalR[g]=gsredpvalR[g]+1 samgs[g]=sum(odigsisq[(g+1):dim(odigsisq)[1],1]) } } gsredpval=gsredpval/(nbPermutations) #pbar-value #CK gsredpvalR=gsredpvalR/(nbPermutations) #p-value if (.05>=max(gsredpval)) redsetsizec[op,1]=ns else redsetsizec[op,1]=1+sum(c>=gsredpval) print(redsetsizec[op,1]) gspval=cbind(gsredpvalR,gsredpval) simresults[[op]]=gspval redset[[op]]=c(1:tp)[order(-digsisq[,1])][1:(redsetsizec[op,1])] names(redset[[op]])<-names(gsi) }# end of op iterations redset } #Other functions used (those functions were downloaded from Dr. Yasui’s homepage) rowMeansVars <- function(d,margin=1){ if(margin==2) d <- t(d) m <- rowMeans(d,na.rm=T) dif <- d - m ssd <- rowSums(dif^2,na.rm=T) list("means"=m, "sumsquaredif" =ssd, "vars" =ssd/(ncol(d)-1), "centered rows"=dif ) } #SAM statistics sam.TlikeStat <- function(DATA, cl=NULL, C1=NULL,C2=NULL, s0=NULL, s0.param=list(nb.Groups=100,mad.Const=.64), alternative=c("two.sided", "greater", "less")[1], conf.level =0.95 ){ if(!is.null(cl)){ cl <- as.factor(as.character(cl)) C1 <- which(as.numeric(cl)==1) C2 <- which(as.numeric(cl)==2) } if(is.null(C1) | is.null(C2))stop("Error -sam.TlikeStat : classes 1 and 2 are undefined.") nb.Genes<- nrow(DATA) nb.Samples<- ncol(DATA) C1.size<- length(C1) C2.size<- length(C2) stat.C1<- rowMeansVars(DATA[,C1]) stat.C2<- rowMeansVars(DATA[,C2]) diffmean.C1C2<- stat.C1$means - stat.C2$means pooledSqrtVar.C1C2<-sqrt((1/C1.size+1/C2.size)*(stat.C1$sumsquaredif+stat.C2$sumsquaredif)/(nb.Samples-2)) if(is.null(s0)){ nb.Groups <- s0.param$nb.Groups mad.Const <- s0.param$mad.Const tmp <- as.data.frame(cbind(pooledSqrtVar.C1C2,diffmean.C1C2)) tmp <- tmp[order(tmp[,1]),] group.Size<- as.integer(nb.Genes/nb.Groups) percentiles<- seq(0,1,.05) nb.Percentiles <- length(percentiles) s0.quantiles<- quantile(pooledSqrtVar.C1C2,percentiles) tt <- matrix(NA,nb.Groups,nb.Percentiles) coeffvar <- as.data.frame(cbind(s0.quantiles,rep(NA,nb.Percentiles))) for(j in 1:nb.Percentiles){ x<-matrix(tmp[1:(group.Size*nb.Groups),1]/(tmp[1:(group.Size*nb.Groups),2]+ s0.quantiles[j]),group.Size,nb.Groups) tt[,j]=apply(x,2,mad,constant=mad.Const,"na.rm" =TRUE) coeffvar[j,2] <- sd.na(tt[,j])/mean.na(tt[,j]) } s0 <- min(s0.quantiles[coeffvar[,2]==min(coeffvar[,2])]) } tstat <- diffmean.C1C2/(pooledSqrtVar.C1C2+s0) df <- nb.Samples-3 # <- s0 is a supplemental parameter to estimate if (alternative == "less") { pval <- pt(tstat, df) cint <- cbind(rep(-Inf,nb.Genes), tstat + qt(conf.level, df) ) }else if (alternative == "greater") { pval <- pt(tstat, df, lower = FALSE) cint <- cbind(tstat - qt(conf.level, df), rep(Inf,nb.Genes)) }else { pval <- 2 * pt(-abs(tstat), df) alpha <- 1 - conf.level cint <- qt(1 - alpha/2, df) cint <- cbind(tstat -cint,tstat + cint) } list(s0= s0, diffmean = diffmean.C1C2, pooledSqrtVar = pooledSqrtVar.C1C2, TlikeStat = tstat, "p.values (using Student law)"=pval, gm.C1 = stat.C1$means, gm.C2 = stat.C2$means, confidence.intervals=cint) }