### Packages and definition used in data preparations and analysis library(KEGGgraph) library(KEGG.db) library(RColorBrewer) library(Rgraphviz) library(org.Hs.eg.db) #in our analysis we used only human data. library(RBGL) library(qvalue) library(affy) library(gtools) relacje <- c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0,0,0,0,0) names(relacje) <- c("activation", "compound", "binding/association", "expression","inhibition", "activation_phosphorylation", "phosphorylation","indirect","inhibition_phosphorylation", "dephosphorylation_inhibition","dissociation", "dephosphorylation","activation_dephosphorylation", "state", "activation_indirect", "inhibition_ubiquination","ubiquination", "expression_indirect", "indirect_inhibition","repression", "binding/association_phosphorylation", "dissociation_phosphorylation","indirect_phosphorylation","missing interaction","hidden compound", "indirect effect","state change","undef") # we used only two types of conections expression (1) and repression (-1). ### ### Data preparation #Here we just extracting info about types of relations and distances between nodes and leafs in each of analyzed pathways. These values depend only on pathway definitions and may be computed only once. v <- paste("../KEGGxml_PosCons/",dir("../KEGGxml_PosCons/"),sep="") #read def on pathways in which consistent subgraph is possible (downloaded from KEGG website) for (infile in v){ G <- parseKGML(infile) # reading a definition of a particular pathway G <- KEGGpathway2Graph(G, genesOnly=TRUE) { if(length(getSubtype(G))>0){ rel <- lapply(getSubtype(G), function(a) { c( ifelse(length(a)==0,"undef",getName(a[[1]])[[1]]), #gathering names and values of single relations in a particular pathway ifelse(length(a)==0,"undef",getValue(a[[1]])[[1]]) ) }) V<-as.data.frame(t(sapply(rel,function(x) { c(x,relacje[x[1]]) }))) colnames(V)<-c("name","mark","value") outfile<-paste("nodesRel/desc_rel_",strsplit(strsplit(infile,"\\/")[[1]][3],"\\.")[[1]][1],".txt",sep="") write.table(V,file=outfile,quote=FALSE,sep="\t") #saving description of relations in a particular pathway } } } #for each pathway we compute distances between each node and leafs files <- dir("KEGGxml_PosCons/") length(files) for(i in 1:length(files)) { M <- parseKGML(paste("KEGGxml_PosCons/",files[i],sep="")); G <- KEGGpathway2Graph(M, genesOnly=TRUE) imp2 <- nodes(G)[(degree(G)[[1]]==0) & (degree(G)[[2]]==0)] #founding nodes witout connections imp3 <- nodes(G)[(degree(G)[[1]]!=0) & (degree(G)[[2]]==0)] #founding leafs dist2 <- sapply(nodes(G),function(y) { { if (sum(y%in%imp2)) {Inf} else if (sum(y%in%imp3)) {0} else { d <- dijkstra.sp(ugraph(G),y)$distance #computing the distance d <- d[imp3] d <- d[!is.na(d)] min(d) } } }) outfile <- paste("distance/path_",gsub(".xml",".txt",files[i]),sep="") write.table(dist2,file=outfile,quote=FALSE,sep="\t") # saving distances for a particular pathway } ### ### Functions needed to perform ACST names <- NULL; GG <- as.list(NULL); VVg <- NULL; G <- as.list(NULL); re_MEN<-function(X,gr1,gr2){ #computing local statistic tt <- apply(X,1,function(x) t.test(x[gr1],x[gr2])$stat) tt } przygotuj<-function(MA_file,files,MM=TRUE){ #reading microarray data and eliminating non-measured genes from pathways #uploading pathways Vg <- read.delim(MA_file,header=T); geny<-NULL; rownames(Vg) <- translateGeneID2KEGGID(rownames(Vg)) OK <- sapply(1:length(files), function(i){ M <- parseKGML(paste("KEGGxml_PosCons/",files[i],sep="")); names <<- c(names,gsub(" ","_",getTitle(M))) G[[i]] <<- KEGGpathway2Graph(M, genesOnly=TRUE) #selecting genes measured in microarray experiment detected <- nodes(G[[i]]) %in% rownames(Vg) ups <- nodes(G[[i]])[detected] GG[[i]] <<- subKEGGgraph(ups, G[[i]]) }) names(names)<<- files; files2 <- dir("KEGGxml/") OK <- sapply(1:length(files2), function(i){ M <- parseKGML(paste("KEGGxml/",files2[i],sep="")); g1 <- unlist(sapply(nodes(M),function(x) x@name)) geny <<- unique(c(geny,g1[grep("hsa:",g1)])) # in case of using non-human data the "hsa:" should be poprely changed }) VVg <<- Vg[rownames(Vg)%in%geny,] } losuj_perm<-function(Ncol,Ngr,n){ #computing permutations AA<-matrix(sample(Ncol,Ngr),nrow=1); i<-1 while(nrow(AA)1); L<-Lp[v1==1] if(length(v2)>0){ L2 <- List[v2]; Lp2 <- Lp[v2] Lt <- list(L2[[1]]); Lpt<-list(Lp2[[1]]) while(length(L2)>0){ aa <- L2[[1]]; aap<-Lp2[[1]] v <- which(sapply(L2,function(x) any(aa%in%x))) aa <- unique(unlist(L2[v])); aap<-unique(unlist(Lp2[v])) w <- which(sapply(Lt,function(x) any(aa%in%x))) { if (length(w)==0) {Lt[[length(Lt)+1]] <- aa; Lpt[[length(Lpt)+1]] <- aap} else { aa<-unique(c(aa,unlist(Lt[w]))); aap<-unique(c(aap,unlist(Lpt[w]))); Lt<-Lt[-w]; Lpt<-Lpt[-w]; Lt[[length(Lt)+1]]<-aa; Lpt[[length(Lpt)+1]]<-aap } } Lp2 <- Lp2[-v]; L2<-L2[-v] } L <- c(Lpt,L) } } L } ConsSons2<-function(node,reV,GG,rel,relacje){ #choosing of consistent sons List <- NULL sons <- edges(GG)[[node]]; if(length(sons)>0){ for(x in sons){ gs<-paste(node,"~",x,sep="") if (relacje[rel[gs]]*reV[node]*reV[x]>0) {List<-c(List,gs)} } } List } findConsPath4<-function(reV,GG,rel,relacje){ #gathering consistent subgraphs Paths <- NULL nodes <- nodes(GG)[(degree(GG)[[2]]!=0)] for (node in nodes){ family <- ConsSons2(node,reV,GG,rel,relacje) { if (length(Paths)>0) { ss <- FALSE Paths <-lapply(Paths,function(oo) { { if(any(family%in%oo)) { ss <<- TRUE unique(c(family,oo)) } else oo } }); if (!ss) Paths[[length(Paths)+1]] <- c(unique(family)) } else Paths[[1]] <- family; } } if(length(Paths)>0) Paths<-Inter2(Paths) Paths } ocen<-function(List,reV,GG,D){ #computing global statistic reV2 <- reV^2 reVme <- (reV2-mean(reV2))/sd(reV2) d2 <- D[,1] d2 <- 1/(d2+1) names(d2) <- rownames(D); if (length(List)>0) List_nodes <- lapply(List,function(x) unique(unlist(strsplit(x,"~")))) if (length(List_nodes)>0){ s2 <- sapply(1:length(List_nodes),function(i){ l <- List_nodes[[i]]; sum(reVme[l])*max(d2[l]) }) } sum(s2) } score <- function(gr1,gr2,folder,string,VVg,GG,rel,names,maxPP,D){ #performing ACST reV<-re_MEN(VVg,gr1,gr2); AA <- sapply(1:length(GG),function(i){ if (numNodes(GG[[i]])!=0){ List <- findConsPath4(reV,GG[[i]],rel[[i]],relacje) ocen(List,reV,GG[[i]],D[[i]]) } else 0 }) AA <- t(AA) rownames(AA) <- names; write.table(AA,file=paste(folder,"/res_",string,".txt",sep=""),col.names=F,sep="\t",quote=F) #all global statistics (for original and permutated labels for) are seved in one file } ### ### Performing ACST # In the first step we read expression data, the types of relations (please see Data preparation) and the distances (please see Data preparation) #uploading pathway definitions files <- dir("KEGGxml_PosCons/") #reading previously saved types of relations rel <- lapply(files, function(infile){ r1 <- read.delim(file=paste("nodesRel/desc_rel_",strsplit(infile,"\\.")[[1]][1],".txt",sep=""),header=TRUE) r2 <- as.vector(r1[,1]); names(r2) <- rownames(r1) r2 }) #reading previously saved distances D <- lapply(files,function(infile){ file <- paste("distance/path_",gsub(".xml",".txt",infile),sep="") read.delim(file=file,header=TRUE,sep="\t") }) #Information regarding microarray data gr1 <- #number of columns containing (for instance) tumor samples gr2 <- #number of columns containing (for instance) normal samples ExpData <- "data.txt" #file containing microarray data res <- "ResDir" #folder in which results should be saved #computing number of analyzed samples N <- length(gr1)+length(gr2) #reading the data aa<-przygotuj(ExpData,files) #selecting permutations com<-losuj_perm(N,length(gr2),1000) # jesli duze dane if (nrow(com)>1001){ com<-com[sample(1:nrow(com),1001),] } ee<-sapply(1:nrow(com),function(ii) { sum(com[ii,]%in%gr2)==length(gr2) }) com<-com[!ee,][1:min(nrow(com[!ee,]),1000),] #computing global statistics for original permutations score(gr1,gr2,res,"org",VVg,GG,rel,names,maxPP,D) #computing global statistics for permuted permutations sapply(1:nrow(com), function(i){ gr2 <- com[i,] gr1 <- c(1:N)[-gr2] score(gr1,gr2,res,i,VVg,GG,rel,names,maxPP,D) TRUE }) ### ### Computing significance of results p_val <- function(v,ii){#computing p-values v1 <- v[ii]; sum(v>=v1)/length(v) } results_pv<-function(ff){#computinf p-value for all (original and permutated) global statisics files <- paste(ff,dir(ff)[-grep("res",dir(ff))],sep="") V <- read.table(file=paste(ff,"res_org.txt",sep=""),header=F) FF <- lapply(files, function(aa) read.table(file=aa,header=F)) TAB <- cbind(V[,1+1],sapply(FF,function(M) M[,1+1])) v <- sapply(1:nrow(TAB),function(j) p_val(TAB[j,],1)) names(v) <- as.vector(V[,1]) v } #computing FDR (permutation model) r_m <- function(p,x){ sum(x<=p) } s <- function(p,PV){ r_m(p,PV[,1])-mean(sapply(2:ncol(PV),function(ii) r_m(p,PV[,ii]))) } res_fdr <- function(p,PV){ p2 <- PV[PV[,1]>=p,1] vv <- sapply(p2,function(pp){ S <- max(s(pp,PV),0) mean(sapply(2:ncol(PV),function(ii) {Rm <- r_m(pp,PV[,ii]); ifelse(Rm!=0 | S!=0,(Rm)/(Rm+S),0) })) }) min(vv) } results_fdr<-function(ff,th=.25){ #obtaining pathways with FDR below given threshold (th) files <- paste(ff,dir(ff)[-grep("org",dir(ff))],sep="") V <- read.table(file=paste(ff,"res_org.txt",sep=""),header=F) FF <- lapply(files, function(aa) read.table(file=aa,header=F)) TAB <- cbind(V[,1+1],sapply(FF,function(M) M[,1+1])) PV <- sapply(1:ncol(TAB), function(i) sapply(1:nrow(TAB),function(j) p_val(TAB[j,],i))) fdr <- rep(7,nrow(PV)) w <- sort(unique(PV[,1])) i <- 1; p <- 0 while(p