### Packages and definition used in data preparations and analysis

library(KEGGgraph)
library(KEGG.db)
library(RColorBrewer)
library(Rgraphviz)
library(org.Hs.eg.db) #in our analysis we used only human data.
library(RBGL)
library(qvalue)
library(affy)
library(gtools)

relacje <- c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0,0,0,0,0) 
names(relacje) <- c("activation", "compound", "binding/association", "expression","inhibition", "activation_phosphorylation", "phosphorylation","indirect","inhibition_phosphorylation", "dephosphorylation_inhibition","dissociation", "dephosphorylation","activation_dephosphorylation", "state", "activation_indirect", "inhibition_ubiquination","ubiquination", "expression_indirect", "indirect_inhibition","repression", "binding/association_phosphorylation", "dissociation_phosphorylation","indirect_phosphorylation","missing interaction","hidden compound", "indirect effect","state change","undef") # we used only two types of conections expression (1) and repression (-1). 

###

### Data preparation
#Here we just extracting info about types of relations and distances between nodes and leafs in each of analyzed pathways. These values depend only on pathway definitions and may be computed only once.

v <- paste("../KEGGxml_PosCons/",dir("../KEGGxml_PosCons/"),sep="") #read def on pathways in which consistent subgraph is possible (downloaded from KEGG website)

for (infile in v){
	G <- parseKGML(infile) # reading a definition of a particular pathway
	G <- KEGGpathway2Graph(G, genesOnly=TRUE)
	{
	if(length(getSubtype(G))>0){
	  rel <- lapply(getSubtype(G), function(a) {
					  c(
					  ifelse(length(a)==0,"undef",getName(a[[1]])[[1]]), #gathering names and values of single relations in a particular pathway
					  ifelse(length(a)==0,"undef",getValue(a[[1]])[[1]])
					  )
			  })
	  V<-as.data.frame(t(sapply(rel,function(x)	{
			  c(x,relacje[x[1]])
			  })))
	  colnames(V)<-c("name","mark","value")
	  outfile<-paste("nodesRel/desc_rel_",strsplit(strsplit(infile,"\\/")[[1]][3],"\\.")[[1]][1],".txt",sep="")
	  write.table(V,file=outfile,quote=FALSE,sep="\t") #saving description of relations in a particular pathway
	}
	}
}

#for each pathway we compute distances between each node and leafs 
files <- dir("KEGGxml_PosCons/")
length(files)
for(i in 1:length(files)) {
	M     <- parseKGML(paste("KEGGxml_PosCons/",files[i],sep="")); 
	G     <- KEGGpathway2Graph(M, genesOnly=TRUE)	
	imp2  <- nodes(G)[(degree(G)[[1]]==0) & (degree(G)[[2]]==0)] #founding nodes witout connections
	imp3  <- nodes(G)[(degree(G)[[1]]!=0) & (degree(G)[[2]]==0)] #founding leafs	
	dist2  <- sapply(nodes(G),function(y) {
			{
			if (sum(y%in%imp2)) {Inf}
			else if (sum(y%in%imp3)) {0}
			else {	
				d <- dijkstra.sp(ugraph(G),y)$distance #computing the distance
				d <- d[imp3]
				d <- d[!is.na(d)]
				min(d)
			      }
			}
			})

	outfile <- paste("distance/path_",gsub(".xml",".txt",files[i]),sep="")
	write.table(dist2,file=outfile,quote=FALSE,sep="\t") # saving distances for a particular pathway
	}

###

### Functions needed to perform ACST
names <- NULL; GG <- as.list(NULL); VVg <- NULL; G <- as.list(NULL); 

re_MEN<-function(X,gr1,gr2){ #computing local statistic
  tt <- apply(X,1,function(x) t.test(x[gr1],x[gr2])$stat)
  tt
}

przygotuj<-function(MA_file,files,MM=TRUE){ #reading microarray data and eliminating non-measured genes from pathways
#uploading pathways
	Vg           <- read.delim(MA_file,header=T); geny<-NULL;
	rownames(Vg) <- translateGeneID2KEGGID(rownames(Vg))
	OK <- sapply(1:length(files), function(i){
		M      <-  parseKGML(paste("KEGGxml_PosCons/",files[i],sep="")); 
		names  <<- c(names,gsub(" ","_",getTitle(M)))		
		G[[i]] <<- KEGGpathway2Graph(M, genesOnly=TRUE)	
#selecting genes measured in microarray experiment
		detected   <-  nodes(G[[i]]) %in% rownames(Vg)
		ups        <-  nodes(G[[i]])[detected]
		GG[[i]]    <<- subKEGGgraph(ups, G[[i]])
	})
	names(names)<<- files;
	files2 <- dir("KEGGxml/")
	OK <- sapply(1:length(files2), function(i){
		M    <-  parseKGML(paste("KEGGxml/",files2[i],sep="")); 
		g1   <-  unlist(sapply(nodes(M),function(x) x@name))
		geny <<-  unique(c(geny,g1[grep("hsa:",g1)])) # in case of using non-human data the "hsa:" should be poprely changed
	})
	VVg         <<- Vg[rownames(Vg)%in%geny,]
}


losuj_perm<-function(Ncol,Ngr,n){ #computing permutations
	AA<-matrix(sample(Ncol,Ngr),nrow=1); i<-1
		while(nrow(AA)<n+1){
		  aa  <- sample(Ncol,Ngr)
		  aa2 <- sapply(1:nrow(AA),function(i) length(intersect(AA[i,],aa))==Ngr)
		  if (sum(aa2)==0) {AA<-rbind(AA,aa); }
	}
	AA
}


Inter2 <- function(List){ #merging single consistent relations
	Lp   <- List
	List <- lapply(List,function(p) unique(unlist(strsplit(p,"~"))))
	{
	v1<-sapply(List,function(x) sum(sapply(List,function(y) any(y%in%x))))
	v2<-which(v1>1); 
	L<-Lp[v1==1]
	if(length(v2)>0){
		L2 <- List[v2]; Lp2 <- Lp[v2]
		Lt <- list(L2[[1]]); Lpt<-list(Lp2[[1]])
			while(length(L2)>0){
			aa  <- L2[[1]];  aap<-Lp2[[1]]
			v   <- which(sapply(L2,function(x) any(aa%in%x)))
			aa  <- unique(unlist(L2[v])); aap<-unique(unlist(Lp2[v]))
			w   <- which(sapply(Lt,function(x) any(aa%in%x)))
			{
			if (length(w)==0) {Lt[[length(Lt)+1]] <- aa; Lpt[[length(Lpt)+1]] <- aap}
			else {
			      aa<-unique(c(aa,unlist(Lt[w]))); aap<-unique(c(aap,unlist(Lpt[w]))); 
			      Lt<-Lt[-w]; Lpt<-Lpt[-w]; 	
			      Lt[[length(Lt)+1]]<-aa; Lpt[[length(Lpt)+1]]<-aap
			     }
			}
			Lp2 <- Lp2[-v]; L2<-L2[-v]			
			}
		L <- c(Lpt,L)
		}
}	
L
}


ConsSons2<-function(node,reV,GG,rel,relacje){ #choosing of consistent sons
	List <- NULL
	sons <- edges(GG)[[node]];
	if(length(sons)>0){
		for(x in sons){
				gs<-paste(node,"~",x,sep="")
 				if (relacje[rel[gs]]*reV[node]*reV[x]>0)  {List<-c(List,gs)}
		}	
	}
	List
}

findConsPath4<-function(reV,GG,rel,relacje){ #gathering consistent subgraphs
	Paths <- NULL
	nodes <- nodes(GG)[(degree(GG)[[2]]!=0)]
	for (node in nodes){
		family     <- ConsSons2(node,reV,GG,rel,relacje)
		{
		if (length(Paths)>0) {
 			ss <- FALSE
			Paths <-lapply(Paths,function(oo) {
 						{
 						if(any(family%in%oo)) {
 									ss <<- TRUE
 									unique(c(family,oo))
 									}
 						else oo
 						}
 					});
 			if (!ss) Paths[[length(Paths)+1]] <- c(unique(family))
			}
		else 
			Paths[[1]] <- family;
		}
	}
	if(length(Paths)>0) Paths<-Inter2(Paths)
    Paths
}


ocen<-function(List,reV,GG,D){ #computing global statistic
	reV2  <- reV^2
	reVme <- (reV2-mean(reV2))/sd(reV2)
	d2    <- D[,1]
	d2          <- 1/(d2+1)
	names(d2)   <- rownames(D); 
	if (length(List)>0) 
			  List_nodes <- lapply(List,function(x) unique(unlist(strsplit(x,"~"))))
	if (length(List_nodes)>0){
				s2   <- sapply(1:length(List_nodes),function(i){
					      l <- List_nodes[[i]]; 
  					       sum(reVme[l])*max(d2[l])
					      })
		                 }
sum(s2)
}

score <- function(gr1,gr2,folder,string,VVg,GG,rel,names,maxPP,D){ #performing  ACST
	reV<-re_MEN(VVg,gr1,gr2);
	AA <- sapply(1:length(GG),function(i){
		if (numNodes(GG[[i]])!=0){
 		  List <- findConsPath4(reV,GG[[i]],rel[[i]],relacje) 
		  ocen(List,reV,GG[[i]],D[[i]])
		    }
		else 0
		})
	AA <- t(AA)
	rownames(AA) <- names;
 	write.table(AA,file=paste(folder,"/res_",string,".txt",sep=""),col.names=F,sep="\t",quote=F) #all global statistics (for original and permutated labels for) are seved in one file
}
###

### Performing ACST
# In the first step we read expression data, the types of relations (please see Data preparation) and the distances (please see Data preparation)

#uploading pathway definitions
files <- dir("KEGGxml_PosCons/")
#reading previously saved types of relations 
rel <- lapply(files, function(infile){
		r1 <- read.delim(file=paste("nodesRel/desc_rel_",strsplit(infile,"\\.")[[1]][1],".txt",sep=""),header=TRUE)
		r2 <- as.vector(r1[,1]); names(r2) <- rownames(r1)
		r2
	})
#reading previously saved distances
D   <- lapply(files,function(infile){
	   file <- paste("distance/path_",gsub(".xml",".txt",infile),sep="")
	   read.delim(file=file,header=TRUE,sep="\t") 
	})

#Information regarding microarray data
gr1     <- #number of columns containing (for instance) tumor samples  
gr2     <- #number of columns containing (for instance) normal samples 
ExpData <- "data.txt" #file containing microarray data
res     <- "ResDir"  #folder in which results should be saved

#computing number of analyzed samples
	N       <- length(gr1)+length(gr2)
#reading the data
	aa<-przygotuj(ExpData,files)
#selecting permutations
	com<-losuj_perm(N,length(gr2),1000) # jesli duze dane
	if (nrow(com)>1001){
		 com<-com[sample(1:nrow(com),1001),]
	  }
	ee<-sapply(1:nrow(com),function(ii) {
		sum(com[ii,]%in%gr2)==length(gr2)
	})
	com<-com[!ee,][1:min(nrow(com[!ee,]),1000),]

#computing global statistics for original permutations
score(gr1,gr2,res,"org",VVg,GG,rel,names,maxPP,D)
#computing global statistics for permuted permutations
sapply(1:nrow(com), function(i){
		gr2 <- com[i,]
		gr1 <- c(1:N)[-gr2]
		score(gr1,gr2,res,i,VVg,GG,rel,names,maxPP,D)
		TRUE
		})

###

### Computing significance of results

p_val <- function(v,ii){#computing p-values
	v1 <- v[ii];
	sum(v>=v1)/length(v)
}

results_pv<-function(ff){#computinf p-value for all (original and permutated) global statisics
	files     <- paste(ff,dir(ff)[-grep("res",dir(ff))],sep="")
	V         <- read.table(file=paste(ff,"res_org.txt",sep=""),header=F)
 	FF        <- lapply(files, function(aa) read.table(file=aa,header=F))
	TAB       <- cbind(V[,1+1],sapply(FF,function(M) M[,1+1])) 
 	v         <- sapply(1:nrow(TAB),function(j) p_val(TAB[j,],1))
	names(v)  <- as.vector(V[,1])
	v
}
#computing FDR (permutation model)
r_m <- function(p,x){ 
	sum(x<=p) 
}

s <- function(p,PV){
	r_m(p,PV[,1])-mean(sapply(2:ncol(PV),function(ii) r_m(p,PV[,ii])))
}

res_fdr <- function(p,PV){
	    p2 <- PV[PV[,1]>=p,1]
	    vv <- sapply(p2,function(pp){
	      S  <- max(s(pp,PV),0)
	      mean(sapply(2:ncol(PV),function(ii) {Rm <- r_m(pp,PV[,ii]); ifelse(Rm!=0 | S!=0,(Rm)/(Rm+S),0)  }))
	    })
	    min(vv)
}

results_fdr<-function(ff,th=.25){ #obtaining pathways with FDR below given threshold (th)
	files     <- paste(ff,dir(ff)[-grep("org",dir(ff))],sep="")
	V         <- read.table(file=paste(ff,"res_org.txt",sep=""),header=F)
 	FF        <- lapply(files, function(aa) read.table(file=aa,header=F))
	TAB       <- cbind(V[,1+1],sapply(FF,function(M) M[,1+1])) 
 	PV        <- sapply(1:ncol(TAB), function(i) sapply(1:nrow(TAB),function(j) p_val(TAB[j,],i)))
	fdr       <- rep(7,nrow(PV))
	w         <- sort(unique(PV[,1]))
	i         <- 1; 
        p         <- 0
	while(p<th){
	  ww        <- which(PV[,1]==w[i])
	  temp      <- sapply(ww, function(ee) res_fdr(PV[ee,1],PV))
	  fdr[ww]   <- temp 
	  p         <- temp[1]
	  i         <- i+1
	}
	Ap     <- read.delim("PathNames.txt",sep="\t",header=TRUE)
	Ap[,1] <- gsub(".xml","",Ap[,1])
	VV     <- data.frame(as.vector(V[,1]),Ap[,1],PV[,1],fdr)
	colnames(VV) <- c("PathName","PathID","p-val","adj.p-val")
	VV
}

results_fdr("ResDir")