# Sensitivity of gene-set over-representation tests to gene-set size:
# KEGG vs. EcoCyc annotation of E. coli (Figures S1-S5).
#
# Input files (provided in the supplement, read from the working directory):
#   pathway-gene-counts.txt  tab-separated, no header; column 2 is used as
#                            the vector of EcoCyc set sizes
#   kegg_ecocyc_69pairs.txt  table with a header; columns `keggSizes` and
#                            `ecocycSizes` are used
# Output file: set69.pdf (Figure S2).

# Dependencies ----

# Install only what is missing, so re-running the script does not re-install
# or upgrade packages every time.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
for (pkg in c("gage", "KEGGREST")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    BiocManager::install(pkg)
  }
}
if (!requireNamespace("ggpubr", quietly = TRUE)) {
  install.packages("ggpubr")
}

library(KEGGREST)
library(matrixStats)
library(plyr) # attached before dplyr/tidyverse so it does not mask dplyr verbs
library(tidyverse)
library(dplyr)
library(data.table)
library(reshape2)
library(ggpubr)
library(gage)

# Helpers ----

# Upper-tail hypergeometric probability P(X > x) for a set of size `m` in a
# universe of `N` genes when `r` genes are drawn. Same quantity as
# `1 - phyper(x, m, N - m, r)` but without the cancellation error.
# NOTE(review): this is the strict tail P(X > x), as in the original code.
# The usual over-representation p-value is P(X >= x), i.e.
# phyper(x - 1, ..., lower.tail = FALSE) -- confirm which one is intended.
upper_tail <- function(x, m, N, r) {
  phyper(x, m, N - m, r, lower.tail = FALSE)
}

# Value of `x` at the first position where `p_adj < alpha`, or Inf when no
# position qualifies (the value `min(which(...))` produced, minus the
# warning). `x` is expected to be in increasing order.
first_significant <- function(p_adj, x, alpha) {
  hits <- which(p_adj < alpha)
  if (length(hits) == 0L) {
    return(Inf)
  }
  as.numeric(x[hits[1L]])
}

#' Smallest significant overlap for every gene set
#'
#' For each candidate overlap in `x`, the upper-tail hypergeometric
#' probabilities of all sets are adjusted together (Benjamini-Hochberg).
#' For each set, the first value of `x` whose adjusted probability is below
#' `alpha` is reported.
#'
#' @param isX Numeric vector of gene-set sizes.
#' @param N Total number of annotated genes.
#' @param r Number of genes drawn (sample size).
#' @param x Candidate overlap sizes, in increasing order.
#' @param alpha Significance threshold for the adjusted probability.
#' @return A data frame with columns `X1` (set size) and `X2` (smallest
#'   significant value of `x`). Sets that never reach significance are
#'   dropped.
sensitivityX <- function(isX, N, r, x, alpha = 0.05) {
  # Rows are gene sets, columns are the candidate overlaps in `x`.
  p_chance <- outer(isX, x, function(m, q) upper_tail(q, m, N, r))

  # BH adjustment across sets, separately for every overlap size.
  p_adj <- p_chance
  for (i in seq_along(x)) {
    p_adj[, i] <- p.adjust(p_chance[, i], method = "BH")
  }

  min_x <- vapply(
    seq_along(isX),
    function(k) first_significant(p_adj[k, ], x, alpha),
    numeric(1)
  )

  sensX <- data.frame(X1 = isX, X2 = min_x)
  sensX[is.finite(sensX$X2), , drop = FALSE]
}

#' Critical sub-set size per sample size and gene set
#'
#' @param set_sizes Numeric vector of gene-set sizes.
#' @param N Total number of annotated genes.
#' @param sample_sizes Numeric vector of sample sizes (genes drawn).
#' @param x Candidate overlap sizes, in increasing order.
#' @param alpha Significance threshold for the adjusted probability.
#' @return A `length(sample_sizes)` x `length(set_sizes)` matrix holding the
#'   first value of `x` with adjusted probability below `alpha`, or `Inf`
#'   when none qualifies.
critical_subset_sizes <- function(set_sizes, N, sample_sizes, x,
                                  alpha = 0.01) {
  crit <- matrix(
    NA_real_,
    nrow = length(sample_sizes),
    ncol = length(set_sizes)
  )
  for (k in seq_along(set_sizes)) {
    # Rows are overlaps `x`, columns are sample sizes.
    p_chance <- outer(
      x, sample_sizes,
      function(q, n_drawn) upper_tail(q, set_sizes[k], N, n_drawn)
    )
    # NOTE(review): as in the original, BH is applied over the whole
    # (x, sample size) grid of one set, whereas sensitivityX() adjusts
    # across sets. Confirm this is the intended family of tests.
    p_adj <- matrix(p.adjust(p_chance, method = "BH"), nrow = length(x))
    crit[, k] <- vapply(
      seq_along(sample_sizes),
      function(j) first_significant(p_adj[, j], x, alpha),
      numeric(1)
    )
  }
  crit
}

# Gene-set sizes ----

# KEGG
kg.eco <- kegg.gsets("eco")
kg.sets <- kg.eco$kg.sets
kegg_geneSets_summary <- data.frame(
  set = names(kg.sets),
  count = lengths(kg.sets, use.names = FALSE)
)
isK <- sort(kegg_geneSets_summary$count) # vector of all KEGG set sizes

# EcoCyc; "pathway-gene-counts.txt" is provided in the supplement
ecocD <- read.csv("pathway-gene-counts.txt", header = FALSE, sep = "\t")
isE <- sort(ecocD$V2)

# Parameters ----

# Total number of annotated genes in KEGG E. coli pathways, on Apr. 16, 2020
n_genes_kegg <- 1690
# Number of non-redundant genes in EcoCyc pathways
n_genes_ecocyc <- 1093

r <- 100
x <- seq_len(50) # maximum number of discovered genes from any of the sets

sensK <- sensitivityX(isK, n_genes_kegg, r, x)
sensE <- sensitivityX(isE, n_genes_ecocyc, r, x)

# Figure S1 ----

sensK.melt <- reshape2::melt(sensK[, c("X1", "X2")], id = "X1")
sensE.melt <- reshape2::melt(sensE[, c("X1", "X2")], id = "X1")
sensK.melt[, "variable"] <- "KEGG"
sensE.melt[, "variable"] <- "EcoCyc"
data.melted <- rbind(sensK.melt, sensE.melt)
colnames(data.melted)[2] <- "annotation"

mu <- plyr::ddply(
  data.melted, "annotation", plyr::summarise,
  grp.mean = mean(value)
)

p1 <- ggboxplot(
  data.melted,
  x = "annotation", y = "value",
  color = "annotation", palette = "jco",
  add = "jitter", order = c("EcoCyc", "KEGG"), legend = "none"
) +
  scale_y_continuous(breaks = c(1, 2, 3, 4, 5, 10, 15, 20, 25)) +
  labs(
    y = "Minimum number of significantly expressed genes",
    x = "Annotation"
  ) +
  geom_hline(
    data = mu,
    aes(yintercept = grp.mean, color = annotation),
    linetype = "dashed"
  ) +
  stat_compare_means()

p2 <- gghistogram(
  data.melted,
  x = "value", color = FALSE, fill = "annotation",
  palette = "jco", legend.title = "Database", alpha = 0.5
) +
  labs(x = "Minimum number of significantly expressed genes", y = "Count") +
  theme(legend.position = c(0.8, .9))

ggarrange(p1, p2, labels = "AUTO")

# Set-size distributions; the largest KEGG set is left out.
isKsorted <- sort(isK, decreasing = TRUE)
isK1 <- data.frame(db = isKsorted[-1], annotation = "KEGG")
isE1 <- data.frame(db = isE, annotation = "EcoCyc")
isKE1 <- rbind(isK1, isE1)
mu1 <- plyr::ddply(isKE1, "annotation", plyr::summarise, grp.mean = mean(db))

# p3 <- gghistogram(isKE1, x = "db", color = "annotation",
#                   fill = "annotation", palette = "jco",
#                   linetype = "blank", alpha = 0.7,
#                   legend.title = "Database") +
#   labs(x = "Gene set sizes", y = "Frequency, counts")
# p4 <- ggboxplot(isKE1, x = "annotation", y = "db",
#                 color = "annotation", palette = "jco",
#                 add = "jitter", order = c("EcoCyc", "KEGG"),
#                 legend = "none") +
#   geom_hline(data = mu1, aes(yintercept = grp.mean, color = annotation),
#              linetype = "dashed") +
#   scale_y_continuous(breaks = c(6.2, 29.5, 100, 300)) +
#   labs(y = "Gene set sizes", x = "Annotation") +
#   stat_compare_means()
# ggarrange(p3, p4, labels = "AUTO")

# Figure S2 ----

# "kegg_ecocyc_69pairs.txt" is provided in the supplement
kk1 <- read.table("kegg_ecocyc_69pairs.txt", header = TRUE)

p_s2 <- ggscatter(
  kk1,
  x = "ecocycSizes", y = "keggSizes", add = "reg.line"
) +
  stat_cor(label.x = 45, label.y = 450) +
  stat_regline_equation(label.x = 45, label.y = 430) +
  xlab("Sizes of EcoCyc sets") +
  ylab("Sizes of KEGG sets")
print(p_s2)
# Pass the plot explicitly instead of relying on ggplot2's last-plot state.
ggsave("set69.pdf", plot = p_s2)

# Figure S3 ----

sample_sizes <- seq(5, 600, 1)
x <- seq_len(100) # maximum number of discovered genes from any of the sets

# Critical sub-set sizes: rows are sample sizes, columns are pathway pairs.
idx <- critical_subset_sizes(
  kk1$keggSizes, n_genes_kegg, sample_sizes, x,
  alpha = 0.01
)
idxE <- critical_subset_sizes(
  kk1$ecocycSizes, n_genes_ecocyc, sample_sizes, x,
  alpha = 0.01
)

# Entries are Inf or NaN wherever a critical size was never reached.
criticalRat <- idx / idxE
cbind(
  apply(criticalRat, 2, min),
  apply(criticalRat, 2, median),
  apply(criticalRat, 2, max)
)
medcritRat <- apply(criticalRat, 2, median)
mean(apply(criticalRat, 2, median))
sort(medcritRat)
(meds <- colMedians(criticalRat, na.rm = TRUE))
cR1 <- criticalRat[, order(meds)]
boxplot(
  cR1,
  ylab = "Ratio of critical sub-set sizes, KEGG/EcoCyc",
  xlab = "Critical sub-sets of analogous pathways,\nordered by median ratio "
)

# Figure S4 ----

pathway_idx <- 33 # pathway pair shown in Figures S4 and S5

df <- data.frame(
  ratio = criticalRat[, pathway_idx],
  sampleSize = sample_sizes
)
ggplot(df, aes(x = sampleSize, y = ratio)) +
  geom_point() +
  geom_line() +
  labs(
    x = "Sample size",
    y = "Ratio of critical sub-set sizes, KEGG/EcoCyc"
  )

# Figure S5 ----

# NOTE(review): columns 5 and 6 of kk1 are presumably keggSizes and
# ecocycSizes; confirm against the file and switch to column names.
props <- data.frame(
  size = sample_sizes,
  fkegg = idx[, pathway_idx] / kk1[pathway_idx, 5],
  feco = idxE[, pathway_idx] / kk1[pathway_idx, 6]
)
propPiv <- pivot_longer(props, starts_with("f"))
ggplot(propPiv, aes(x = size, y = value, colour = name)) +
  geom_point() +
  geom_line() +
  labs(x = "Sample size", y = "Fraction of the set") +
  scale_color_manual(
    name = "Annotation",
    labels = c("EcoCyc", "KEGG"),
    values = c("#56B4E9", "#E69F00")
  ) +
  theme_classic()