############################################################################################################################### ######### Downstream analysis for "An RNA-Seq atlas of gene expression in mouse and rat normal tissues" ######### by Julia F. Soellner, German Leparc, Tobias Hildebrandt, Holger Klein, Leo Thomas, Elia Stupka and Eric Simon ############################################################################################################################### ########################################## ######## setup ########################################## library(limma) library(edgeR) library(ggplot2) library(ggdendro) library(RColorBrewer) library(pheatmap) library(grid) library(gridExtra) library(RMySQL) library(igraph) library(data.table) ## color pallett colors13 <- c(rgb(113,192,94, maxColorValue=255), rgb(159,73,213, maxColorValue=255), rgb(182,207,58, maxColorValue=255), rgb(81,44,119, maxColorValue=255), rgb(165,157,87, maxColorValue=255), rgb(118,123,212, maxColorValue=255), rgb(206,95,49, maxColorValue=255), rgb(99,168,192, maxColorValue=255), rgb(199,70,131, maxColorValue=255), rgb(118,186,149, maxColorValue=255), rgb(177,133,183, maxColorValue=255), rgb(146,90,72, maxColorValue=255), rgb(110,100,103, maxColorValue=255)) ########################################## ######## data import & pre-processsing ########################################## ## create environments to work with both species at the same time mouseEnv <- new.env() ratEnv <- new.env() ## load the data (counts, rpkm, tpm) mouseEnv$counts <- read.table("./data/mouse_counts.txt", check.names = F) mouseEnv$rpkm <- read.table("./data/mouse_rpkm.txt", check.names = F) mouseEnv$tpm <- read.table("./data/mouse_tpm.txt", check.names = F) mouseEnv$pre.design <- read.table("./data/mouse_design.txt", check.names = F) ratEnv$counts <- read.table("./data/rat_counts.txt", check.names = F) ratEnv$rpkm <- read.table("./data/rat_rpkm.txt", check.names = F) ratEnv$tpm <- read.table("./data/rat_tpm.txt", check.names = F) ratEnv$pre.design <- read.table("./data/rat_design.txt", check.names = F) ## use limma to get voom-normalized log(cpm) values for (e in c(mouseEnv, ratEnv)) { group <- factor(e$pre.design[,"group"]) design <- model.matrix(~0+group) colnames(design) <- levels(group) ## put counts into DGE object and normalize dge <- DGEList(counts=e$counts) ## ignore if not expressed in at least one sample isexpr <- rowSums(cpm(dge)>1) >= 1 dge <- dge[isexpr,keep.lib.sizes=FALSE] dge <- calcNormFactors(dge) e$v <- voom(dge,design) } ## in rat we have one sample annotated as "unknown" for which we have no information regarding tissue etc. ## => remove the sample ratEnv$counts <- ratEnv$counts[,which(!grepl("Unknown", colnames(ratEnv$counts)))] ratEnv$rpkm <- ratEnv$rpkm[,which(!grepl("Unknown", colnames(ratEnv$rpkm)))] ratEnv$tpm <- ratEnv$tpm[,which(!grepl("Unknown", colnames(ratEnv$tpm)))] ratEnv$v$E <-ratEnv$v$E[,which(!grepl("Unknown", colnames(ratEnv$v$E)))] ratEnv$pre.design <-ratEnv$pre.design[which(!ratEnv$pre.design$group=="Unknown"),] ## rat: rename "Kindeys" to "Kidney" colnames(ratEnv$counts) <- gsub("Kidneys", "Kidney", colnames(ratEnv$counts)) colnames(ratEnv$rpkm) <- gsub("Kidneys", "Kidney", colnames(ratEnv$rpkm)) colnames(ratEnv$tpm) <- gsub("Kidneys", "Kidney", colnames(ratEnv$tpm)) colnames(ratEnv$v$E) <- gsub("Kidneys", "Kidney", colnames(ratEnv$v$E)) ratEnv$pre.design$group <- droplevels(factor(gsub("Kidneys", "Kidney", ratEnv$pre.design$group))) ratEnv$pre.design$sample <- droplevels(factor(gsub("Kidneys", "Kidney", ratEnv$pre.design$sample))) ## in mouse we have one technical outlier ## => remove the sample mouseEnv$counts <- mouseEnv$counts[,which(!grepl("199_11", colnames(mouseEnv$counts)))] mouseEnv$rpkm <- mouseEnv$rpkm[,which(!grepl("199_11", colnames(mouseEnv$rpkm)))] mouseEnv$tpm <- mouseEnv$tpm[,which(!grepl("199_11", colnames(mouseEnv$tpm)))] mouseEnv$v$E <- mouseEnv$v$E[,which(!grepl("199_11", colnames(mouseEnv$v$E)))] mouseEnv$pre.design <- mouseEnv$pre.design[which(!grepl("199_11", mouseEnv$pre.design$sample)),] ########################################## ######## Prinicipal component analysis ########################################## ## mouse - get tissue association mGroup.list <- unique(mouseEnv$pre.design$group) mGroup <- mouseEnv$pre.design$group mGroup.tags<-as.vector(mGroup) names(mGroup.tags)<-colnames(mouseEnv$v$E) ## rat - get tissue association rGroup.list <- unique(ratEnv$pre.design$group) rGroup <- ratEnv$pre.design$group rGroup.tags<-as.vector(rGroup) names(rGroup.tags)<-colnames(ratEnv$v$E) ## calculate PCA mPrc <- prcomp(t(mouseEnv$v$E), scale=TRUE) rPrc <- prcomp(t(ratEnv$v$E), scale=TRUE) ## calculate % variance explained mPcv <- (mPrc$sdev^2/sum(mPrc$sdev^2))*100 names(mPcv) <- paste0("PC", 1:length(mPcv)) rPcv <- (rPrc$sdev^2/sum(rPrc$sdev^2))*100 names(rPcv) <- paste0("PC", 1:length(rPcv)) ## create screeplot (variance per PC) mDf <- data.frame(PC=1:10, var=mPcv[1:10]) mScree <- ggplot(data=mDf, aes(x=PC, y=var)) + geom_line(color="grey")+ geom_point() + theme_bw(base_size = 20) + scale_x_continuous(limits=c(1,10), breaks=1:10) + ylim(c(0,22)) + ggtitle("Mouse") + ylab("% explained variance") mScree <- arrangeGrob(mScree, top = textGrob("a", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold", fontsize=20))) rDf <- data.frame(PC=1:10, var=rPcv[1:10]) rScree <- ggplot(data=rDf, aes(x=PC, y=var)) + geom_line(color="grey")+ geom_point() + theme_bw(base_size = 20) + scale_x_continuous(limits=c(1,10), breaks=1:10) + ylim(c(0,22)) + ggtitle("Rat") + ylab("% explained variance") rScree <- arrangeGrob(rScree, top = textGrob("b", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold", fontsize=20))) grid.arrange(mScree, rScree, ncol = 2, newpage = TRUE) ## prepare PCA results for ggplot mPca.df <- data.frame(group=as.character(mGroup), mPrc$x) rPca.df <- data.frame(group=as.character(rGroup), rPrc$x) ## create scatter plots of first PCs mPc1.pc2 <- ggplot(mPca.df,aes(x=PC1,y=PC2,color=group,shape=group)) + geom_point(size=3) + scale_shape_manual(values=1:length(unique(mGroup))) + xlab(paste0("PC1 (", round(mPcv["PC1"], digits=2), " %)")) + ylab(paste0("PC2 (", round(mPcv["PC2"], digits=2), " %)")) + theme_bw() + ggtitle("Mouse") + labs(color="tissue", shape="tissue") + scale_color_manual(values = colors13) + theme(legend.position="none") + coord_fixed(ratio = 1.5) mPc1.pc2 <- arrangeGrob(mPc1.pc2, top = textGrob("a", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold"))) rPc1.pc2 <- ggplot(rPca.df,aes(x=PC1,y=PC2,color=group,shape=group)) + geom_point(size=3) + scale_shape_manual(values=1:length(unique(rGroup))) + xlab(paste0("PC1 (", round(rPcv["PC1"], digits=2), " %)")) + ylab(paste0("PC2 (", round(rPcv["PC2"], digits=2), " %)")) + theme_bw() + ggtitle("Rat") + labs(color="tissue", shape="tissue") + scale_color_manual(values = colors13) rPc1.pc2 <- arrangeGrob(rPc1.pc2, top = textGrob("b", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold"))) mPc1.pc3 <- ggplot(mPca.df,aes(x=PC1,y=PC3,color=group,shape=group)) + geom_point(size=3) + scale_shape_manual(values=1:length(unique(mGroup))) + xlab(paste0("PC1 (", round(mPcv["PC1"], digits=2), " %)")) + ylab(paste0("PC3 (", round(mPcv["PC3"], digits=2), " %)")) + theme_bw() + ggtitle("Mouse") + labs(color="tissue", shape="tissue") + scale_color_manual(values = colors13) + theme(legend.position="none") + coord_fixed(ratio = 1.5) mPc1.pc3 <- arrangeGrob(mPc1.pc3, top = textGrob("a", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold"))) rPc1.pc3 <- ggplot(rPca.df,aes(x=PC1,y=PC3,color=group,shape=group)) + geom_point(size=3) + scale_shape_manual(values=1:length(unique(rGroup))) + xlab(paste0("PC1 (", round(rPcv["PC1"], digits=2), " %)")) + ylab(paste0("PC3 (", round(rPcv["PC3"], digits=2), " %)")) + theme_bw() + ggtitle("Rat") + labs(color="tissue", shape="tissue") + scale_color_manual(values = colors13) rPc1.pc3 <- arrangeGrob(rPc1.pc3, top = textGrob("b", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold"))) mPc2.pc3 <- ggplot(mPca.df,aes(x=PC2,y=PC3,color=group,shape=group)) + geom_point(size=3) + scale_shape_manual(values=1:length(unique(mGroup))) + xlab(paste0("PC2 (", round(mPcv["PC2"], digits=2), " %)")) + ylab(paste0("PC3 (", round(mPcv["PC3"], digits=2), " %)")) + theme_bw() + ggtitle("Mouse") + labs(color="tissue", shape="tissue") + scale_color_manual(values = colors13) + theme(legend.position="none") + coord_fixed(ratio = 0.9) mPc2.pc3 <- arrangeGrob(mPc2.pc3, top = textGrob("a", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold"))) rPc2.pc3 <- ggplot(rPca.df,aes(x=PC2,y=PC3,color=group,shape=group)) + geom_point(size=3) + scale_shape_manual(values=1:length(unique(rGroup))) + xlab(paste0("PC2 (", round(rPcv["PC2"], digits=2), " %)")) + ylab(paste0("PC3 (", round(rPcv["PC3"], digits=2), " %)")) + theme_bw() + ggtitle("Rat") + labs(color="tissue", shape="tissue") + scale_color_manual(values = colors13) rPc2.pc3 <- arrangeGrob(rPc2.pc3, top = textGrob("b", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold"))) grid.arrange(mPc1.pc2, rPc1.pc2, ncol = 2, newpage = TRUE) grid.arrange(mPc1.pc3, rPc1.pc3, ncol = 2, newpage = TRUE) grid.arrange(mPc2.pc3, rPc2.pc3, ncol = 2, newpage = TRUE) ########################################## ######## intra- & inter-tissue variation ########################################## ## mouse - compute squared coefficient of variation for each gene per tissue mDf <- data.frame(ensgid=NA_character_, sCV=NA_integer_, log10mean=NA_integer_, tissue=NA_character_) for(t in 1:length(mGroup.list)){ currT <- as.character(mGroup.list)[t] currD <- mouseEnv$rpkm[which(apply(mouseEnv$rpkm,1,function(x) !any(x==0))) , which(grepl(currT, colnames(mouseEnv$rpkm)))] currStd <- apply(currD, 1, sd) currMean <- apply(currD, 1, mean) currCV <- (currStd / currMean)^2 # sort by mean rpkm idx <- sort(currMean, index.return=T)$ix currMean <- log10(currMean[idx]) currCV <- currCV[idx] currDf <- data.frame(ensgid=names(currMean), sCV=currCV, log10mean=currMean, tissue=rep(currT, length(currMean))) mDf <- rbind(mDf, currDf) } ## remove the NA line of initialization mDf <- mDf[-1,] ## rat - compute squared coefficient of variation for each gene per tissue rDf <- data.frame(ensgid=NA_character_, sCV=NA_integer_, log10mean=NA_integer_, tissue=NA_character_) for(t in 1:length(rGroup.list)){ currT <- as.character(rGroup.list)[t] currD <- ratEnv$rpkm[which(apply(ratEnv$rpkm,1,function(x) !any(x==0))) , which(grepl(currT, colnames(ratEnv$rpkm)))] currStd <- apply(currD, 1, sd) currMean <- apply(currD, 1, mean) currCV <- (currStd / currMean)^2 # sort by mean rpkm idx <- sort(currMean, index.return=T)$ix currMean <- log10(currMean[idx]) currCV <- currCV[idx] currDf <- data.frame(ensgid=names(currMean), sCV=currCV, log10mean=currMean, tissue=rep(currT, length(currMean))) rDf <- rbind(rDf, currDf) } ## remove the NA line of intialization rDf <- rDf[-1,] ## change level ordering in rDf$tissue rDf$tissue <- factor(rDf$tissue, levels = levels(mDf$tissue)) ## generate plots mCVplot <- ggplot(mDf) + geom_smooth(aes(x=log10mean,y=sCV,group=as.factor(tissue),fill=as.factor(tissue),color=as.factor(tissue))) + theme_bw(base_size = 35) + labs(fill = "tissue", color="tissue") + xlab("log10(mean_RPKM)") + ylab(bquote(CV^{2})) + ylim(c(0,0.6)) + scale_fill_manual(values = colors13) + guides(fill = guide_legend(keyheight = 3)) + ggtitle("Mouse") + scale_color_manual(values = colors13) + theme(legend.position="none") mCVplot <- arrangeGrob(mCVplot, top = textGrob("a", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold", fontsize=35))) rCVplot <- ggplot(rDf) + geom_smooth(aes(x=log10mean,y=sCV,group=as.factor(tissue),fill=as.factor(tissue),color=as.factor(tissue))) + theme_bw(base_size = 35) + labs(fill = "tissue", color="tissue") + xlab("log10(mean_RPKM)") + ylab(bquote(CV^{2})) + ylim(c(0,0.6)) + scale_fill_manual(values = colors13) + guides(fill = guide_legend(keyheight = 3)) + ggtitle("Rat") + scale_color_manual(values = colors13) rCVplot <- arrangeGrob(rCVplot, top = textGrob("b", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold", fontsize=35))) grid.arrange(mCVplot, rCVplot, ncol = 2) ########################################## ######## hierarchical clustering ########################################## ## compute hierarchical clustering mHc <- hclust(dist(t(mouseEnv$v$E))) mHc$labels <- gsub("199", "Mouse", mHc$labels) rHc <- hclust(dist(t(ratEnv$v$E))) rHc$labels <- gsub("199", "Rat", rHc$labels) ## draw dendrogramms mDendro <- ggdendrogram(mHc, rotate = TRUE) + ggtitle("Mouse") + theme_bw() + xlab("") + ylab("") + theme(axis.text=element_text(size=7.5)) mDendro <- arrangeGrob(mDendro, top = textGrob("a", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold"))) rDendro <- ggdendrogram(rHc, rotate = TRUE) + ggtitle("Rat") + theme_bw() + xlab("") + ylab("") + theme(axis.text=element_text(size=7.5)) rDendro <- arrangeGrob(rDendro, top = textGrob("b", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold"))) grid.arrange(mDendro, rDendro, ncol = 2) ########################################## ######## Orthology - filtering ########################################## ## read data dump from inhouse database containing Ensembl annotation ensemblGenes <- read.table("./data/ensembl84_genes.txt", sep="\t", header=T, quote="") ensemblHomology <- read.table("./data/ensembl84_homology.txt", sep="\t", header=T, quote="") ## convert to data.table for lookup performance ensemblGenesDT <- as.data.table(ensemblGenes) ensemblHomologyDT <- as.data.table(ensemblHomology) ## mouse - limit to protein coding genes with 1-to-1 orthology to human mouseGenes <- c() for(g in 1:nrow(mouseEnv$rpkm)){ currG <- rownames(mouseEnv$rpkm)[g] biotype <- ensemblGenesDT[which(ensemblGenesDT$ensgid==currG), ]$biotype if(length(biotype)==0){ next } else{ if(biotype=="protein_coding"){ # if gene is protein coding: check whether it is a 1-to-1 ortholog to a human gene one2one <- ensemblHomologyDT[which(ensemblHomologyDT$query_ensgid==currG & ensemblHomologyDT$target_species_name=="human"), ]$is_one2one if(length(one2one)!=0){ if(one2one==1){ mouseGenes <- c(mouseGenes, currG) } } } } } ## rat - limit to protein coding genes with 1-to-1 orthology to human ratGenes<- c() for(g in 1:nrow(ratEnv$rpkm)){ currG <- rownames(ratEnv$rpkm)[g] biotype <- ensemblGenesDT[which(ensemblGenesDT$ensgid==currG), ]$biotype if(length(biotype)==0){ next } else{ if(biotype=="protein_coding"){ # if gene is protein coding: check whether it is a 1-to-1 ortholog to a human gene one2one <- ensemblHomologyDT[which(ensemblHomologyDT$query_ensgid==currG & ensemblHomologyDT$target_species_name=="human"), ]$is_one2one if(length(one2one)!=0){ if(unique(one2one)==1){ ratGenes <- c(ratGenes, currG) } } } } } ## retrieve orthology information for the genes with 1-to-1 orthology to human ortho_mouseRat <- ensemblHomologyDT[which(ensemblHomologyDT$query_species_name=="mouse" & ensemblHomologyDT$target_species_name=="rat" & query_ensgid %in% mouseGenes), ] ortho_ratMouse <- ensemblHomologyDT[which(ensemblHomologyDT$query_species_name=="rat" & ensemblHomologyDT$target_species_name=="mouse" & query_ensgid %in% ratGenes), ] ## summarizing the orthology mapping between species coding_summary <- data.frame(species=c("mouse", "rat"), totalGenes=c(length(rownames(mouseEnv$rpkm)), length(rownames(ratEnv$rpkm))), codingOne2one=c(length(mouseGenes), length(ratGenes))) ortho_summary <- data.frame(direction=c("mouse->rat", "rat->mouse"), one2one=c(sum(ortho_mouseRat$is_one2one), sum(ortho_ratMouse$is_one2one)), one2m=c(length(unique(ortho_mouseRat[ortho_mouseRat$is_one2one==0,"query_ensgid"])), length(unique(ortho_ratMouse[ortho_ratMouse$is_one2one==0,"query_ensgid"]))), none=c(sum(!mouseGenes %in% ortho_mouseRat$query_ensgid), sum(!ratGenes %in% ortho_ratMouse$query_ensgid))) ortho_summary$total <- rowSums(ortho_summary[,-1]) ## limit data to 1-to-1 orthologues ortho_mouseRat <- ortho_mouseRat[which(ortho_mouseRat$is_one2one==1), ] ortho_ratMouse <- ortho_ratMouse[which(ortho_ratMouse$is_one2one==1), ] ## limit mouse data to pairs where the rat gene is also a 1-to-1 human orthologue ## and the other way around for rat->mouse direction ortho_mouseRat <- ortho_mouseRat[which(ortho_mouseRat$target_ensgid %in% ratGenes), ] ortho_ratMouse <- ortho_ratMouse[which(ortho_ratMouse$target_ensgid %in% mouseGenes), ] ## remove redundant pairs ortho_dfR <- data.frame(ortho_ratMouse)[, c("target_ensgid", "query_ensgid", "global_identity")] colnames(ortho_dfR) <- c("query_ensgid", "target_ensgid", "global_identity") ortho_df <- rbind(data.frame(ortho_mouseRat)[,c("query_ensgid", "target_ensgid", "global_identity")], ortho_dfR) ortho_df <- ortho_df[!duplicated(ortho_df[,1:2]), ] rm(list=c("ortho_dfR", "ortho_mouseRat", "ortho_ratMouse")) ##################################################### ######## Orthology - sequence identity & correlation ##################################################### ## compute mouse<->rat expression correlation and fetch sequence identity to human for (r in 1:nrow(ortho_df)) { # get sequence identity to human for both species and the mean of both values mHid <- ensemblHomologyDT[which(as.character(ensemblHomologyDT$query_ensgid)==ortho_df[r, "query_ensgid"] & as.character(ensemblHomologyDT$target_species_name)=="human"), ]$global_identity rHid <- ensemblHomologyDT[which(as.character(ensemblHomologyDT$query_ensgid)==ortho_df[r, "target_ensgid"] & as.character(ensemblHomologyDT$target_species_name)=="human"), ]$global_identity ortho_df[r, "mouse_id2human"] <- mHid ortho_df[r, "rat_id2human"] <- rHid ortho_df[r, "mean_id2human"] <- mean(c(mHid, rHid)) # get tissue expression for both genes and summarize replicates mExp <- mouseEnv$tpm[which(rownames(mouseEnv$tpm)==ortho_df$query_ensgid[r]), ] rExp <- ratEnv$tpm[which(rownames(ratEnv$tpm)==ortho_df$target_ensgid[r]), ] mT <- unique(sapply(names(mExp), function(x) strsplit(x, "_")[[1]][3])) mExpMed <- data.frame(expMed=rep(NA_integer_, length(mT)), row.names = mT) for (i in 1:length(mT)) { mExpMed[mT[i], ] <- median(as.numeric(mExp[which(grepl(mT[i], names(mExp)))])) } # mouse fold change - tissue specificity mExpFC <- apply(mExpMed, 1, function(x) x["expMed"]/colMeans(mExpMed)) rT <- unique(sapply(names(rExp), function(x) strsplit(x, "_")[[1]][3])) rExpMed <- data.frame(expMed=rep(NA_integer_, length(rT)), row.names = rT) for (i in 1:length(rT)) { rExpMed[rT[i], ] <- median(as.numeric(rExp[which(grepl(rT[i], names(rExp)))])) } # rat fold change - tissue specificity rExpFC <- apply(rExpMed, 1, function(x) x["expMed"]/colMeans(rExpMed)) # get common tissues and put both vectors in same order keep <- intersect(rownames(mExpMed), rownames(rExpMed)) mExpMed <- mExpMed[keep, ] rExpMed <- rExpMed[keep, ] mExpFC <- mExpFC[keep] rExpFC <- rExpFC[keep] ortho_df[r, "pearson"] <- round(cor(mExpMed, rExpMed, method="pearson"), digits=3) ortho_df[r, "spearman"] <- round(cor(mExpMed, rExpMed, method="spearman"), digits=3) ortho_df[r, "FC_dist"] <- round(dist(rbind(mExpFC, rExpFC), method="manhattan"), digits=3) } g1 <- ggplot(ortho_df, aes(x=mean_id2human, y=pearson)) + geom_point(alpha=0.1) + theme_bw(base_size = 30) + ylab("Pearson correlation coefficient") + xlab("Mean sequence identity to human") g1 <- arrangeGrob(g1, top = textGrob("a", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold", fontsize=30))) g2 <- ggplot(ortho_df, aes(x=mean_id2human, y=FC_dist)) + geom_point(alpha=0.1) + theme_bw(base_size = 30) + ylab("Fold change distance") + xlab("Mean sequence identity to human") g2 <- arrangeGrob(g2, top = textGrob("b", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(fontface="bold", fontsize=30))) grid.arrange(g1, g2, ncol = 2, newpage = TRUE) ########################################## ######## Highly conserved genes ########################################## ## get pairs with high pearson corr. coefficient, low fold change distance and high sequence identity to human ortho_res <- ortho_df[which(ortho_df$pearson>=0.9 & ortho_df$FC_dist<=2 & ortho_df$mean_id2human>=99), ] ## add human ensgid for (i in 1:nrow(ortho_res)) { hg <- ensemblHomologyDT[which(as.character(ensemblHomologyDT$query_ensgid)==ortho_res[i, "query_ensgid"] & as.character(ensemblHomologyDT$target_species_name)=="human"), ]$target_ensgid ortho_res[i, "human_gene"] <- ensemblGenesDT[which(as.character(ensemblGenesDT$ensgid)==hg), ]$name ############## } ortho_res <- cbind(ortho_res$human_gene, ortho_res[,1:9]) colnames(ortho_res)[1:3] <- c("human_gene", "mouse_ensgid", "rat_ensgid") write.table(ortho_res, file="highly_conserved_genes.txt", quote=F, row.names = FALSE, sep="\t") ########################################## ######## sessionInfo() ########################################## ## R version 3.3.2 (2016-10-31) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Scientific Linux 7.3 (Nitrogen) ## ## locale: ## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 ## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 ## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] grid stats graphics grDevices utils datasets methods ## [8] base ## ## other attached packages: ## [1] data.table_1.10.4 igraph_1.0.1 RMySQL_0.10.12 ## [4] DBI_0.7 gridExtra_2.2.1 pheatmap_1.0.8 ## [7] RColorBrewer_1.1-2 ggdendro_0.1-20 ggplot2_2.2.1 ## [10] edgeR_3.16.5 limma_3.30.13 knitr_1.16 ## ## loaded via a namespace (and not attached): ## [1] Rcpp_0.12.12 magrittr_1.5 MASS_7.3-47 munsell_0.4.3 ## [5] colorspace_1.3-2 lattice_0.20-35 rlang_0.1.1 highr_0.6 ## [9] stringr_1.2.0 plyr_1.8.4 tools_3.3.2 gtable_0.2.0 ## [13] htmltools_0.3.6 yaml_2.1.14 lazyeval_0.2.0 rprojroot_1.2 ## [17] digest_0.6.12 tibble_1.3.3 codetools_0.2-15 evaluate_0.10.1 ## [21] rmarkdown_1.6 labeling_0.3 stringi_1.1.5 scales_0.4.1 ## [25] backports_1.1.0 locfit_1.5-9.1