--- title: "2018-01-22_microarray" author: "Jonathan Lee" date: "1/22/2018" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) source("http://bioconductor.org/biocLite.R") #biocLite() library(ggplot2) library(biomaRt) library(dplyr) library(DESeq2) #install.packages('dendextend') library(dendextend) library(ggrepel) library(Vennerable) library(gridExtra) library(gProfileR) library(stringr) #biocLite("KEGGREST") library(KEGGREST) #biocLite("fgsea") library(fgsea) library(pheatmap) gg_color_hue <- function(n) { hues = seq(15, 375, length = n + 1) hcl(h = hues, l = 65, c = 100)[1:n] } substrLeft <- function(x, n){ substr(x, 1, nchar(x)-n) } ``` ## microarray data set ```{r, eval=T} source("http://bioconductor.org/biocLite.R") biocLite("affy") biocLite("affyPLM") biocLite("limma") library(affy) library(affyPLM) library(limma) ``` ```{r, eval=T} df.affy.ps <- ReadAffy(filenames = c("KC_MOE430A_2_het_polysomes.CEL", "KC_MOE430A_2_wt_polysomes.CEL")) df.affy.rna <- ReadAffy(filenames = c("KC_MOE430A_2_het_TotalRNA.CEL", "KC_MOE430A_2_wt_TotalRNA.CEL")) ``` ```{r, eval=T} MAplot(df.affy.ps, pairs=T, cex=T, plot.method="smoothScatter") MAplot(df.affy.rna, pairs=T, cex=T, plot.method="smoothScatter") ``` ```{r, eval=T} df.rma.ps <- rma(df.affy.ps) df.expr.ps <- exprs(df.rma.ps) df.rma.rna <- rma(df.affy.rna) df.expr.rna <- exprs(df.rma.rna) df.expr.rna <- df.expr.rna[,c(2,1)] colnames(df.expr.rna) <- colnames(df.expr.rna)[c(2,1)] df.expr.ps <- df.expr.ps[,c(2,1)] colnames(df.expr.ps) <- colnames(df.expr.ps)[c(2,1)] ``` ```{r, eval=T} biocLite("mouse430a2.db") library(mouse430a2.db) ``` ```{r, eval=T} # from Mouse430_2.na36.annot.csv (Affy website) # tail -n +23 *csv > probe_anno.csv anno <- read.csv(file="probe_anno.csv", header=T) # RefSeq.Transcript.ID, Probe.Set.ID anno <- anno[,c("RefSeq.Transcript.ID", "Probe.Set.ID")] ``` ```{r, eval=T} library(tidyr) library(dplyr) df.expr.ps <- merge(df.expr.ps, anno, by.x=0, by.y="Probe.Set.ID") df.expr.rna <- merge(df.expr.rna, anno, by.x=0, by.y="Probe.Set.ID") y.ps <- strsplit(as.character(df.expr.ps$RefSeq.Transcript.ID) , " /// ", fixed=TRUE) y.rna <- strsplit(as.character(df.expr.rna$RefSeq.Transcript.ID) , " /// ", fixed=TRUE) df.expr.ps <- data.frame(df.expr.ps[rep(1:nrow(df.expr.ps), sapply(y.ps, length)), 1:3], refseq= unlist(y.ps)) df.expr.rna <- data.frame(df.expr.rna[rep(1:nrow(df.expr.rna), sapply(y.rna, length)), 1:3], refseq= unlist(y.rna)) df.expr.rna <- df.expr.rna[,c(2,3,4)] %>% group_by(refseq) %>% summarise_all(mean) %>% as.data.frame() df.expr.ps <- df.expr.ps[,c(2,3,4)] %>% group_by(refseq) %>% summarise_all(mean) %>% as.data.frame() df.expr.rna <- data.frame(df.expr.rna[,-1], row.names = df.expr.rna[,1]) df.expr.ps <- data.frame(df.expr.ps[,-1], row.names = df.expr.ps[,1]) df.expr.rna$lfc <- df.expr.rna[,1] - df.expr.rna[,2] # het - wt df.expr.ps$lfc <- df.expr.ps[,1] - df.expr.ps[,2] # het - wt colnames(df.expr.rna) <- c("het", "wt", "lfc") colnames(df.expr.ps) <- c("het", "wt", "lfc") df.expr.rna$rank <- rank(df.expr.rna$lfc) df.expr.ps$rank <- rank(df.expr.ps$lfc) ``` ```{r, eval=T} refseq <- getBM(filters="refseq_mrna", attributes=c("refseq_mrna", "external_gene_name"), values=row.names(df.expr.rna), mart=mmmart) df.expr.rna <- merge(df.expr.rna, refseq, by.x=0, by.y="refseq_mrna") df.expr.ps <- merge(df.expr.ps, refseq, by.x=0, by.y="refseq_mrna") ``` ```{r, eval=T} df.expr.rvp <- data.frame(merge(df.expr.rna, df.expr.ps, by="Row.names")) df.expr.rvp$ratio <- df.expr.rvp$lfc.x - df.expr.rvp$lfc.y df.expr.rvp$rank <- rank(df.expr.rvp$ratio) df.expr.rvp <- df.expr.rvp[!duplicated(df.expr.rvp[,-1]),] ``` ```{r, eval=T} df.expr.rvp <- mutate(df.expr.rvp, grp = ifelse(df.expr.rvp$ratio > mean(df.expr.rvp[which(round(df.expr.rvp$rank) == round(max(df.expr.rvp$rank)*0.01)),]$ratio), ifelse(df.expr.rvp$ratio > mean(df.expr.rvp[which(round(df.expr.rvp$rank) == round(max(df.expr.rvp$rank)*0.99)),]$ratio), "a", "b"), "c")) rvp_dist <- ggplot(df.expr.rvp, aes(rank, ratio, color=grp)) + geom_point() + theme_classic() + geom_hline(yintercept=0) + xlab("Gene Rank") + ylab("Polysome vs. Total RNA, HET vs. WT") + scale_color_manual(values=c("darkorchid4", "gray75", "darkorange1")) + geom_hline(yintercept=mean(df.expr.rvp[which(round(df.expr.rvp$rank) == round(max(df.expr.rvp$rank)*0.01)),]$ratio), lty=3) + geom_hline(yintercept=mean(df.expr.rvp[which(round(df.expr.rvp$rank) == round(max(df.expr.rvp$rank)*0.99)),]$ratio), lty=3) + theme(legend.position = "none") rvp_dist ggsave(file="rvp_dist_lines.svg", plot=rvp_dist, width=4, height=4, units="in") ``` ```{r, eval=T} write.table(df.expr.rvp[,c(1,11,12,13)], file="polysome.v.total.txt", row.names=F, sep='\t', quote=F) write.table(df.expr.rna, file="total.rna.txt", row.names=F, sep='\t', quote=F) write.table(df.expr.ps, file="polysome.rna.txt", row.names=F, sep='\t', quote=F) ``` # GSEA analysis ```{r, eval=T} library(tidyr) library(dplyr) df.expr.rvp <- merge(df.expr.rvp, getBM(filters="refseq_mrna", attributes=c("refseq_mrna", "entrezgene"), values=df.expr.rvp$Row.names, mart=mmmart), by.x="Row.names", by.y="refseq_mrna") ``` # KEGG pathways ```{r, eval=T} path <- gsub("mmu:","",keggLink("mmu","pathway")) path <- data.frame(pathway=names(path), gene=as.character(path)) path <- split(as.character(path$gene), path$pathway) path$`path:mmu04927` <- NULL ``` ```{r, eval=T} #biocLite("reactome.db") library(reactome.db) rdb <- reactomePathways(as.character(df.expr.rvp$entrezgene)) ``` ```{r, eval=T} rvp_ranks <- setNames(df.expr.rvp$ratio, df.expr.rvp$entrezgene) ``` ```{r, eval=T} rvp_fgsea_kegg <- fgsea(path, rvp_ranks, nperm=10000, maxSize=500) rvp_fgsea_kegg <- rvp_fgsea_kegg[order(rvp_fgsea_kegg$padj),] ``` ```{r, eval=T} p <- c() p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[1:10]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[11:20]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[21:30]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[31:40]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[41:50]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[51:60]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[61:70]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[71:80]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[81:90]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[91:100]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[101:110]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[111:120]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[121:130]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[131:140]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[141:150]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[151:160]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[161:170]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[171:180]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[181:190]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[191:200]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[201:210]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[211:220]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[221:230]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[231:240]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[241:250]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[251:260]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[261:270]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[271:280]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[281:290]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[291:300]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[301:310]), "[[", "NAME")) length(p) p <- c(p, sapply(keggGet(rvp_fgsea_kegg$pathway[311:318]), "[[", "NAME")) length(p) ``` ```{r, eval=T} rvp_fgsea_kegg$name <- substrLeft(p,23) ``` ```{r, eval=T, fig.height=5, fig.width=6} rvp_fgsea_kegg$name <- factor(rvp_fgsea_kegg$name, levels=rev(unique(rvp_fgsea_kegg$name[order(rvp_fgsea_kegg$NES)]))) rvp_ip <- c("NF-kappa B signaling pathway", "Linoleic acid metabolism", "Arachidonic acid metabolism", "Chemokine signaling pathway", "cAMP signaling pathway", "Cytosolic DNA-sensing pathway", "Phosphatidylinositol signaling system", "PI3K-Akt signaling pathway", "cGMP-PKG signaling pathway", "Jak-STAT signaling pathway", "RNA transport", "mRNA surveillance pathway", "Purine metabolism", "Propanoate metabolism", "Ubiquitin mediated proteolysis", "Arginine and proline metabolism", "p53 signaling pathway", "Amino sugar and nucleotide sugar metabolism", "Mismatch repair", "Cellular senescence", "DNA replication", "Pyrimidine metabolism", "Nucleotide excision repair") rvp_kegg <- ggplot(subset(rvp_fgsea_kegg, pval < 0.05 & name %in% rvp_ip), aes(y = NES, x = name)) + geom_segment(yend=0, aes(xend=name), lty=3) + theme_classic() + ylab("Normalized Enrichment Score") + geom_point(aes(size=size, color=(pval))) + xlab("Significant KEGG Pathways") + coord_flip() + theme(legend.position = "right", axis.text.y = element_text(vjust=0.5)) + guides(size=guide_legend(title="Gene Set Size")) + #scale_color_continuous(low="blue", high="red", name=expression("-Log"[10]*" p-value")) + scale_color_continuous(low="red", high="blue", name="p-value", trans = "log", breaks=c(0.05, 0.005, 0.0005, 0.00005)) + ylim(c(-4,4)) + geom_hline(yintercept=0, lwd=0.1) #+ #annotate("text", y=ceiling(max(tive_fgsea_kegg$NES)), x=1:25, label=rev(tive_fgsea_kegg$leadingEdgeProp), size=3) rvp_kegg ggsave(file="rvp_kegg.svg", plot=rvp_kegg, width=6, height=5, units="in") ```