#statistical modeling
library(mclust)
library(oncomix)
library(glmnet)
library(limma)
library(survival)
library(AUC)
#bioinformatics packages
library(TCGAbiolinks)
library(AIMS)
library(GSA)
library(GenomicRanges)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(FDb.InfiniumMethylation.hg19)
library(rtracklayer)
library(GenomicDataCommons)
library(biomaRt)
#data wrangling, visualization and file organlization
library(here)
library(XML)
library(SummarizedExperiment)
library(ggrepel)
library(RColorBrewer)
library(ggdendro)
library(readxl)
library(tidyverse)
library(janitor)
#set seed for analysis
set.seed(100)
GDCtools
from the TCGAbiolinks package.if(file.exists(here("processed_data", "tcga-exp-brca.rds"))){
e <- readRDS(file = here("processed_data", "tcga-exp-brca.rds"))
} else {
eq <- GDCquery(project = "TCGA-BRCA",
data.category = c("Transcriptome Profiling"),
data.type = c("Gene Expression Quantification"),
workflow.type = "HTSeq - FPKM")
GDCdownload(eq, directory = here("raw_data"))
e <- GDCprepare(eq,directory = here("raw_data"), save=F)
#saveRDS(object = e, file = here("processed_data", "tcga-exp-brca.rds"))
}
#remove FFPE samples
dim(e)
## [1] 56716 1222
e <- e[,!e$is_ffpe]
dim(e)
## [1] 56716 1206
#add technical variables
colData(e)$hosp <- substr(rownames(colData(e)), 6,7)
colData(e)$seqPlate <- substr(rownames(colData(e)), 22,25)
###remove all metaData columns that are NA
colData(e) <- colData(e)[,colSums(is.na(colData(e)))<nrow(colData(e))]
##TPM normalization, then log2(tpm+1) transform
eu <- assay(e)
eu <- scale(eu, center=FALSE, scale=colSums(eu)) * 1000000
eu <- log(eu+1, 2)
assay(e) <- eu
rm(eu)
#only keep the matched normals (11's)...
en <- e[,substr(colnames(e),14,15) == "11"]
#...and their associated cancer samples (01's).
et <- e[,substr(colnames(e),14,15) == "01"]
et <- et[,!duplicated(substr(colnames(et),1,12))] #remove duplicates
sharedId <- intersect(substr(colnames(en), 1,12), substr(colnames(et), 1,12))
en <- en[,en$patient %in% sharedId]
et <- et[,et$patient %in% sharedId]
print(dim(et)); print(dim(en))
## [1] 56716 111
## [1] 56716 111
#remove the male patient
et <- et[,colData(et)$gender == "female"]
en <- en[,colData(en)$gender == "female"]
print(dim(et)); print(dim(en))
## [1] 56716 110
## [1] 56716 110
#order the et and en matrix columns so that they are the same
et <- et[,order(et$patient)]
en <- en[,order(en$patient)]
#select only protein-coding genes
ensg_to_prot <- read.table(file=here("raw_data","ensembl",
"ensg_protein_class3.txt"),
sep= "\t", header=TRUE, stringsAsFactors = FALSE)
ensg_to_prot <- ensg_to_prot[ensg_to_prot$Gene.type == "protein_coding",]
#must have an hgnc_symbol
ensg_to_prot <- ensg_to_prot[ensg_to_prot$NCBI.gene.ID != "",]
#drop transcripts that don't map to protein-coding ensembl gene Ids
et <- et[rownames(et) %in% ensg_to_prot$Gene.stable.ID ,]
en <- en[rownames(en) %in% ensg_to_prot$Gene.stable.ID ,]
print(dim(et)); print(dim(en))
## [1] 19090 110
## [1] 19090 110
#Filter out genes where <20% of patients have a non zero expression value
et <- et[rowSums(assay(et)==0)<=ncol(et)*.20,]
en <- en[rowSums(assay(en)==0)<=ncol(en)*.20,]
print(dim(et)); print(dim(en))
## [1] 16296 110
## [1] 16599 110
#get the isoforms in common between the two
genes_in_common <- intersect(rownames(et), rownames(en))
et <- et[genes_in_common,]
en <- en[genes_in_common,]
print(dim(et)); print(dim(en))
## [1] 16156 110
## [1] 16156 110
pca <- prcomp(x = t(cbind(assay(et), assay(en))), scale. = T, center = T)
#proportion of variance explained by first 2 pc
propVarExpl <- pca$sdev^2 / sum(pca$sdev^2)
barplot(propVarExpl[1:20], names.arg=1:20, xlab="Principal Comp",
ylab="Prop. Variance Explained")
## Color by sequencing plate. Expect to see a rainbow
## (no clustering by sequencing plate) if there is no major batch effect
plot(pca$x[,1:2], col = as.factor(c(colData(et)$seqPlate, colData(en)$seqPlate)))
#color by hospital of procurement
plot(pca$x[,1:2], col = as.factor(c(colData(et)$hosp, colData(en)$hosp)))
## color by whether the sample is tumor or normal.
## This is where most of the variation is.
plot(pca$x[,1:2], col = as.factor(c(rep("tum", ncol(et)),
rep("nml", ncol(en)))))
Note that the main separation is between tumor and adjacent normal samples, mainly along PC1. We have no visually apparent clusters of samples by tissue source site or sequencing plate. This is a good thing!
if(file.exists(here("processed_data", "mmp.rds"))){
mmp <- readRDS(file = here("processed_data", "mmp.rds"))
} else {
mmp <- mixModelParams(exprNml = assay(en), exprTum = assay(et))
saveRDS(mmp, file = here("processed_data", "mmp.rds"))
}
#select genes that have at least 20% of patients in each group and with an SI> 0.99
gene_filt_pi <- rownames(mmp)[mmp$tPi1 < 0.8 & mmp$tPi1 > 0.2 &
mmp$nPi1 < 0.8 & mmp$nPi1 > 0.2]
gene_filt_pi_tum <- rownames(mmp)[mmp$tPi1 < 0.8 & mmp$tPi1 > 0.2]
gene_filt_pi_nml <- rownames(mmp)[mmp$nPi1 < 0.8 & mmp$nPi1 > 0.2]
length(gene_filt_pi_tum); length(gene_filt_pi_nml)
## [1] 8754
## [1] 7011
gene_filt_pi_si <- rownames(mmp)[mmp$tPi1 < 0.8 & mmp$tPi1 > 0.2 &
mmp$nPi1 < 0.8 & mmp$nPi1 > 0.2 & mmp$SI > 0.99]
source(here("scripts", "2017_12_suppl_scripts.R"))
top5genes <- head(gene_filt_pi_si, 5)
mmp2 <- mmp[gene_filt_pi,]
idx2replace <- sapply(top5genes, grep, rownames(mmp2))
geneIdsTop5 <- sapply(top5genes, function(x) ensg_to_prot$HGNC.symbol[grep(x, ensg_to_prot$Gene.stable.ID)])
rownames(mmp2)[idx2replace] <- geneIdsTop5
#show dimensions of the filtered dataset
dim(mmp2)
## [1] 3823 12
dim(mmp2 %>% filter(SI>0.99))
## Warning: package 'bindrcpp' was built under R version 3.4.4
## [1] 154 12
dim(mmp2 %>% filter(SI<0.99))
## [1] 3669 12
#show the statistics for the top5 genes
mmp2$geneNames <- rownames(mmp2)
mmp2 %>% dplyr::filter(geneNames %in% geneIdsTop5)
## nMu1 nMu2 nVar nPi1 tMu1 tMu2 tVar
## 1 0.16984400 0.2011481 0.07545295 0.5457794 1.3920623 4.433789 1.0926988
## 2 2.01932909 2.4169221 0.45999871 0.6140423 2.1950228 6.348791 1.6388138
## 3 1.88282562 2.3464588 0.16567905 0.6763193 2.3873075 5.054060 0.5561141
## 4 0.09438852 0.1249559 0.02950956 0.5685474 0.8106417 2.589987 0.3146127
## 5 1.52736530 1.8303487 0.25922094 0.6286604 1.6849682 4.181181 0.6391453
## tPi1 deltaMu2 deltaMu1 SI score geneNames
## 1 0.6467758 4.232641 1.2222183 1.0000000 1.842271 EPYC
## 2 0.6352695 3.931869 0.1756937 0.9909091 1.642296 NELL2
## 3 0.5785395 2.707601 0.5044819 1.0000000 1.481326 CBX2
## 4 0.4966407 2.465031 0.7162532 1.0000000 1.404656 SLC24A2
## 5 0.7757700 2.350832 0.1576029 0.9909091 1.283091 LAG3
#plot a histogram of the known scores
reds = brewer.pal(n = 9, name = "Reds")
alph = 0.8
#show the distribution of selectivity indices across genes
ggplot(mmp2, aes(x=SI)) +
geom_histogram(binwidth = 0.02, color="grey", alpha=alph, aes(fill=SI>0.99)) +
theme_classic() + scale_fill_manual(values=reds[c(3,7)]) + scale_y_continuous(expand = c(0, 0)) + xlab("Selectivity Index (SI)") + ylab("Frequency") + theme(text=element_text(size=20, color="black"), legend.position="none")
#show the distribution of oncomix scores across genes
ggplot(mmp2, aes(x=score, y=..density.., fill=SI > 0.99)) +
geom_histogram(data=subset(mmp2, SI<0.99),
aes(x=score, y=..density.., fill=SI > 0.99),
binwidth = 0.1, color="grey", alpha=alph) +
geom_histogram(data=subset(mmp2, SI>0.99),
aes(x=score, y=..density.., fill=SI > 0.99),
binwidth = 0.1, color="grey", alpha=alph) +
theme_classic() +
scale_fill_manual(values=reds[c(3,7)]) +
scale_y_continuous(expand = c(0, 0)) +
theme(text=element_text(size=20, color="black"), legend.position="none") +
ylab("Density") + xlab("Oncomix Score")
selIdx_thresh
. We can also show the genes both before and after the selectivity index (ie low SI vs high SI).gglayers = list(
theme(axis.text.x = element_text(size=30),
plot.title=element_text(size=30)),
xlim(-0.2,10), xlab(""), ylab("")
)
for(i in 1:length(top5genes)){
hgnc_gene_symb <- ensg_to_prot %>%
dplyr::filter(Gene.stable.ID == top5genes[i]) %>%
dplyr::select(HGNC.symbol) %>%
as.character()
gglayers[["title"]] <- ggtitle(hgnc_gene_symb)
plotGeneHistUD(mmp, assay(en), assay(et), top5genes[i], linesz=1.5, gglayers)
ggsave(filename = here("output", paste0(Sys.Date(),"_",top5genes[i], "_histog.png")),
width = 7, height=5, units = "in")
}
ensg_to_prot_filt <- ensg_to_prot[ensg_to_prot$Gene.stable.ID %in% rowData(et)$ensembl_gene_id,]
while(length(ensg_to_prot_filt$Gene.stable.ID) != length(unique(ensg_to_prot_filt$Gene.stable.ID))){
ensg_to_prot_filt <- ensg_to_prot_filt[!duplicated(ensg_to_prot_filt$Gene.stable.ID),]
}
rowData(et) = dplyr::left_join(data.frame(rowData(et)), ensg_to_prot_filt, by = c("ensembl_gene_id" = "Gene.stable.ID"))
aims_subtypes <- applyAIMS(assay(et),rowData(et)$NCBI.gene.ID)
## Current k = 20
colData(et)$aims_subtypes <- aims_subtypes$cl[,1]
table(colData(et)$aims_subtypes)
##
## Basal Her2 LumA LumB Normal
## 17 25 36 25 7
table(colData(et)$tumor_stage)
##
## not reported stage i stage ia stage iia stage iib
## 1 16 4 34 29
## stage iiia stage iiib stage iiic stage iv
## 16 4 4 2
table(colData(et)$vital_status)
##
## alive dead
## 66 44
table(colData(et)$race)
##
## asian black or african american
## 1 6
## not reported white
## 1 102
summary(colData(et)$days_to_birth/-365.25)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 30.67 45.77 56.93 57.80 66.43 90.00 1
sd(colData(et)$days_to_birth/-365.25, na.rm = TRUE)
## [1] 14.53671
colData(et) <- cbind(colData(et), as.data.frame(classifyTumors(mmp, top5genes, et)))
hallmarkKeggReactome <- readHmKgRctmepathways(gsea_path = here("raw_data", "genesets"))
B <- nrow(et) - 1
qval_diffexp <- 1e-04
log2FC_diffexp <- 1
OR_pathway <- 20 #this is the Odds Ratio for pathway analysis
qval_pathway <- 1e-20
# save output of differentially expressed genes
allDeGenes.l <- vector("list", length(top5genes))
upregDeGenes.l <- vector("list", length(top5genes))
downregDeGenes.l <- vector("list", length(top5genes))
upregDePaths.l <- vector("list", length(top5genes))
downregDePaths.l <- vector("list", length(top5genes))
for (i in 1:length(top5genes)) {
classif <- paste0("cl_", top5genes[i])
vars <- c(classif, "seqPlate", "hosp")
colData_e_stdze <- colData(et)[, vars]
dsgn <- model.matrix(~colData(et)[, classif] + colData(et)[, "seqPlate"] + colData(et)[,
"hosp"])
allDeGenes <- returnDeGenesLimma(assay(et)[rownames(assay(et)) != top5genes[i],
], dsgn, colOfInt = 2, retAll = T)
allDeGenes$ensgId <- rownames(allDeGenes)
allDeGenes.l[[i]] <- allDeGenes
upregDeGenes <- allDeGenes[allDeGenes$logFC > log2FC_diffexp & allDeGenes$adj.P.Val <
qval_diffexp, ] #473 downregulated
upregDeGenes.l[[i]] <- upregDeGenes
downregDeGenes <- allDeGenes[allDeGenes$logFC < -log2FC_diffexp & allDeGenes$adj.P.Val <
qval_diffexp, ] #118 upregulated
downregDeGenes.l[[i]] <- downregDeGenes
# overrepresentation analysis
if (nrow(upregDeGenes) > 1) {
upregDeGenesEntrez <- unique(rowData(et)[rowData(et)$ensembl_gene_id %in%
rownames(upregDeGenes), "NCBI.gene.ID"])
upregDeGenesEntrez_gse <- gseWrapper(B, upregDeGenesEntrez, hallmarkKeggReactome$genesets,
hallmarkKeggReactome$geneset.names, qval = qval_pathway, OR = OR_pathway)
upregDePaths.l[[i]] <- upregDeGenesEntrez_gse
createForestDF(upregDeGenesEntrez_gse, paste0("Overrepresentation Analysis for Upregulated genes in \nTriple Positive tumors (OR > ",
OR_pathway, " and q < ", qval_pathway, ")\n", top5genes[i]))
ggsave(filename = here("output", paste0(top5genes[i], "_up.png")), width = 12,
height = 7, units = "in")
}
if (nrow(downregDeGenes) > 1) {
downregDeGenesEntrez <- unique(rowData(et)[rowData(et)$ensembl_gene_id %in%
rownames(downregDeGenes), "NCBI.gene.ID"])
downregDeGenesEntrez_gse <- gseWrapper(B, downregDeGenesEntrez, hallmarkKeggReactome$genesets,
hallmarkKeggReactome$geneset.names, qval = qval_pathway, OR = OR_pathway)
downregDePaths.l[[i]] <- downregDeGenesEntrez_gse
createForestDF(downregDeGenesEntrez_gse, paste0("Overrepresentation Analysis for Downregulated genes in \nTriple Positive tumors (OR > ",
OR_pathway, " and q < ", qval_pathway, ")\n", top5genes[i]))
ggsave(filename = here("output", paste0(top5genes[i], "_down.png")), width = 12,
height = 7, units = "in")
}
}
## Coefficients not estimable: colData(et)[, "hosp"]GI
## Coefficients not estimable: colData(et)[, "hosp"]GI
## Coefficients not estimable: colData(et)[, "hosp"]GI
## Coefficients not estimable: colData(et)[, "hosp"]GI
## Coefficients not estimable: colData(et)[, "hosp"]GI
# check to see which genes had differentialy expressed pathways
names(allDeGenes.l) <- top5genes
names(upregDePaths.l) <- top5genes
names(downregDePaths.l) <- top5genes
dim_upreg <- lapply(upregDePaths.l, nrow)
dim_downreg <- lapply(downregDePaths.l, nrow)
dim_upreg <- unlist(lapply(dim_upreg, function(x) ifelse(is.null(x), 0, x)))
dim_downreg <- unlist(lapply(dim_downreg, function(x) ifelse(is.null(x), 0, x)))
# which are non-zero?
genesWithDePathsUpreg <- names(dim_upreg)[dim_upreg != 0]
genesWithDePathsDownreg <- names(dim_downreg)[dim_downreg != 0]
genesWithDePathsUpreg
## [1] "ENSG00000173894" "ENSG00000155886"
genesWithDePathsDownreg
## character(0)
9b. Make a summary table of the pathway analysis from the 5 genes.
numUpGenes <- unlist(lapply(upregDeGenes.l, nrow))
numDownGenes <- unlist(lapply(downregDeGenes.l, nrow))
de_pathway_summ <- cbind(upregulatedGenes = numUpGenes, downregulatedGenes = numDownGenes,
numbPathwaysUp = dim_upreg, numbPathwaysDown = dim_downreg, pathwaySummary = c("-",
"-", "cell cycle", "matrix", "immune")) %>%
as.data.frame() %>%
rownames_to_column("ensg_id") %>%
left_join(ensg_to_prot, by = c(ensg_id = "Gene.stable.ID")) %>% dplyr::select(HGNC.symbol,
upregulatedGenes, downregulatedGenes)
knitr::kable(de_pathway_summ)
HGNC.symbol | upregulatedGenes | downregulatedGenes |
---|---|---|
EPYC | 4 | 0 |
NELL2 | 0 | 0 |
CBX2 | 73 | 17 |
SLC24A2 | 241 | 1 |
LAG3 | 105 | 2 |
Print the pathways that are enriched
orVars = c("estimate.odds ratio", "conf.int1", "conf.int2")
upregDePaths.l.df <- do.call(rbind, upregDePaths.l) %>%
rownames_to_column("ensg") %>%
separate(ensg, into = c("gene", "toRm"),fill="right") %>%
left_join(ensg_to_prot_filt, by = c("gene" = "Gene.stable.ID")) %>%
dplyr::select(HGNC.symbol, geneset.names, bhPval, `estimate.odds ratio`, conf.int1, conf.int2) %>%
as_tibble() %>%
mutate_at(vars(orVars), as.numeric) %>%
mutate_at(vars(orVars), round) %>%
mutate(bhPval = sprintf("%3.1e", bhPval)) %>%
mutate("95% CI" = paste0(conf.int1, "-", conf.int2)) %>%
mutate(geneset.names =gsub("_"," ",str_to_lower(geneset.names))) %>%
dplyr::select(-conf.int1, -conf.int2) %>%
dplyr::rename("Oncogene Candidate"= HGNC.symbol, "Geneset"=geneset.names,"q value" = bhPval, "Odds Ratio" = `estimate.odds ratio`)
knitr::kable(upregDePaths.l.df)
Oncogene Candidate | Geneset | q value | Odds Ratio | 95% CI |
---|---|---|---|---|
CBX2 | hallmark g2m checkpoint | 2.2e-30 | 54 | 31-91 |
CBX2 | hallmark e2f targets | 1.3e-25 | 44 | 25-75 |
SLC24A2 | hallmark epithelial mesenchymal transition | 1.3e-59 | 37 | 26-53 |
#display the genes from the top pathway
#ENSG00000173894 = CBX2
topGeneSet<- upregDePaths.l[["ENSG00000173894"]][1,"geneset.names"]
diffExprDf = allDeGenes.l[["ENSG00000173894"]]
rownames(ensg_to_prot_filt) <- ensg_to_prot_filt$Gene.stable.ID
entrezIdsGeneSet = hallmarkKeggReactome[[1]][[topGeneSet]]
plot.volcano.gsea(hallmarkKeggReactome, diffExprDf, topGeneSet, log2FC_diffexp = log2FC_diffexp, qval_diffexp = qval_diffexp, ensg_map = ensg_to_prot_filt, brew.color.set1 = 3, propSubset = 0.4)
ggsave(filename = here("output", paste0(topGeneSet, "_volcano.png")), width = 10, height=7, units = "in")
Map ensg to chromosome, gene symbol, description using biomart. Generate a table of the differentially expressed genes
ensg_to_prot_filt2 <- ensg_to_prot_filt %>% rename(ensgId = "Gene.stable.ID")
diffExprDf2 <- diffExprDf %>%
filter(logFC > log2FC_diffexp, adj.P.Val < qval_diffexp) %>%
left_join(ensg_to_prot_filt2, by="ensgId") %>%
mutate(inTopGeneSet = NCBI.gene.ID %in%
hallmarkKeggReactome[[1]][[topGeneSet]]) %>%
filter(inTopGeneSet)
#if(file.exists(here("processed_data", "ensg_de_tbl.rds"))){
# ensg_de_tbl <- readRDS(here("processed_data", "ensg_de_tbl.rds"))
#} else {
ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")
ensg_de_tbl <- getBM(mart=ensembl,
filters=c("ensembl_gene_id"),
attributes=c("ensembl_gene_id", "entrezgene",
"description", "chromosome_name","band"),
values = diffExprDf2$ensgId)
ensg_de_tbl <- rename(ensg_de_tbl, ensgId = "ensembl_gene_id")
saveRDS(ensg_de_tbl, here("processed_data", "ensg_de_tbl.rds"))
#}
diffExprDf3 <- left_join(diffExprDf2, ensg_de_tbl, by="ensgId") %>%
select(HGNC.symbol, description, chromosome_name, band, logFC, adj.P.Val) %>%
mutate(Chromosome=paste0(chromosome_name,band)) %>%
arrange(chromosome_name, band) %>%
select(-chromosome_name, -band) %>%
mutate(logFC = round(logFC, 2)) %>%
mutate(description = gsub("\\[[^\\]]*\\]", "", description, perl=TRUE)) %>%
rename("log2(Fold Change)" = "logFC", "q value" = "adj.P.Val", "HGNC symbol" = "HGNC.symbol", "Description" = "description")
knitr::kable(diffExprDf3)
HGNC symbol | Description | log2(Fold Change) | q value | Chromosome |
---|---|---|---|---|
KIF2C | kinesin family member 2C | 1.55 | 1.30e-06 | 1p34.1 |
RAD54L | RAD54 like | 1.26 | 5.80e-06 | 1p34.1 |
CDC20 | cell division cycle 20 | 1.63 | 9.30e-06 | 1p34.2 |
E2F2 | E2F transcription factor 2 | 1.14 | 9.14e-05 | 1p36.12 |
EXO1 | exonuclease 1 | 1.30 | 6.97e-05 | 1q43 |
CENPA | centromere protein A | 1.59 | 7.00e-07 | 2p23.3 |
BUB1 | BUB1 mitotic checkpoint serine/threonine kinase | 1.35 | 6.30e-06 | 2q13 |
CENPE | centromere protein E | 1.09 | 6.48e-05 | 4q24 |
CCNA2 | cyclin A2 | 1.29 | 5.55e-05 | 4q27 |
MAD2L1 | mitotic arrest deficient 2 like 1 | 1.00 | 8.91e-05 | 4q27 |
TTK | TTK protein kinase | 1.29 | 1.06e-05 | 6q14.1 |
EZH2 | enhancer of zeste 2 polycomb repressive complex 2 subunit | 1.01 | 1.26e-05 | 7q36.1 |
CDK1 | cyclin dependent kinase 1 | 1.17 | 7.80e-05 | 10q21.2 |
TROAP | trophinin associated protein | 1.35 | 1.07e-05 | 12q13.12 |
ESPL1 | extra spindle pole bodies like 1, separase | 1.17 | 3.66e-05 | 12q13.13 |
PLK1 | polo like kinase 1 | 1.42 | 1.37e-05 | 16p12.2 |
ORC6 | origin recognition complex subunit 6 | 1.08 | 2.86e-05 | 16q11.2 |
SLC7A5 | solute carrier family 7 member 5 | 1.63 | 6.07e-05 | 16q24.2 |
BIRC5 | baculoviral IAP repeat containing 5 | 1.65 | 1.30e-06 | 17q25.3 |
NDC80 | NDC80, kinetochore complex component | 1.18 | 5.77e-05 | 18p11.32 |
CDC25B | cell division cycle 25B | 1.14 | 2.86e-05 | 20p13 |
TPX2 | TPX2, microtubule nucleation factor | 1.44 | 1.14e-05 | 20q11.21 |
E2F1 | E2F transcription factor 1 | 1.27 | 3.27e-05 | 20q11.22 |
MYBL2 | MYB proto-oncogene like 2 | 2.06 | 1.30e-06 | 20q13.12 |
UBE2C | ubiquitin conjugating enzyme E2 C | 1.64 | 1.58e-05 | 20q13.12 |
AURKA | aurora kinase A | 1.42 | 3.10e-06 | 20q13.2 |
CDC45 | cell division cycle 45 | 1.25 | 4.00e-05 | 22q11.21 |
## identify all available methylation data for tumors with matched normal.
## do this with the GenomicDataCommons package (create a manifest), then
## use TCGAbiolinks to download and read in the data
fieldsToReturn <- c("file_id", "file_name", "associated_entities.entity_submitter_id", "analysis.workflow_link", "analysis.workflow_type", "analysis.workflow_version")
if(!file.exists(here("processed_data", "tcga-methyl-brca.rds"))){
ge_manifest = files() %>%
GenomicDataCommons::select(fieldsToReturn) %>%
GenomicDataCommons::filter( ~ cases.project.project_id == 'TCGA-BRCA' &
type == 'methylation_beta_value' &
platform == "Illumina Human Methylation 450" &
cases.samples.sample_type == "Primary Tumor" &
cases.submitter_id == substr(colnames(et), 1,12)) %>%
results_all() %>% data.frame
#download the files
gdcClientPath <- "C:/Users/Daniel/software/gdc-client_v1.2.0_Windows_x64/gdc-client.exe"
system(paste0(gdcClientPath, " download ", paste(ge_manifest$file_id,collapse = " ")))
## tumors with adjacent normal tissue
mq <- GDCquery(project = "TCGA-BRCA",
data.category = "DNA Methylation",
platform="Illumina Human Methylation 450", #,
barcode = ge_manifest$associated_entities.entity_submitter_id)
GDCdownload(mq, directory = here("raw_data"), files.per.chunk = 3)
m <- GDCprepare(mq,directory = here("raw_data"), save=F)
#saveRDS(object = m, file = here("processed_data", "tcga-methyl-brca.rds"))
} else {
m <- readRDS(file = here("processed_data", "tcga-methyl-brca.rds"))
}
m <- m[substr(rownames(assay(m)),1,2) == "cg",]
if(file.exists(here("processed_data", "cnv_cbx2.rds"))){
cnv2 <- readRDS(here("processed_data", "cnv_cbx2.rds"))
} else {
cnv_manifest_map = files() %>%
GenomicDataCommons::select(fieldsToReturn) %>%
GenomicDataCommons::filter(~data_type == 'Copy Number Segment' &
cases.project.project_id == 'TCGA-BRCA' &
cases.submitter_id == substr(colnames(et), 1,12) &
type == 'copy_number_segment' &
cases.samples.sample_type == "Primary Tumor") %>%
results_all() #108 samples avail
cnv_manifest_map.df <- data.frame(cnv_manifest_map)
#download the cnv data
fnamescnv = lapply(cnv_manifest_map$id,gdcdata,
destination_dir=
here("raw_data", "TCGA-BRCA", "harmonized", "cnv"),
overwrite=TRUE,
progress=FALSE)
cnvList = lapply(fnamescnv, read.table, header = TRUE, sep = "\t",
stringsAsFactors=F)
cnv <- do.call(rbind, cnvList)
cnv2 <- cnv_manifest_map.df %>%
tbl_df() %>%
separate(col = file_name,into = c("Sample", "toRm"),
sep = ".grch38.seg.txt") %>%
dplyr::select(-toRm) %>%
right_join(cnv, by="Sample")
cnv2 <- GRanges(seqnames=paste0("chr",cnv2$Chromosome),
ranges = IRanges(start = cnv2$Start, end=cnv2$End),
tcga_id=cnv2$associated_entities.entity_submitter_id,
seg_mean=cnv2$Segment_Mean,
num_probes=cnv2$Num_Probes)
#saveRDS(cnv2, file= here("processed_data", "cnv_cbx2.rds"))
}
#cnv2 has the tcga ids and the mean segment copy number all in one
Now we have our CNV and methylation data. Now, we need the tumor purity data (from the biospecimen supplement) and retrospective collection data (from the clinical data).
#downloaded the biospecimen_supplement - contains the % tumor cells
if(file.exists(here("processed_data", "tum_cells_df.rds"))){
tum_cells.df <- readRDS(here("processed_data", "tum_cells_df.rds"))
} else {
biosp_manifest_map = files() %>%
GenomicDataCommons::filter(~ cases.project.project_id == 'TCGA-BRCA' &
type == 'biospecimen_supplement' &
cases.submitter_id == substr(colnames(et), 1, 12)) %>%
results_all()
fnamesbiosp = lapply(biosp_manifest_map$id,gdcdata,
destination_dir=
here("raw_data", "TCGA-BRCA", "harmonized", "biosp"),
overwrite=TRUE,
progress=FALSE)
biospFiles <- list.files(here("raw_data", "TCGA-BRCA", "harmonized", "biosp"))
tum_cells <- c()
for(i in 1:length(biospFiles)){
xmlp = xmlParse(here("raw_data", "TCGA-BRCA", "harmonized", "biosp", biospFiles[i]))
xmlr = xmlRoot(xmlp)
tum_c <- xmlr[["patient"]][["samples"]][["sample"]][["portions"]][["portion"]][["slides"]][["slide"]][["percent_tumor_cells"]][["text"]]
tum_cells <- c(tum_cells, xmlValue(tum_c))
}
names(tum_cells) <- substr(biospFiles, 37, 48)
tum_cells.df <- data.frame(tum_cells, stringsAsFactors = F)
rownames(tum_cells.df) <- names(tum_cells)
#saveRDS(tum_cells.df, file=here("processed_data", "tum_cells_df.rds"))
}
colData(et)$percent_tumor_cells <- as.numeric(tum_cells.df[colData(et)$patient,])
summary(colData(et)$percent_tumor_cells)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.00 65.25 75.00 73.28 85.00 100.00
sd(colData(et)$percent_tumor_cells)
## [1] 18.96793
#download the clinical_supplement - contains retrospective collection information
if(all(file.exists(here("processed_data", "retro_coll_df.rds"),
here("processed_data", "histo_status_df.rds"),
here("processed_data", "mets_df.rds")))){
retro_coll.df <- readRDS(here("processed_data", "retro_coll_df.rds"))
histo_status.df <- readRDS(here("processed_data", "histo_status_df.rds"))
mets_df <- readRDS(here("processed_data", "mets_df.rds"))
} else {
clin_manifest_map = files() %>%
GenomicDataCommons::filter(~ cases.project.project_id == 'TCGA-BRCA' &
type == 'clinical_supplement' &
cases.submitter_id == substr(colnames(et), 1, 12)) %>%
results_all()
fnamesclin = lapply(clin_manifest_map$id,gdcdata,
destination_dir=
here("raw_data", "TCGA-BRCA", "harmonized", "clin"),
overwrite=TRUE,
progress=FALSE)
clinFiles <- list.files(here("raw_data", "TCGA-BRCA", "harmonized", "clin"))
retro_coll <- c()
histo_status <- c()
node_count <- c()
mets_ihc_status <- c()
mets_he_status <- c()
for(i in 1:length(clinFiles)){
xmlp = xmlParse(here("raw_data", "TCGA-BRCA", "harmonized", "clin", clinFiles[i]))
xmlr = xmlRoot(xmlp)
retro <- xmlr[["patient"]][["tissue_retrospective_collection_indicator"]][["text"]]
retro_coll <- c(retro_coll, xmlValue(retro))
histo <- xmlr[["patient"]][["histological_type"]][["text"]]
histo_status <- c(histo_status, xmlValue(histo))
node <- xmlr[["patient"]][["lymph_node_examined_count"]]
node_count <- c(node_count, xmlValue(node))
mets_ihc <- xmlr[["patient"]][["number_of_lymphnodes_positive_by_ihc"]]
mets_ihc_status <- c(mets_ihc_status, xmlValue(mets_ihc))
mets_he <- xmlr[["patient"]][["number_of_lymphnodes_positive_by_he"]]
mets_he_status <- c(mets_he_status, xmlValue(mets_he))
}
mets_df <- tibble(node_count, mets_ihc_status, mets_he_status) %>%
mutate_all(as.numeric) %>%
mutate_at(vars(starts_with("mets_")), funs(ifelse(is.na(.),0,.))) %>%
mutate(final = ifelse(is.na(node_count), NA, 0)) %>%
mutate(final = ifelse(mets_ihc_status > 0 | mets_he_status > 0, 1, final)) %>%
data.frame()
rownames(mets_df) = substr(clinFiles, 34, 45)
names(retro_coll) <- substr(clinFiles, 34, 45)
names(histo_status) <- substr(clinFiles, 34, 45)
retro_coll.df <- data.frame(retro_coll, stringsAsFactors = FALSE)
histo_status.df <- data.frame(histo_status, stringsAsFactors = FALSE)
#saveRDS(retro_coll.df, here("processed_data", "retro_coll_df.rds"))
#saveRDS(histo_status.df, here("processed_data", "histo_status_df.rds"))
#saveRDS(mets_df, here("processed_data", "mets_df.rds"))
}
colData(et)$retro_coll_yes1 <- ifelse(retro_coll.df[colData(et)$patient,"retro_coll"] ==
"YES", 1, 0)
table(colData(et)$retro_coll_yes1)
##
## 0 1
## 22 88
colData(et)$histology <- histo_status.df[colData(et)$patient,]
table(colData(et)$histology)
##
## Infiltrating Ductal Carcinoma Infiltrating Lobular Carcinoma
## 89 6
## Medullary Carcinoma Mixed Histology (please specify)
## 2 10
## Mucinous Carcinoma Other, specify
## 1 2
colData(et)$nodal_mets <- mets_df[colData(et)$patient,"final"]
table(colData(et)$nodal_mets)
##
## 0 1
## 45 59
Now we need to aggregate and normalize our Data for logistic regression (predict whether oncogene is overexpressed or not). All data will be fit to the interval [0-1], and all categorical variables will be represented as dummy variables. We have 3 classes of variables: DNA methylation (ie all CpGs at gene locus), CNV/AIMS/mRNA expr in normal tissue, Clinical (Age, % tumor cells, Metastasis, Stage, Race, 5 year survival), and Technical (Retrospective collection, Hosp of Procurement, Seq Plate).
#Munging 5-year survival data
days_last_fu <- ifelse(is.na(colData(et)$days_to_death), colData(et)$days_to_last_follow_up,colData(et)$days_to_death)
days_last_fu <- ifelse(days_last_fu > 365.25 * 5, 365.25 * 5, days_last_fu)
vital_stat <- ifelse(colData(et)$vital_status == "alive", 0, 1)
vital_stat <- ifelse(days_last_fu >= 365.25 * 5, 0, vital_stat)
colData(et)$days_last_fu_5yr <- days_last_fu
colData(et)$vital_stat_5yr <- vital_stat
clin_vars_cat <- c("aims_subtypes", "race", "tumor_stage", "vital_stat_5yr")
clin_vars_cont <- c("percent_tumor_cells", "age_at_diagnosis")
tech_vars_cat <- c("retro_coll_yes1", "seqPlate", "hosp")
expr_cnv_cat <- c("aims_subtypes") #cnv and mRNA in nml tissue - extract separately
Now we need the DNA methylation and CNV variables. First, we create GRanges objects for the DNA methylation and CNV matrices. Then, we identify the gene coordinates from our top5 genes. We can then intersect the top5 gene coordinates with the DNA methylation and CNV objects to identify the copy number and methylation status at these gene loci.
#liftover coordinates to hg38
chain <- import.chain(here("raw_data", "liftover", "hg19ToHg38.over.chain"))
hm450 <- get450k() %>%
liftOver(chain) %>%
unlist() %>%
subset(names %in% rownames(m)) %>%
subset(!duplicated(names)) %>% #remove duplicates
data.frame(., row.names=names(.)) %>%
rownames_to_column("cgid") %>%
left_join(data.frame(rowData(m)), .,
by = c("Composite.Element.REF" = "cgid"))
## Fetching coordinates for hg19...
goodLiftOvIdx = !apply(hm450, 1, anyNA)
hm450 <- hm450[goodLiftOvIdx,]
m <- m[goodLiftOvIdx,]
#lifted-over coordinates of 450k array
hm450_gr = GRanges(seqnames = as.character(pull(hm450, seqnames)),
ranges = IRanges(pull(hm450, start), pull(hm450, end)),
cgid = pull(hm450,Composite.Element.REF))
all.equal(rowData(m)$Composite.Element.REF, hm450_gr$cgid)
## [1] TRUE
rowData(m) <- hm450_gr
#then, intersect with the coordinates for the 6 genes
allGeneRange<-genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
#genes_gr <- allGeneRange[as.character(rowData(et[top5genes,])$NCBI.gene.ID),]
#genes_gr$gene_symb <- as.character(rowData(et[top5genes,])$HGNC.symbol)
genes_gr <- allGeneRange[as.character(rowData(et[top5genes,])$NCBI.gene.ID),]
genes_gr$gene_symb <- as.character(rowData(et[top5genes,])$HGNC.symbol)
genes_gr$ensg <- top5genes
#add 1500 bp to the 5' end of the transcript
genes_gr_prom <- promoters(genes_gr, upstream = 1500, downstream = 0)
genes_gr2 <- punion(genes_gr,genes_gr_prom)
genes_gr2@elementMetadata <- genes_gr@elementMetadata
allExons <- exonsBy(TxDb.Hsapiens.UCSC.hg38.knownGene, by="gene") #get the exons
methSE <- rowData(m)$X
#this is how we get the CNV Status for a gene
cnvStatusAtLoci <- function(geneGr, cnvGr){
cnv_subst <- subsetByOverlaps(cnvGr, geneGr)
return(cnv_subst)
}
expr_cnv_cat <- c("aims_subtypes") #cnv and mRNA in nml tissue - extract separately
clin_vars_cat <- c("race", "tumor_stage", "vital_stat_5yr")
tech_vars_cat <- c("retro_coll_yes1", "seqPlate", "hosp")
expr_cnv_cont <- c("expr_nml_txue") #cnv and mRNA in nml tissue
clin_vars_cont <- c("percent_tumor_cells", "age_at_diagnosis")
cpg_vars_cont <- c("Promoter", "Exon_first","Exon_middle", "Exon_last", "Intron")
mmCatVar <- colData(et) %>%
model.matrix(~ aims_subtypes + race + tumor_stage + vital_stat_5yr +
retro_coll_yes1 + seqPlate + hosp, .) %>%
data.frame() %>%
rownames_to_column("tcga_id")%>%
as_tibble() %>%
mutate(tcga_id = substr(tcga_id, 1, 12))
contVar <- colData(et) %>%
data.frame() %>%
as_tibble() %>%
dplyr::select(percent_tumor_cells, age_at_diagnosis, patient) %>%
mutate(percent_tumor_cells=scales::rescale(percent_tumor_cells,to=c(0, 1))) %>%
mutate(age_at_diagnosis=scales::rescale(age_at_diagnosis,to=c(0, 1))) %>%
rename(tcga_id = patient)
allVarClass <- c(cpg_vars_cont, expr_cnv_cat, expr_cnv_cont, "seg_mean", clin_vars_cat, clin_vars_cont, tech_vars_cat)
allVarLabel <-
c(
"TSS1500",
"First Exon",
"Middle Exon",
"Last Exon",
"Intron",
"AIMS subtypes",
"Expr. in Normal Tissue",
"CNV",
"Race",
"Stage",
"5-year survival",
"% Tumor Cells",
"Age",
"Retrospective Collection",
"Seq. plate",
"Hospital"
)
lr_res_maxBeta.l <- list()
aucResults <- c()
for(i in 1:length(genes_gr2)){
cpg_gr <- findAndAnnotateCpGsInGene(genes_gr2[i], allExons, methSE, genes_gr_prom[i])
cpg_bv <- t(assay(m[cpg_gr$cgid,]))
colnames(cpg_bv) <- paste0(cpg_gr$location, "_", cpg_gr$cgid)
#remove columns that are >50% NA
cpg_betaVal <- cpg_bv %>%
.[,colSums(is.na(.)) < 0.5*nrow(.)] %>%
apply(2, range01) %>%
data.frame() %>%
rownames_to_column("tcga_id") %>%
mutate(tcga_id = substr(tcga_id, 1, 12))
cnvStat <- cnvStatusAtLoci(genes_gr2[i], cnv2) %>%
data.frame() %>%
dplyr::select(tcga_id, seg_mean) %>%
mutate(tcga_id=substr(tcga_id,1,12)) %>%
group_by(tcga_id) %>%
summarise(seg_mean = mean(seg_mean)) %>%
#average duplicates
mutate(seg_mean=scales::rescale(seg_mean,to=c(0, 1)))
#get the adjacent normal expression data
exprNml = assay(en)[genes_gr2[i]$ensg,] %>%
data.frame() %>%
rownames_to_column("tcga_id") %>%
mutate(tcga_id = substr(tcga_id, 1,12)) %>%
rename("expr_nml_txue"=".") %>% as_tibble()
xVars <- mmCatVar %>%
left_join(contVar, by="tcga_id") %>%
left_join(cpg_betaVal, by="tcga_id") %>%
left_join(cnvStat, by="tcga_id") %>%
left_join(exprNml, by="tcga_id") %>%
dplyr::select(-tcga_id, -X.Intercept.) %>%
as.matrix() %>%
fillNAwithColMeans() %>%
as_tibble() %>%
select_if(colSums(!is.na(.)) > 0) %>%
as.matrix()
y = colData(et)[,paste0("cl_", genes_gr2[i]$ensg)]
lr_res <- logRegElNetLooc(xVars, y, alpha = 0.5, retGlmNt = TRUE)
predProbs <- sapply(1:nrow(xVars), function(x)
predict(lr_res, xVars[x,,drop=F], type="response"))
auc_res <- predProbs %>%
roc(factor(y)) %>%
auc()
aucResults <- c(aucResults, auc_res)
lr_res_raw <- lr_res %>%
coef() %>%
as.matrix() %>%
data.frame() %>%
rownames_to_column(var="varNm")
lr_res_maxBeta <- allVarClass %>%
sapply(function(x) grep(paste0("^",x), lr_res_raw$varNm)) %>%
lapply(., function(x) lr_res_raw[x,"s0"][which.max(
abs(lr_res_raw[x,"s0"]))]) %>%
lapply(function(x) ifelse(length(x) == 0, NA, x)) %>%
unlist() %>%
data.frame(betaVal=.) %>%
rownames_to_column(var="varNm") %>%
mutate(rnk =rank(-abs(betaVal), na.last=T)) %>%
mutate(rnk=ifelse(is.na(betaVal)," ", ifelse(betaVal == 0, "·", rnk)),
varNm = factor(varNm, levels=rev(allVarClass), labels=rev(allVarLabel)))
lr_res_maxBeta.l <- c(lr_res_maxBeta.l, list(lr_res_maxBeta))
}
names(lr_res_maxBeta.l) <- genes_gr2$gene_symb
names(aucResults) <- genes_gr2$gene_symb
Now, generate a matrix and visualize the coefficients.
coefMatr <- lr_res_maxBeta.l %>%
do.call(rbind, .) %>%
rownames_to_column("gene") %>%
separate("gene", c("gene", "toRm")) %>%
dplyr::select(-toRm) %>%
mutate(betaValCat = cut(.[,"betaVal"],
breaks=c(-Inf, -2.5, -1.25, -0.25, -1*10^(-6),
1*10^(-6), 0.25, 1.25, 2.5, Inf))) %>%
replace_na(list(betaValCat="(-1e-06,1e-06]")) %>%
mutate(gene = factor(gene, levels=genes_gr2$gene_symb))
#plot the coefficient matrix
ggplot(coefMatr, aes(x=gene, y=varNm, fill=betaValCat)) +
geom_tile(color="grey") + coord_equal() + theme_classic() +
scale_fill_brewer(palette = "RdBu", type="div",
drop=FALSE, direction = -1) +
geom_text(aes(label=rnk)) +
theme(axis.ticks = element_blank(),
axis.line = element_blank(),
axis.text = element_text(color="black", size=16),
axis.text.x = element_text(angle=90, vjust=0.5, hjust=1)) +
xlab("") + ylab("")
#plot the AUC values
aucResults.df <- aucResults %>%
as.data.frame()%>%
rownames_to_column("gene") %>%
rename(aucVal=".") %>%
mutate(gene = factor(gene, levels=genes_gr2$gene_symb)) %>%
mutate(colorsCut = cut(aucVal, c(0,0.5,0.6,0.8,0.9,0.95,0.99,0.999,1))) #c(0,0.6,0.8,0.85,0.9,0.95,1)))
ggplot(aucResults.df, aes(y = as.factor("AUC"), x = gene)) +
theme_classic() +
geom_tile(data = aucResults.df, color="black", aes(fill = colorsCut)) +
scale_fill_brewer(palette = "Purples", drop=F) +
theme(aspect.ratio=0.15) +
geom_text(color = "black", size=12, aes(label = round(aucVal,2))) +
theme(axis.text.x = element_blank(), axis.ticks=element_blank(),
axis.text.y= element_text(size=20, hjust=2),
axis.line = element_blank()) +
xlab("") + ylab("")
Plot all the CpGs at the CBX2 locus
class_cbx2 = colData(et) %>%
data.frame %>%
dplyr::select("cl_ENSG00000173894") %>%
rownames_to_column(var = "ptId") %>%
mutate(ptId, ptId = substr(ptId, 1,12))
cbx2CpGs <- methStatusAtLoci(genes_gr2[3], methSE)
cpg_cbx2 <- genes_gr2 %>%
subset(gene_symb == "CBX2") %>%
methStatusAtLoci(methSE) %>%
.$cgid %>%
m[.,] %>%
assay() %>%
t() %>%
data.frame() %>%
rownames_to_column(var = "ptId") %>%
mutate(ptId, ptId = substr(ptId, 1,12)) %>%
left_join(class_cbx2, by="ptId") %>%
mutate(cl_ENSG00000173894,
cl_ENSG00000173894=factor(cl_ENSG00000173894)) %>%
reshape2::melt() %>%
rename(cpg=variable) %>%
#order by position
mutate(cpg=factor(cpg, levels=sort(cbx2CpGs)$cgid)) %>%
na.omit()
## Using ptId, cl_ENSG00000173894 as id variables
#examine the differences in group means between the CpG values
cpg_cbx2.s <- spread(cpg_cbx2, cpg, value)
wilcox.res.pvals = c()
for(ea_col in 3:ncol(cpg_cbx2.s)){
#do a wilcoxon test for differences in means...
wilcox.res = wilcox.test(cpg_cbx2.s[,ea_col] ~
cpg_cbx2.s$cl_ENSG00000173894, cpg_cbx2.s)
wilcox.res.pvals = c(wilcox.res.pvals, wilcox.res$p.value)
}
wilcox.res.pvals.adj <- p.adjust(wilcox.res.pvals, method="BH")
names(wilcox.res.pvals.adj) <- colnames(cpg_cbx2.s)[3:ncol(cpg_cbx2.s)]
wilcox.res.pvals.adj.star <- sig2star01(wilcox.res.pvals.adj)
ggplot(cpg_cbx2, aes(x = cpg, y = value, fill = cl_ENSG00000173894, color = cl_ENSG00000173894)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1, vjust = 0.5), aspect.ratio=0.30) +
geom_boxplot(outlier.alpha = 0.5) +
scale_fill_brewer(palette = "Paired", name = "CBX2 Expression", labels=c("Baseline", "Overexpressed")) +
scale_color_brewer(palette = "Paired") +
geom_boxplot(color="black", outlier.shape = NA) +
ylab("Beta Value") + ylim(0,1.4) +
annotate("text", label = paste0(" ", wilcox.res.pvals.adj.star),
x = 1:length(wilcox.res.pvals.adj.star), y = 1.02,
color = "black", size = 8, angle = 90, vjust = 0.75) +
annotate("text", x = 10, y = 1, label = "* q<0.01\n**q<0.001\n***q<0.0001")
Plot the most significant CpG from CBX2 and perform post-hoc statistical testing.
cg_sml_pval <- names(which(wilcox.res.pvals.adj == min(wilcox.res.pvals.adj)))
cg_sml_pval
## [1] "cg25270886"
min(wilcox.res.pvals.adj)
## [1] 6.107046e-09
cpg_cbx2_1cg <- cpg_cbx2 %>% filter(cpg == cg_sml_pval) %>% mutate(cl_ENSG00000173894 = ifelse(cl_ENSG00000173894 == 0, "Baseline", "Overexpressed"))
ggplot(cpg_cbx2_1cg, aes(x = cl_ENSG00000173894, y = value,
fill = cl_ENSG00000173894, color = cl_ENSG00000173894)) +
theme_classic() +
geom_jitter(width=0.2, height=0, alpha=0.5, size=4, pch=21, color="black") +
geom_boxplot(fill=NA, outlier.shape = NA) +
theme(text = element_text(color="black", size=20), legend.position="none") +
scale_fill_brewer(palette = "Paired", name = "CBX2 Expression", labels=c("Baseline", "Overexpressed")) +
scale_color_brewer(palette = "Paired") +
ylab("Beta Value") + xlab("")
Now, see which have peaks in the vicinity of CBX2 loci. Create a 28 x n binary matrix, indicating whether a peak is present at the CpG site or not. The file “metadata.txt” is provided; ChIP-seq experiments listed in this file can be downloaded from ENCODE GUI (no easy-to-use interface from R unfortunately).
Look at methylation status across CBX2 locus
path2ChipSeq = here("raw_data",
"2017_04_14_encode_chipSeq_mcf7",
"columnCut1237")
encode_chipSeq_fn = list.files(path2ChipSeq)
#remove any file that doesn't end in .bed
encode_chipSeq_fn = encode_chipSeq_fn[
substr(encode_chipSeq_fn,
nchar(encode_chipSeq_fn[2]) - 3,
nchar(encode_chipSeq_fn[2])) == ".bed"
]
encode_chipSeq_fn.trim = substr(encode_chipSeq_fn, 5, 15)
############
#read in the metadata file
metadata.mcf7.encode = read_delim(file = here("raw_data",
"2017_04_14_encode_chipSeq_mcf7",
"metadata.txt"), delim = "\t", col_names = TRUE) %>%
filter(`File accession` %in% encode_chipSeq_fn.trim) %>%
dplyr::select("File accession","Output type","Experiment target")
encode_chipSeq.mcf7.list = GRangesList()
for(i in 1:nrow(metadata.mcf7.encode)){
encode_chipSeq.list = import(here("raw_data", "2017_04_14_encode_chipSeq_mcf7", "columnCut1237", paste0("cut_", metadata.mcf7.encode[i,"File accession"], ".bed")), format="BED")
x = as.numeric(encode_chipSeq.list@elementMetadata$name)
y = rescale(x,c(10,1000))
y = as.data.frame(matrix(y, ncol=1))
colnames(y) = "rescaledRange"
values(encode_chipSeq.list) <- cbind(values(encode_chipSeq.list), rescaledRange = y)
encode_chipSeq.mcf7.list[[i]] <- encode_chipSeq.list
}
names(encode_chipSeq.mcf7.list) <- metadata.mcf7.encode$`Experiment target` #encode_chipSeq_fn.trim
#plot histograms for each of these to make sure there is no severe right skew
## overlap the CBX2 CpGs with the ENCODE ChIP-seq data from MCF7
encode_chipSeq.mcf7.list.sbst <- encode_chipSeq.mcf7.list %>%
lapply(findOverlaps, cbx2CpGs)
encode_chipSeq.mcf7.list.sbst.chip <-
encode_chipSeq.mcf7.list.sbst %>%
.[lapply(.,length) != 0] %>%
lapply(data.frame) %>%
do.call(rbind.data.frame, .) %>%
rownames_to_column(var = "protein") %>%
separate(protein, c("protein", "toRm"), sep="-") %>%
dplyr::select(-toRm) %>%
cbind(.,as.matrix(cbx2CpGs[.$subjectHits]@elementMetadata)) %>%
as_tibble() %>%
mutate(cgid = factor(cgid, levels = sort(cbx2CpGs)$cgid))
#reorder the levels of the rows
tfs <- c("JUND", "SIN3A", "MAZ","CTBP1")
enh <- c("H3K27ac", "H3K4me1")
prom <- c("H3K4me2", "H3K4me3")
repr <- "H3K9ac"
elong <- "H4K20me1"
othHist <- c("H2AFZ", "H3F3A")
orderChip <- rev(c(othHist, enh, prom, repr, elong, tfs))
encode_chipSeq.mcf7.list.sbst.chip <- encode_chipSeq.mcf7.list.sbst.chip %>% mutate(protein = factor(protein, levels = orderChip))
ggplot(encode_chipSeq.mcf7.list.sbst.chip, aes(x = cgid, y = protein)) + theme_classic() + theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1, vjust = 0.5)) + coord_fixed(ratio = 1) + geom_tile(color = "lightgrey",width=0.7, height=0.7) + xlab("") + ylab("") + scale_fill_manual(values= c("white", "black"))
Survival analysis using CBX2 data from 110 patients as well as from entire TCGA dataset.
Start by looking at 110 patients.
i=3
cbx2_bound <- mmp2 %>%
rownames_to_column(var="geneId") %>%
filter(geneId == genes_gr2[i]$gene_symb) %>%
dplyr::select(starts_with("tMu")) %>%
as.numeric() %>% mean()
e_cbx2 <- et[genes_gr2[i]$ensg,substr(colnames(et),14,15) == "01"]
colData(e_cbx2)$cbx2Class <- ifelse(as.numeric(assay(e_cbx2)) > cbx2_bound, 1, 0)
days_last_fu <- ifelse(is.na(colData(e_cbx2)$days_to_death), colData(e_cbx2)$days_to_last_follow_up,colData(e_cbx2)$days_to_death)
days_last_fu <- ifelse(days_last_fu > 365.25 * 5, 365.25 * 5, days_last_fu)
vital_stat <- ifelse(colData(e_cbx2)$vital_status == "alive", 0, 1)
vital_stat <- ifelse(days_last_fu >= 365.25 * 5, 0, vital_stat)
colData(e_cbx2)$days_last_fu_5yr <- days_last_fu
colData(e_cbx2)$years_last_fu_5yr <- days_last_fu/365.25
colData(e_cbx2)$vital_stat_5yr <- vital_stat
e_cbx2 <- e_cbx2[,!is.na(e_cbx2$vital_stat_5yr)]
e_cbx2 <- e_cbx2[,!is.na(e_cbx2$years_last_fu_5yr)]
fit <- survfit(Surv(e_cbx2$years_last_fu_5yr, as.numeric(e_cbx2$vital_stat_5yr)) ~ as.factor(cbx2Class), data = colData(e_cbx2))
fit.stat <- survival::survdiff(Surv(e_cbx2$years_last_fu_5yr, as.numeric(e_cbx2$vital_stat_5yr)) ~ as.factor(e_cbx2$cbx2Class))
pval <- pvalFxn(fit.stat)
group1Deaths <- sum(as.numeric(e_cbx2$vital_stat_5yr[e_cbx2$cbx2Class == 0]))
group2Deaths <- sum(as.numeric(e_cbx2$vital_stat_5yr[e_cbx2$cbx2Class == 1]))
#all patients
GGally::ggsurv(fit, back.white = T) +
xlab("Time (Years)") + ylab("Prob. of Survival") +
ggtitle(paste(genes_gr2[i]$gene_symb, ": K-M", "| Lg-Rk p:", round(pval,3),
"\n", "G1:", group1Deaths,"/", fit$n[1],"died |", "G2:",
group2Deaths, "/", fit.stat$n[2], "died",sep=" ")) +
theme(legend.title=element_blank(),
axis.text = element_text(size=18, color = "black"),
text = element_text(size=17)) +
scale_color_brewer(palette = "Paired", direction = 1) +
ylim(c(0,1))
## Loading required package: scales
## Warning: package 'scales' was built under R version 3.4.4
##
## Attaching package: 'scales'
## The following object is masked _by_ '.GlobalEnv':
##
## rescale
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
#ggsave(filename="output/survival_110_cbx2.png")
Now look at all patients in tcga
e_cbx2_all <- e[genes_gr2[i]$ensg,substr(colnames(e),14,15) == "01"]
dim(e_cbx2_all)
## [1] 1 1086
#test <- e[genes_gr2[3]$ensg,substr(colnames(e),14,15) == "01"]
colData(e_cbx2_all)$cbx2Class <- ifelse(as.numeric(assay(e_cbx2_all)) > cbx2_bound, 1, 0)
days_last_fu <- ifelse(is.na(colData(e_cbx2_all)$days_to_death), colData(e_cbx2_all)$days_to_last_follow_up,colData(e_cbx2_all)$days_to_death)
days_last_fu <- ifelse(days_last_fu > 365.25 * 5, 365.25 * 5, days_last_fu)
vital_stat <- ifelse(colData(e_cbx2_all)$vital_status == "alive", 0, 1)
vital_stat <- ifelse(days_last_fu >= 365.25 * 5, 0, vital_stat)
colData(e_cbx2_all)$days_last_fu_5yr <- days_last_fu
colData(e_cbx2_all)$years_last_fu_5yr <- days_last_fu/365.25
colData(e_cbx2_all)$vital_stat_5yr <- vital_stat
e_cbx2_all <- e_cbx2_all[,!is.na(e_cbx2_all$vital_stat_5yr)] #remove pt w/o surv data
dim(e_cbx2_all)
## [1] 1 1084
e_cbx2_all <- e_cbx2_all[,!is.na(e_cbx2_all$years_last_fu_5yr)] #remove pt w/o surv data
dim(e_cbx2_all)
## [1] 1 1084
fit <- survfit(Surv(e_cbx2_all$years_last_fu_5yr, as.numeric(e_cbx2_all$vital_stat_5yr)) ~ as.factor(cbx2Class), data = colData(e_cbx2_all))
fit.stat <- survival::survdiff(Surv(e_cbx2_all$years_last_fu_5yr, as.numeric(e_cbx2_all$vital_stat_5yr)) ~ as.factor(e_cbx2_all$cbx2Class))
pval2 = pvalFxn(fit.stat)
print(pval2)
## [1] 0.01695547
group1Deaths <- sum(as.numeric(e_cbx2_all$vital_stat_5yr[e_cbx2_all$cbx2Class == 0]))
group2Deaths <- sum(as.numeric(e_cbx2_all$vital_stat_5yr[e_cbx2_all$cbx2Class == 1]))
#all patients
GGally::ggsurv(fit, back.white = T) +
xlab("Time (Years)") +
ylab("Prob. of Survival") +
ggtitle(paste(genes_gr2[i]$gene_symb, ": K-M", "| Lg-Rk p:", round(pval2,3),
"\n", "G1:", group1Deaths,"/", fit$n[1],"died |", "G2:",
group2Deaths, "/", fit.stat$n[2], "died",sep=" ")) +
theme(legend.title=element_blank(),
axis.text = element_text(size=18, color = "black"),
text = element_text(size=17)) +
scale_color_brewer(palette = "Paired", direction = 1) + ylim(c(0,1))
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
#ggsave(filename="output/survival_all_cbx2.png")
#ggsave(filename="output/survival_all_cbx2.png", width = 6, height = 5, units = "in")
p.adjust(c(pval, pval2), "BH")
## [1] 0.07631043 0.03391094
Show histogram of distribution of CBX2 expression values across all patients with cutoff
expr_dat_cbx2_all <- assay(e_cbx2_all["ENSG00000173894",]) %>%
t() %>%
as.data.frame %>%
rownames_to_column("tcga_id") %>%
mutate(cbx2Class = as.factor(ifelse(ENSG00000173894 > cbx2_bound, 1,0)))
table(expr_dat_cbx2_all$cbx2Class)
##
## 0 1
## 635 449
ggplot(expr_dat_cbx2_all, aes(x=ENSG00000173894, fill=cbx2Class)) +
geom_histogram(binwidth = 0.2) + scale_fill_brewer(palette = "Paired", name = "CBX2 Expression", labels=c("Baseline", "Overexpressed")) + theme_classic() + theme(text = element_text(color="black", size=20)) + ylab("Frequency") + xlab("") + geom_vline(xintercept = cbx2_bound) + xlab(paste0("CBX2 mRNA Expression in Tumors"))# (N=",nrow(e_cbx2), ")"))
#ggsave(filename = "output/cbx2_expr_all_tum_classif.png", width=7, height=5)
Plotting relationship between molecular subtype and cbx2 expression
subtypeDf <- colData(et) %>%
data.frame() %>%
dplyr::select(cl_ENSG00000173894, aims_subtypes) %>%
mutate(aims_subtypes = factor(aims_subtypes, levels=c("Normal", "LumA", "LumB", "Her2", "Basal"))) %>%
mutate(cl_ENSG00000173894 = factor(cl_ENSG00000173894))
#simplify
ggplot(subtypeDf, aes(x=aims_subtypes, fill=cl_ENSG00000173894, color=cl_ENSG00000173894)) + geom_histogram(stat="count") + scale_color_brewer(palette = "Paired") + scale_fill_brewer(palette = "Paired") + xlab("") + theme_classic() + scale_y_continuous(expand = c(0,0)) + ylab("Frequency") + theme(text = element_text(size = 16, color="black"))
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## calculate statistics -- fishers exact for each group
subtypes <- unique(subtypeDf$aims_subtypes)
pvalFishStore <- c()
for(st in subtypes){
pvalFish <- subtypeDf %>%
mutate(subtypeBinary = ifelse(aims_subtypes == st, 1,0)) %>%
fisher.test(x=.$cl_ENSG00000173894, y = .$subtypeBinary) %>%
.$p.value
pvalFishStore <- c(pvalFishStore, pvalFish)
}
names(pvalFishStore) <- subtypes
p.adjust(pvalFishStore, method="BH")
## Basal LumA LumB Her2 Normal
## 8.076490e-04 1.765789e-05 1.000000e+00 1.601974e-03 5.005285e-02
## percent high
tbl_perc <- table(subtypeDf$cl_ENSG00000173894, subtypeDf$aims_subtypes)
round(tbl_perc[2,] / colSums(tbl_perc), 2)
## Normal LumA LumB Her2 Basal
## 0.00 0.11 0.40 0.72 0.82
Benchmarking oncomix against mCOPA.
## Prepare the data for mCOPA
if(!file.exists(here("processed_data", "mCopa", "exprMatrMcopa.txt"))){
exprMatrMcopa = cbind(assay(en), assay(et))
exprMatrMcopa = ifelse(exprMatrMcopa< 1*10^(-6), 0 , exprMatrMcopa)
write.table(exprMatrMcopa, here("processed_data", "mCOPA", "exprMatrMcopa.txt"), sep="\t", quote = F)
}
Perl script below. Run this on the command line
module load perl/5.22.1/gcc.4.4.7
perl mCOPA.pl exprMatrMcopa.txt out/mCOPA_Up90.txt out/mCOPA_Down10.txt 90 10 110
cut -f1 out/mCOPA_Up90.txt > out/mCOPA_Up90_gene.txt
cat out/mCOPA_Up90.txt | rev | cut -d$'\t' -f1| rev > out/mCOPA_Up90_gene_log2fc.txt
paste out/mCOPA_Up90_gene.txt out/mCOPA_Up90_gene_log2fc.txt > out/mCOPA_Up90_2col.txt
perl mCOPA.pl exprMatrMcopa.txt out/mCOPA_Up80.txt out/mCOPA_Down20.txt 80 20 110
cut -f1 out/mCOPA_Up80.txt > out/mCOPA_Up80_gene.txt
cat out/mCOPA_Up80.txt | rev | cut -d$'\t' -f1| rev > out/mCOPA_Up80_gene_log2fc.txt
paste out/mCOPA_Up80_gene.txt out/mCOPA_Up80_gene_log2fc.txt > out/mCOPA_Up80_2col.txt
perl mCOPA.pl exprMatrMcopa.txt out/mCOPA_Up70.txt out/mCOPA_Down30.txt 70 30 110
cut -f1 out/mCOPA_Up70.txt > out/mCOPA_Up70_gene.txt
cat out/mCOPA_Up70.txt | rev | cut -d$'\t' -f1| rev > out/mCOPA_Up70_gene_log2fc.txt
paste out/mCOPA_Up70_gene.txt out/mCOPA_Up70_gene_log2fc.txt > out/mCOPA_Up70_2col.txt
Read in the output from mCOPA.
mcopa_70 = read.table(here("processed_data", "mCopa", "out","mCOPA_Up70_2col.txt"), header = T, stringsAsFactors = F) %>%
rename_("genes" = names(.)[1], "log2FC" = names(.)[2]) %>%
mutate(percentile = "70") %>% as_tibble()
mcopa_80 = read.table(here("processed_data", "mCopa", "out", "mCOPA_Up80_2col.txt"), header = T, stringsAsFactors = F) %>%
rename_("genes" = names(.)[1], "log2FC" = names(.)[2]) %>%
mutate(percentile = "80") %>% as_tibble()
mcopa_90 = read.table(here("processed_data", "mCopa", "out", "mCOPA_Up90_2col.txt"), header = T, stringsAsFactors = F) %>%
rename_("genes" = names(.)[1], "log2FC" = names(.)[2]) %>%
mutate(percentile = "90") %>% as_tibble()
mcopa <- bind_rows(mcopa_70, mcopa_80, mcopa_90) %>%
group_by(percentile) %>%
arrange(desc(log2FC), .by_group = TRUE) %>%
mutate(id = row_number())
## how many genes total were ranked between the 3 mcopa runs
mcopa %>% ungroup() %>% select(genes) %>% distinct()
## # A tibble: 2,152 x 1
## genes
## <chr>
## 1 ENSG00000110925
## 2 ENSG00000196517
## 3 ENSG00000127527
## 4 ENSG00000049656
## 5 ENSG00000162736
## 6 ENSG00000125445
## 7 ENSG00000160948
## 8 ENSG00000123131
## 9 ENSG00000132294
## 10 ENSG00000178796
## # ... with 2,142 more rows
## where do top5 oncomix genes fall in mCOPA rankings
mcopa %>% filter(genes %in% genes_gr2$ensg)
## # A tibble: 3 x 4
## # Groups: percentile [3]
## genes log2FC percentile id
## <chr> <dbl> <chr> <int>
## 1 ENSG00000089692 2.34 70 1818
## 2 ENSG00000089692 2.48 80 1155
## 3 ENSG00000089692 2.02 90 1076
## plot the distribution of the top5 mcopa genes
mcopa_top5 <- mcopa %>% top_n(5, log2FC) %>% left_join(ensg_to_prot, by=c("genes"= "Gene.stable.ID"))
gglayers1 = list(
theme(axis.text.x = element_text(size=24), plot.title=element_text(size=26)),
xlim(0,12.5), xlab(""), ylab("")
)
#remove curved lines
for(i in 1:nrow(mcopa_top5)){
gglayers1[["title"]] <- ggtitle(mcopa_top5$HGNC.symbol[i])
gene <- as.character(mcopa[i,"genes"])
plotGeneHistNoProbCrv(mmParams = mmp, exprNml=assay(en), exprTum=assay(et), isof=gene, gglayers1)
ggsave(here("output", "mcopa", paste0(gene,"_", mcopa_top5[i,"percentile"], "_",mcopa_top5[i,"log2FC"], ".png")), width=4, height=4, units="in")
}
Benchmarking oncomix against limma. Perform a 2 sample t-test between tumor and normal samples.
classif <- c(rep(0, ncol(en)), rep(1, ncol(et)))
dsgn <- model.matrix(~ classif +
c(colData(en)[,"seqPlate"], colData(et)[,"seqPlate"]) +
c(colData(en)[,"hosp"], colData(et)[,"hosp"]))
#where do our 5 genes fall inthe limma rank among upregulated genes?
genesUpInTum <- returnDeGenesLimma(cbind(assay(en), assay(et)),
dsgn, colOfInt = 2, retAll=T) %>%
rownames_to_column("genes") %>%
arrange(desc(-adj.P.Val)) %>%
as_tibble() %>% filter(logFC > 0) %>%
mutate(rank = row_number())
## Coefficients not estimable: c(colData(en)[, "hosp"], colData(et)[, "hosp"])GI
dim(genesUpInTum)
## [1] 7388 8
#now we show where our genes fall in the limma ranks
limma_oncmx_olp <- genesUpInTum %>%
filter(genes %in% genes_gr2$ensg)
limma_oncmx_olp
## # A tibble: 5 x 8
## genes logFC AveExpr t P.Value adj.P.Val B rank
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 ENSG00000155886 1.67 0.907 14.6 4.40e⁻³³ 1.13e⁻³¹ 64.5 149
## 2 ENSG00000083782 2.41 1.33 12.4 1.67e⁻²⁶ 2.41e⁻²⁵ 49.4 279
## 3 ENSG00000173894 1.58 2.77 9.36 1.86e⁻¹⁷ 1.21e⁻¹⁶ 28.7 756
## 4 ENSG00000184613 1.54 2.94 5.75 3.32e⁻ ⁸ 9.34e⁻ ⁸ 7.75 2264
## 5 ENSG00000089692 0.683 1.94 4.59 7.92e⁻ ⁶ 1.72e⁻ ⁵ 2.44 3077
## visualize the distribution of the top 5 genes
genesUpInTumTop5 <- genesUpInTum %>%
top_n(5, -adj.P.Val) %>%
left_join(ensg_to_prot, by=c("genes"= "Gene.stable.ID"))
gglayers1 = list(
theme(axis.text.x = element_text(size=24), plot.title=element_text(size=26)),
xlim(0,12.5), xlab(""), ylab("")
)
for(i in 1:nrow(genesUpInTumTop5)){
gene = as.character(genesUpInTumTop5[i,"genes"])
gglayers1[["title"]] <- ggtitle(genesUpInTumTop5$HGNC.symbol[i])
plotGeneHistNoProbCrv(mmParams = mmp, exprNml=assay(en), exprTum=assay(et), isof=gene, gglayers1)
ggsave(here("output", "limma", paste0(gene,"_", genesUpInTumTop5[i,"rank"], "_",mcopa[i,"log2FC"], ".png")), width=4, height=4, units="in")
}
Look at the overlap between known oncogenes and our oncomix scores.
Read in a list of established oncogenes and tumor suppressors from COSMIC, v83,, downloaded Jan. 20, 2018.
cosDat <- read_csv(file=here("raw_data", "2018_01_20_cosmic", "2018_01_20_Census_all.csv"))
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Entrez GeneId` = col_integer(),
## Tier = col_integer()
## )
## See spec(...) for full column specifications.
names(cosDat) <- gsub(" ", "_", names(cosDat))
cosDatOncg <- cosDat %>%
filter(Tier == 1) %>%
filter(str_detect(Role_in_Cancer, "oncogene")) %>%
filter(!str_detect(Role_in_Cancer, "TSG")) %>%
rename("entrezgene" = Entrez_GeneId)
if(file.exists(here("raw_data", "2018_01_20_ensg_entrz","entrez_ensg.rds"))){
entrez_ensg <- readRDS(here("raw_data", "2018_01_20_ensg_entrz","entrez_ensg.rds"))
} else {
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
attributes = listAttributes(ensembl)
entrez_ensg <- getBM(mart=ensembl,
attributes=c("ensembl_gene_id", "entrezgene"),
values = "entrezgene")
saveRDS(entrez_ensg, file=here("raw_data", "2018_01_20_ensg_entrz","entrez_ensg.rds"))
}
#map the gene symbols to ENSG
cosDatOncgE <- left_join(cosDatOncg, entrez_ensg, by="entrezgene")
#look at distribution of oncomix scores in oncogenes
mmpOnc <- mmp %>%
rownames_to_column("ensembl_gene_id") %>%
mutate(rank = row_number()) %>%
as_tibble() %>%
filter(ensembl_gene_id %in% cosDatOncgE$ensembl_gene_id) %>%
left_join(cosDatOncgE, by="ensembl_gene_id")
mmpOncFilt <- mmpOnc %>% filter(SI > .99, score > 0)
mmpLab <- mmp
idx2replace <- sapply(mmpOncFilt$ensembl_gene_id, grep, rownames(mmpLab))
rownames(mmpLab)[idx2replace] <- mmpOncFilt$Gene_Symbol
#show the scores for the top oncogene candidates identified by oncomix
mmpLab %>% rownames_to_column("gene_id") %>%
filter(gene_id %in% mmpOncFilt$Gene_Symbol)
## gene_id nMu1 nMu2 nVar nPi1 tMu1 tMu2
## 1 LMO1 0.3516258 0.4028593 0.20313801 0.53775722 0.3235132 2.999381
## 2 HOXA13 0.4534165 1.0264406 0.04644180 0.87489764 0.3027152 3.047879
## 3 TAL2 0.4432727 0.5360506 0.24245524 0.58175281 0.5873557 2.750804
## 4 SOX2 0.2122669 1.5307940 0.05508013 0.87160405 0.3203616 3.469381
## 5 HOXD13 0.1895123 1.0034185 0.04627831 0.86168925 0.2894383 2.534061
## 6 WHSC1L1 3.9747188 5.1202297 0.10121847 0.09644618 4.8630534 7.758587
## 7 PSIP1 4.8319195 5.6588232 0.08969016 0.07619842 5.0354927 7.591011
## 8 KAT6A 4.0452877 5.4666472 0.11583664 0.12132554 5.0932465 7.419123
## 9 STIL 1.6322391 2.1144874 0.26739375 0.57626590 2.7396512 4.071955
## 10 ERBB2 4.1760021 6.7871609 0.47168226 0.16744898 6.8321347 10.960164
## 11 SALL4 0.3495496 0.8467502 0.03093821 0.87897864 0.9983323 1.974797
## 12 CCNE1 1.6219497 2.6644384 0.11354100 0.94473181 2.6809223 4.683597
## tVar tPi1 deltaMu2 deltaMu1 SI score
## 1 0.15028154 0.9266807 2.596522 -0.02811254 0.9909091 2.25056731
## 2 0.11748177 0.9361485 2.021438 -0.15070126 1.0000000 2.00821565
## 3 0.23865529 0.8704420 2.214753 0.14408304 0.9909091 1.57510925
## 4 0.26919040 0.8095887 1.938587 0.10809474 0.9909091 1.49252920
## 5 0.09145895 0.9727264 1.530643 0.09992598 0.9909091 1.28122495
## 6 0.43856787 0.9413129 2.638357 0.88833466 1.0000000 1.21023628
## 7 0.49484368 0.9659080 1.932188 0.20357320 0.9909091 1.13367977
## 8 0.40490233 0.9673643 1.952476 1.04795879 1.0000000 0.38377830
## 9 0.32710061 0.5451692 1.957468 1.10741207 0.9909091 0.25323820
## 10 0.83584378 0.9017369 4.173003 2.65613257 0.9909091 0.20744110
## 11 0.25451795 0.5066894 1.128047 0.64878279 1.0000000 0.19380793
## 12 0.75376019 0.8304956 2.019159 1.05897267 1.0000000 0.09288494
#plot the distributions of these oncogenes
gglayers2 = list(
theme(axis.text.x = element_text(size=24), plot.title=element_text(size=26)),
xlim(0,12.5), xlab(""), ylab("")
)
for(i in 1:length(mmpOncFilt$Gene_Symbol)){
gglayers2[["title"]] <- ggtitle(paste0(mmpOncFilt$Gene_Symbol[i], " Score: ", round(mmpOncFilt$score[i],2)))
#print(range(assay(en)[mmpOncFilt$ensembl_gene_id[i],], assay(et)[mmpOncFilt$ensembl_gene_id[i],]))
#print(mmpOncFilt$Gene_Symbol[i])
plotGeneHist(mmp, assay(en), assay(et), mmpOncFilt$ensembl_gene_id[i], gglayers2)
ggsave(filename = here("output", paste0("known_oncog_",mmpOncFilt$Gene_Symbol[i], "_histog.png")), width = 5, height=5, units = "in")
}
Create a plot comparing the selectivity index of known oncogene candidates vs all genes
## is there enrichment of oncomix score among oncogenes??
ggplot(mmpOnc, aes(x=SI)) +
geom_histogram(binwidth = 0.02, color="grey", alpha=alph, aes(fill=SI>0.99)) +
theme_classic() + scale_fill_manual(values=reds[c(3,7)]) + scale_y_continuous(expand = c(0, 0)) + xlab("Selectivity Index") + ylab("Frequency") + theme(text=element_text(size=20, color="black"), legend.position="none")
ggplot(mmpOnc, aes(x=score, y=..density.., fill=SI > 0.99)) +
geom_histogram(data=subset(mmpOnc, SI<0.99), aes(x=score, y=..density.., fill=SI > 0.99), binwidth = 0.1, color="grey", alpha=alph) +
geom_histogram(data=subset(mmpOnc, SI>0.99), aes(x=score, y=..density.., fill=SI > 0.99), binwidth = 0.1, color="grey", alpha=alph) +
theme_classic() + scale_fill_manual(values=reds[c(3,7)]) + scale_y_continuous(expand = c(0, 0)) + theme(text=element_text(size=20, color="black"), legend.position="none") + ylab("Density") + xlab("Oncomix Score")
Expression of oncogenes from the Cancer Gene Census in normal adult tissue from Gtex.
gtexDat <- read.gct(here("raw_data", "2016_10_05_gTex_tissue", "GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_median_tpm.gct"))
gtexDat2 <- gtexDat %>%
as.data.frame() %>%
rownames_to_column("ensg") %>%
as_tibble() %>%
clean_names() %>%
mutate(ensg = substr(ensg, 1,15))
gtexDat2.m <- gtexDat2 %>%
filter(ensg %in% mmpOncFilt$ensembl_gene_id) %>%
gather(key="tissue", value="exprlvl", 2:ncol(.)) %>%
mutate(exprlvlbin = ifelse(exprlvl>1, 1, 0)) %>%
left_join(select(mmpOncFilt, ensembl_gene_id, Gene_Symbol), by=c("ensg" = "ensembl_gene_id")) %>%
mutate(tissue = factor(tissue, levels=sort(unique(tissue), decreasing=TRUE)))
#cluster these samples
gtexDat2.s <- gtexDat2.m %>% select(Gene_Symbol, exprlvlbin, tissue) %>% spread(key = tissue, value=exprlvlbin) %>% as.data.frame()
rownames(gtexDat2.s) <- gtexDat2.s[,"Gene_Symbol"]
gtexDat2.s.m <- as.matrix(gtexDat2.s %>% select(-Gene_Symbol))
d <- dist(gtexDat2.s.m, method = "euclidean")
gtex_clust <- hclust(d, method = "complete" )
plot(gtex_clust, cex = 0.6, hang = -1, lwd = 2)
gtexDat2.m2 <- gtexDat2.m %>%
mutate(Gene_Symbol = factor(Gene_Symbol, levels=gtex_clust$labels[gtex_clust$order]),
exprlvlbin = factor(exprlvlbin, levels=c(0,1), labels=c("< 1", "\u2265 1")))
ggplot(gtexDat2.m2, aes(x = Gene_Symbol, y = tissue, fill = exprlvlbin)) +
geom_tile(color = "white", size = 0.4) +
scale_fill_brewer(direction = 1, name="TPM") +
theme_classic() +
theme(axis.ticks = element_blank(),
axis.line = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5)) +
coord_fixed(ratio = 1) + xlab("") + ylab("")
Supplementary Figure 3A
Percentage of GTEx tissues expressing the CGC gene
gtexDat3 <- gtexDat2 %>%
filter(ensg %in% mmpOnc$ensembl_gene_id) %>%
gather(key="tissue", value="exprlvl", 2:ncol(.)) %>%
mutate(exprlvlbin = ifelse(exprlvl>1, 1, 0))
gtexDat4 <- gtexDat3 %>%
group_by(ensg, exprlvlbin) %>%
summarise (n = n()) %>%
mutate(freq = n / sum(n))
gtexDat5 <- gtexDat4 %>%
ungroup() %>%
## make implicit missing value explicit
complete(ensg, exprlvlbin, fill = list(n = 0, freq = 0))
genes_to_label <- c("CCNE1", "SALL4", "TAL2", "HOXA13", "HOXD13", "SOX2", "ERBB2")
gtexDat6 <- gtexDat5 %>%
left_join(select(mmpOncFilt, ensembl_gene_id, Gene_Symbol), by=c("ensg" = "ensembl_gene_id")) %>%
mutate(idByOm = ifelse(is.na(Gene_Symbol), "No", "Yes"),
Gene_Symbol_label = ifelse(Gene_Symbol %in% genes_to_label,
Gene_Symbol, "")) %>%
filter(exprlvlbin == 1)
ggplot(gtexDat6, aes(x=idByOm, y=freq)) +
geom_violin() +
geom_jitter(width = 0.2, height=0, pch=21, alpha = 0.45, aes(fill=idByOm)) +
theme_classic() +
theme(text = element_text(size=20, color="black")) +
scale_fill_manual(values=c("white", "purple"), guide=FALSE) +
geom_text_repel(size=4, aes(label=Gene_Symbol_label)) +
xlab("Identified by oncomix") +
ylab("% of 53 GTEX tissue expressing the gene")
Supplementary Figure 3A
see whether the genes identified by oncomix are expressed in fewer tissues than the ones not identified by oncomix
gr1 <- gtexDat6 %>% filter(idByOm == "No") %>% pull(freq)
gr2 <- gtexDat6 %>% filter(idByOm == "Yes") %>% pull(freq)
t.test(gr1, gr2, alternative="two.sided")
##
## Welch Two Sample t-test
##
## data: gr1 and gr2
## t = 2.68, df = 11.601, p-value = 0.02055
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.05922785 0.58494771
## sample estimates:
## mean of x mean of y
## 0.8598236 0.5377358
Now, load the breast cancer somatic mutation data from TCGA.
if(file.exists(here("raw_data", "2018_01_20_somMut", "tcga-mut-mutect-brca.rds"))){
mut <- readRDS(here("raw_data", "2018_01_20_somMut", "tcga-mut-mutect-brca.rds"))
} else {
mutq <- GDCquery_Maf("BRCA", pipelines = "mutect")
#saveRDS(mutq, file=here("raw_data", "2018_01_20_somMut", "tcga-mut-mutect-brca.rds"))
}
mut.hiImpct <- mut %>%
filter(IMPACT %in% "HIGH" & Entrez_Gene_Id %in% cosDat$Entrez_GeneId)
top50mutGenes <- sort(names(head(sort(table(mut.hiImpct$Hugo_Symbol),decreasing = TRUE), 25)))
mut.hiImpct.top50 <- mut.hiImpct[mut.hiImpct$Hugo_Symbol %in% top50mutGenes,]
mut.hiImpct.top50.ptid <- names(table(mut.hiImpct.top50$Tumor_Sample_Barcode))
mutMatr <- matrix(0,ncol=length(mut.hiImpct.top50.ptid), nrow=length(mut.hiImpct.top50))
#which patients are associated with which gene mutations
#this is a list of patient TCGA ids
ptIdGeneMut <- sapply(top50mutGenes, function(x) unname(mut.hiImpct.top50[mut.hiImpct.top50$Hugo_Symbol %in% x,"Tumor_Sample_Barcode"]))
ptIdGeneMut.l <- lapply(ptIdGeneMut, function(x) mut.hiImpct.top50.ptid %in% x )
ptIdGeneMut.l.df <- do.call(rbind, ptIdGeneMut.l)
colnames(ptIdGeneMut.l.df) <- mut.hiImpct.top50.ptid
ptIdGeneMut.l.df <- ifelse(ptIdGeneMut.l.df, 1, 0)
mutDat <- SummarizedExperiment(assays=list(mutation = as.matrix(ptIdGeneMut.l.df)))
oncomix_gene_class <- dplyr::select(data.frame(colData(et)), starts_with("cl_"))
ptInCommon <- intersect(substr(rownames(oncomix_gene_class), 1,12), substr(colnames(assay(mutDat)), 1,12))
#create a histogram of mutation frequencies
mutFreq <- data.frame(freq=sort(rowSums(assay(mutDat[,sapply(ptInCommon, function(x) grep(x, colnames(mutDat)))])),decreasing = T))
mutFreq$gene <- rownames(mutFreq)
mutFreq$gene <- factor(mutFreq$gene, levels=mutFreq$gene)
ggplot(mutFreq, aes(x=gene, y=freq/length(ptInCommon))) + geom_bar(stat = "identity") + gg__bar + ylab(paste0("Observed Frequency (N=", length(ptInCommon), ")"))
#expression status
exprStatus.c <- clusterAndReorder(t(oncomix_gene_class)[,substr(rownames(oncomix_gene_class), 1,12) %in% ptInCommon], dimen = "both")
colnames(exprStatus.c) <- substr(colnames(exprStatus.c), 1,12)
exprStatus.c.m <- reshape2::melt(exprStatus.c)
#mutation status
mutDat.c <- clusterAndReorder(assay(mutDat)[,substr(colnames(assay(mutDat)), 1,12) %in% ptInCommon], dimen="both")
#remove rows that are all the same
mutDat.c <- mutDat.c[rowSums(mutDat.c) >= 2,]
colnames(mutDat.c) <- substr(colnames(mutDat.c), 1,12)
mutDat.c.m <- reshape2::melt(mutDat.c)
#bind hormone expression and mutation data together
mutAndExpr <- rbind(mutDat.c, exprStatus.c[,colnames(mutDat.c)])
mutAndExpr.c <- clusterAndReorder(mutAndExpr, dimen = "both")
mutAndExpr.c.m <- reshape2::melt(mutAndExpr.c)
mutAndExpr.c.m$Var2 <- factor(mutAndExpr.c.m$Var2, levels=colnames(mutAndExpr.c))
mutAndExpr.c.m$Var1 <- factor(mutAndExpr.c.m$Var1, levels=rownames(mutAndExpr.c))
mutAndExpr.c.m$Var1 = plyr::mapvalues(mutAndExpr.c.m$Var1, from = paste0("cl_", genes_gr2$ensg), to = genes_gr2$gene_symb)
mutAndExpr.c.m$mut0Expr1 <- ifelse(as.character(mutAndExpr.c.m$Var1) %in% genes_gr2$gene_symb, 1,0)
mutAndExpr.c.m$valMutExpr <- paste0(mutAndExpr.c.m$value, mutAndExpr.c.m$mut0Expr1)
Are there associations between oncogene/tumor suppressor mutations and overexpression of oncogenes? Use fisher’s exact test to look at relationship between each NHR and each mutation (50x6 matrix). Our 2 matrices are mutDat.c and exprStatus.c.sub.
exprStatus.c.sub <- exprStatus.c[,colnames(mutDat.c)]
rownames(exprStatus.c.sub) <- genes_gr2$gene_symb[sapply(substr(rownames(exprStatus.c.sub), 4,18), grep,genes_gr2$ensg)]
orDat <- array(data = 0, dim = c(nrow(mutDat.c), nrow(exprStatus.c.sub), 2), dimnames = list(muts=rownames(mutDat.c), expr=rownames(exprStatus.c.sub), oddsRat=c("pValue", "OR")))
for(i in 1:nrow(mutDat.c)){
for(j in 1:nrow(exprStatus.c.sub)){
d1 <- c(mutDat.c[i,])
d2 <- c(exprStatus.c.sub[j,])
if(length(unique(d1)) == 1 | length(unique(d2)) == 1){
next
}
res <- fisher.test(x = d1, d2)
orDat[i,j,1] <- res$p.value
orDat[i,j,2] <- res$estimate
}
}
orDat.adj <- matrix(p.adjust(as.vector(as.matrix(orDat[,,1])), method='BH'),ncol=ncol(orDat[,,1]))
orDat <- abind::abind(orDat, adjPval=orDat.adj, along = 3)
orDat.m <- reshape2::melt(orDat)
colnames(orDat.m) <- c("mut", 'nhr', "pval_or", "value")
orDat.m$sigStar <- sig2star(orDat.m$value)
orDat.m$nhr <- factor(orDat.m$nhr, levels=genes_gr$gene_symb)
orDat.m$mut <- factor(orDat.m$mut, levels=rev(mutFreq$gene))
#these are the top 3
subset(orDat.m,pval_or=="adjPval") %>% arrange(value) %>% head(3)
## mut nhr pval_or value sigStar
## 1 TP53 CBX2 adjPval 0.05284218
## 2 MAP3K1 NELL2 adjPval 0.54935504
## 3 MAP3K1 EPYC adjPval 0.54935504
ggplot(subset(orDat.m,pval_or=="OR"), aes(y=mut, x=nhr)) +
geom_tile(aes(fill=log(value,10))) +
theme_classic() +
theme(text = element_text(size=16), axis.ticks=element_blank(),
axis.line = element_blank(),
axis.text.x = element_text(angle=90, hjust = 1, vjust=0.5)) +
scale_fill_gradient2(name="log10(Odds Ratio)") +
geom_text(data=subset(orDat.m,pval_or=="adjPval"), aes(label=sigStar)) +
coord_fixed(ratio = 1) + ylab("High-impact mutation") + xlab("")
Objective: Read in, plot, and perform statistical analysis for CBX2 growth curve data
paired <- brewer.pal(6, "Paired")
cbx2 <- read_excel(path = here::here("raw_data", "2017_11_26_CBX2_growth_curve", "CBX2.xlsx"), sheet = "Sheet2")
cbx2$day <- substr(cbx2$day, start = 4, stop = 5)
cbx2.cons <- subset(cbx2, expt_numb %in% 4:6)
cbx2.cons.m <- reshape2::melt(cbx2.cons, id.vars=c("replicate_numb", "day", "expt_numb", "floating"))
cbx2.cons.m$day <- as.numeric(cbx2.cons.m$day)
ggplot(subset(cbx2.cons.m, subset = floating == 0 ), aes(x=day, y=log(value,10), color=variable)) + geom_smooth(size=2) + geom_jitter(position=position_dodge(width=0.4), alpha=0.5, size=4, pch=21, aes(group=variable, fill=variable), color="black", stroke = 2) + theme_classic() + scale_fill_manual(values=paired[c(1,3)]) + scale_color_manual(values=paired[c(1,3)], name=" siRNA") + xlab("Day")+ ylab(bquote('log'[10]* '(cell count)')) + theme(axis.text=element_text(size=16), axis.title=element_text(size=16), plot.title = element_text(size=16)) + ggtitle("siRNA KD: Adherent MCF7, day 3-7") + scale_x_continuous(breaks=c(0,3:7))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(subset(cbx2.cons.m, subset = floating == 1 & day > 0), aes(x=day, y=log(value, 10), color=variable)) + geom_smooth(size=2) + geom_jitter(position=position_dodge(width=0.4), alpha=0.5, size=4, pch=21, aes(group=variable, fill=variable), color="black", stroke = 2) + theme_classic() + scale_fill_manual(values=paired[c(1,3)]) + scale_color_manual(values=paired[c(1,3)], name=" siRNA") + xlab("Day")+ ylab(bquote('log'[10]* '(cell count)')) + theme(axis.text=element_text(size=16), axis.title =element_text(size=16), plot.title = element_text(size=16)) + ggtitle("siRNA KD: Floating MCF7, day 3-7") + scale_x_continuous(breaks=c(0,3:7), limits = c(0,7.1))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
now, perform a 3-way ANOVA to check for differences in growth rate as a function of siRNA (either control or CBX2 knockdown)
cbx2.cons.m.log2 <- subset(cbx2.cons.m, subset = floating == 0 & day > 0)
cbx2.cons.m.log2.float <- subset(cbx2.cons.m, subset = floating == 1 & day > 0)
cbx2.cons.m.log2.expt5_7 <- cbx2.cons.m.log2 %>%
filter(expt_numb %in% 4:6) %>%
mutate(variable = factor(variable)) %>%
mutate(expt_numb = factor(expt_numb)) %>%
mutate(day = factor(day, ordered = TRUE)) %>%
dplyr::rename(siRNA=variable, cell_counts = value)%>%
mutate(cell_counts_log = log(cell_counts, 10))
aov_res_log10 <- aov(cell_counts_log ~ siRNA + expt_numb + day, data = cbx2.cons.m.log2.expt5_7)
summary(aov_res_log10)
## Df Sum Sq Mean Sq F value Pr(>F)
## siRNA 1 0.1387 0.1387 31.855 6.98e-07 ***
## expt_numb 2 0.0804 0.0402 9.235 0.00037 ***
## day 4 1.6211 0.4053 93.105 < 2e-16 ***
## Residuals 52 0.2263 0.0044
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#total d.f. is # of samples - 1
## now check the floating cells
cbx2.cons.m.log2.float.expt5_7 <- cbx2.cons.m.log2.float %>%
filter(expt_numb %in% 4:6) %>%
mutate(variable = factor(variable)) %>%
mutate(expt_numb = factor(expt_numb)) %>%
mutate(day = factor(day, ordered = TRUE)) %>%
dplyr::rename(siRNA=variable, cell_counts = value)%>%
mutate(cell_counts_log = log(cell_counts, 10))
aov_res_log10.f <- aov(cell_counts_log ~ siRNA + expt_numb + day, data = cbx2.cons.m.log2.float.expt5_7)
summary(aov_res_log10.f)
## Df Sum Sq Mean Sq F value Pr(>F)
## siRNA 1 0.1232 0.1232 3.159 0.0813 .
## expt_numb 2 0.3797 0.1898 4.870 0.0115 *
## day 4 2.4220 0.6055 15.531 2.02e-08 ***
## Residuals 52 2.0272 0.0390
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#calculate the average fold change between each day/condition
#adherent
fold.change.adh <- cbx2.cons.m.log2.expt5_7 %>%
group_by(day, siRNA) %>%
select(day, siRNA, cell_counts) %>%
summarize(mean_cell_count = mean(cell_counts)) %>%
spread(siRNA, mean_cell_count) %>%
mutate(fold_change_ctrl_cbx2 = (si_control -si_CBX2) /si_CBX2)
fold.change.adh
## # A tibble: 5 x 4
## # Groups: day [5]
## day si_control si_CBX2 fold_change_ctrl_cbx2
## <ord> <dbl> <dbl> <dbl>
## 1 3 105583 91833 0.150
## 2 4 203000 156250 0.299
## 3 5 257583 206333 0.248
## 4 6 280417 232833 0.204
## 5 7 354167 250500 0.414
fold.change.adh.overall <- fold.change.adh %>% ungroup() %>%
summarise(mean_overall_fc = mean(fold_change_ctrl_cbx2))
fold.change.adh.overall
## # A tibble: 1 x 1
## mean_overall_fc
## <dbl>
## 1 0.263
# si_control/si_CBX2)
#group_by(day, siRNA) %>%
#floating
fold.change.float <- cbx2.cons.m.log2.float.expt5_7 %>%
group_by(day, siRNA) %>%
select(day, siRNA, cell_counts) %>%
summarize(mean_cell_count = mean(cell_counts)) %>%
spread(siRNA, mean_cell_count) %>%
mutate(fold_change_ctrl_cbx2 = (si_control -si_CBX2) /si_CBX2)
fold.change.float
## # A tibble: 5 x 4
## # Groups: day [5]
## day si_control si_CBX2 fold_change_ctrl_cbx2
## <ord> <dbl> <dbl> <dbl>
## 1 3 5167 5667 -0.0882
## 2 4 12167 14167 -0.141
## 3 5 17667 15333 0.152
## 4 6 23167 16167 0.433
## 5 7 30650 13897 1.21
fold.change.float.overall <- fold.change.float %>% ungroup() %>%
summarise(mean_overall_fc = mean(fold_change_ctrl_cbx2))
fold.change.float.overall
## # A tibble: 1 x 1
## mean_overall_fc
## <dbl>
## 1 0.312
2018-10: Make changes suggested by reviewers
is there overlap between the genes we identified and mammaprint or oncomix genes?
mammaprint <- c("AA555029_RC", "ALDH4A1", "AP2B1", "AYTL2", "BBC3", "C16orf61", "C20orf46", "C9orf30", "CCNE2", "CDC42BPA", "CDCA7", "CENPA", "COL4A2", "DCK", "DIAPH3", "DTL", "EBF4", "ECT2", "EGLN1", "ESM1", "EXT1", "FGF18", "FLT1", "GMPS", "GNAZ", "GPR126", "GPR180", "GSTM3", "HRASLS", "IGFBP5", "JHDM1D", "KNTC2", "LGP2", "LIN9", "LOC100131053", "LOC100288906", "LOC730018", "MCM6", "MELK", "MMP9", "MS4A7", "MTDH", "NMU", "NUSAP1", "ORC6L", "OXCT1", "PALM2", "PECI", "PITRM1", "PRC1", "QSCN6L1", "RAB6B", "RASSF7", "RECQL5", "RFC4", "RTN4RL1", "RUNDC1", "SCUBE2", "SERF1A", "SLC2A3", "STK32B", "TGFB3", "TSPYL5", "UCHL5", "WISP1", "ZNF533")
oncotypedx <- c("MKI67", "AURKA", "BIRC5", "CCNB1", "MYBL2", "ERBB2", "GRB7", "ESR1", "PGR", "BCL2", "SCUBE2", "MMP11", "CTSL2", "GSTM1", "CD68", "BAG1", "ACTB", "GAPDH", "RPLPO", "GUS", "TFRC")
aims <- c("ABAT","ACADSB","ACTG2","ANXA3","APH1B","AR","ASH2L","ASPM","ATP2C2","BCL11A","BCL2","BIRC5","C1orf106","C5orf30","C6orf211","CA12","CAV1","CCNB2","CDC20","CDCA8","CDH3","CDKN1C","CDKN3","CDO1","CELSR1","CENPA","CENPF","CEP55","CIRBP","CKS2","CNIH4","CNN1","COL17A1","CRYAB","CSRP2","CSTB","CX3CL1","DNAJC12","DNALI1","DTL","ECSIT","EPHX2","EPN3","ERBB2","ESR1","FA2H","FAM134B","FAM49B","FBP1","FGFR4","FLNB","FMO5","FOXA1","FOXC1","GABRP","GAMT","GARS","GATA3","GFRA1","GRB7","GSN","GSTP1","HOXA5","HPN","HSPA14","ID4","IGBP1","IGF1","IGFBP6","IRS1","ITM2A","KIAA0020","KIAA0196","KIF13B","KIF2C","KIT","KRT14","KRT17","KRT18","KRT5","KYNU","LAMA3","LAPTM4B","LBR","LDHB","LumA","LumB","LYN","MAD2L1","MAP2K4","MAPT","MCM2","MELK","MEOX1","MFAP4","MKL2","MLF1IP","MLPH","MMP7","MNAT1","MYO10","NAT1","NDC80","NDRG2","NEK2","NEK4","NFIB","NormL","NPY1R","NQO1","PARP1","PBK","PCNA","PPAP2B","PRC1","PRKX","PSMD12","PTN","PTTG1","RACGAP1","RARRES1","RBBP8","RFC4","RRM2","S100A8","SAE1","SCUBE2","SDC1","SERPINA3","SFRP1","SHC2","SLC39A6","SOBP","SPDEF","STARD3","STC2","SYTL2","TCEAL1","TFCP2L1","TFF3","TIMELESS","TK1","TNFRSF21","TOP2A","TRIM29","TSHZ2","TSPAN1","TSPAN7","TTC12","TTK","TYMS","UBE2C","UBE2E3","VWFABAT","VWF")
aims_genes <- aims_subtypes$EntrezID.used
#map to gene symbols
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
attributes = listAttributes(ensembl)
bm_aims <- getBM(attributes=c('external_gene_name', 'entrezgene'),
filters = 'entrezgene',
values = aims_genes,
mart = ensembl)
intersect(geneIdsTop5, mammaprint)
## character(0)
intersect(geneIdsTop5, oncotypedx)
## character(0)
intersect(geneIdsTop5, bm_aims$external_gene_name)
## character(0)
#aims_subtypes$EntrezID.used
Is the CBX2 association with clinical outcome significant within the individual intrinsic subtypes? How do the patients with luminal A and B cancers with CBX2 over expression fare? Do they have a worse clinical outcome to other luminal A/B cancers with normal CBX2 expression?
To address this question, we performed kaplan meier survival analysis for each of the 5 known subtypes with at least 3 tumor samples per group. We did this for both the original 110 as well as for the entire dataset.
We start by preparing the entire dataset
rowData(e) = dplyr::left_join(data.frame(rowData(e)), ensg_to_prot_filt, by = c("ensembl_gene_id" = "Gene.stable.ID"))
aims_subtypes_all <- applyAIMS(assay(e),rowData(e)$NCBI.gene.ID)
## Current k = 20
e_cbx2_all$aims_subtypes <- aims_subtypes_all$cl[,1][colnames(e_cbx2_all)] #add aims_subtypes to this
#colData(e)$aims_subtypes <- aims_subtypes_all$cl[,1]
#e_cbx2_all replaces et_all
et
dataset (110 samples)et$years_last_fu_5yr <- et$days_last_fu_5yr/365.25
brca_subtypes <- unique(et$aims_subtypes)
subtype_signif <- rep(NA, length(brca_subtypes))
for(st in 1:length(brca_subtypes)){
subt <- brca_subtypes[st]
et.st <- et[,et$aims_subtypes == subt]
if(table(et.st$cl_ENSG00000173894) %>% length() == 1){
subtype_signif[st] <- NA
next
}
fit <- survfit(Surv(et.st$years_last_fu_5yr, as.numeric(et.st$vital_stat_5yr)) ~ as.factor(et.st$cl_ENSG00000173894), data = colData(et.st))
fit.stat <- survival::survdiff(Surv(et.st$years_last_fu_5yr, as.numeric(et.st$vital_stat_5yr)) ~ as.factor(et.st$cl_ENSG00000173894))
pval2 <- pvalFxn(fit.stat)
subtype_signif[st] <- pval2
}
names(subtype_signif) <- brca_subtypes
subtype_signif
## Basal LumA LumB Her2 Normal
## 0.9837661 0.3215067 0.1020868 0.5228596 NA
## Basal LumA LumB Her2 Normal
## 0.9837661 0.3215067 0.1020868 0.5228596 NA
p.adjust(subtype_signif, method="BH")
## Basal LumA LumB Her2 Normal
## 0.9837661 0.6430135 0.4083473 0.6971461 NA
## Basal LumA LumB Her2 Normal
## 0.9837661 0.6430135 0.4083473 0.6971461 NA
e_cbx2_all
dataset (1084 samples with survival data)### Nov 3, 2018
brca_subtypes_all <- unique(e_cbx2_all$aims_subtypes)
subtype_signif_all <- rep(NA, length(brca_subtypes))
for(st in 1:length(brca_subtypes_all)){
subt <- brca_subtypes[st]
et.st <- e_cbx2_all[,e_cbx2_all$aims_subtypes == subt]
if(table(et.st$cbx2Class) %>% length() == 1){
subtype_signif[st] <- NA
next
}
fit <- survfit(Surv(et.st$years_last_fu_5yr, as.numeric(et.st$vital_stat_5yr)) ~ as.factor(et.st$cbx2Class), data = colData(et.st))
fit.stat <- survival::survdiff(Surv(et.st$years_last_fu_5yr, as.numeric(et.st$vital_stat_5yr)) ~ as.factor(et.st$cbx2Class))
pval2 <- pvalFxn(fit.stat)
subtype_signif_all[st] <- pval2
print(subt)
print(fit.stat)
}
## [1] "Basal"
## Call:
## survival::survdiff(formula = Surv(et.st$years_last_fu_5yr, as.numeric(et.st$vital_stat_5yr)) ~
## as.factor(et.st$cbx2Class))
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## as.factor(et.st$cbx2Class)=0 25 3 2.71 0.03009 0.034
## as.factor(et.st$cbx2Class)=1 174 21 21.29 0.00384 0.034
##
## Chisq= 0 on 1 degrees of freedom, p= 0.854
## [1] "LumA"
## Call:
## survival::survdiff(formula = Surv(et.st$years_last_fu_5yr, as.numeric(et.st$vital_stat_5yr)) ~
## as.factor(et.st$cbx2Class))
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## as.factor(et.st$cbx2Class)=0 305 17 15.82 0.0879 0.727
## as.factor(et.st$cbx2Class)=1 38 1 2.18 0.6382 0.727
##
## Chisq= 0.7 on 1 degrees of freedom, p= 0.394
## [1] "LumB"
## Call:
## survival::survdiff(formula = Surv(et.st$years_last_fu_5yr, as.numeric(et.st$vital_stat_5yr)) ~
## as.factor(et.st$cbx2Class))
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## as.factor(et.st$cbx2Class)=0 194 12 16.98 1.46 4.21
## as.factor(et.st$cbx2Class)=1 98 14 9.02 2.75 4.21
##
## Chisq= 4.2 on 1 degrees of freedom, p= 0.0402
## [1] "Her2"
## Call:
## survival::survdiff(formula = Surv(et.st$years_last_fu_5yr, as.numeric(et.st$vital_stat_5yr)) ~
## as.factor(et.st$cbx2Class))
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## as.factor(et.st$cbx2Class)=0 50 10 7.86 0.582 0.823
## as.factor(et.st$cbx2Class)=1 133 17 19.14 0.239 0.823
##
## Chisq= 0.8 on 1 degrees of freedom, p= 0.364
## [1] "Normal"
## Call:
## survival::survdiff(formula = Surv(et.st$years_last_fu_5yr, as.numeric(et.st$vital_stat_5yr)) ~
## as.factor(et.st$cbx2Class))
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## as.factor(et.st$cbx2Class)=0 61 6 5.897 0.00181 0.108
## as.factor(et.st$cbx2Class)=1 6 0 0.103 0.10345 0.108
##
## Chisq= 0.1 on 1 degrees of freedom, p= 0.743
names(subtype_signif_all) <- brca_subtypes_all
subtype_signif_all
## LumA Her2 Basal LumB Normal
## 0.85368790 0.39387430 0.04019842 0.36434383 0.74288516
## LumA Her2 Basal LumB Normal
## 0.42784046 0.36434383 0.85368790 0.03889948 0.74288516
p.adjust(subtype_signif_all, method="BH")
## LumA Her2 Basal LumB Normal
## 0.8536879 0.6564572 0.2009921 0.6564572 0.8536879
## LumA Her2 Basal LumB Normal
## 0.7130674 0.7130674 0.8536879 0.1944974 0.8536879
mmp_tbl <- mmp %>% rownames_to_column("ensgID") %>% as_tibble() #filter()
ensg_oexpr <- c("ENSG00000136997" = "MYC", "ENSG00000110092" = "CCND1", "ENSG00000077782" = "FGFR1", "ENSG00000066468" = "FGFR2") #"ENSG00000105173" = "CCNE1"
mmp_tbl %>% filter(ensgID %in% names(ensg_oexpr))
## # A tibble: 4 x 13
## ensgID nMu1 nMu2 nVar nPi1 tMu1 tMu2 tVar tPi1 deltaMu2
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSG00000110… 7.48 7.77 0.589 0.377 5.02 8.35 1.06 0.0433 0.577
## 2 ENSG00000077… 6.37 6.48 0.180 0.402 4.88 5.31 1.13 0.561 -1.17
## 3 ENSG00000136… 6.08 8.21 0.466 0.0578 6.13 7.03 1.16 0.399 -1.18
## 4 ENSG00000066… 2.58 5.33 0.552 0.118 3.99 4.73 1.62 0.499 -0.598
## # ... with 3 more variables: deltaMu1 <dbl>, SI <dbl>, score <dbl>
mmp_tbl %>% filter(ensgID %in% names(ensg_oexpr)) %>% select(-ensgID) %>% colMeans
## nMu1 nMu2 nVar nPi1 tMu1 tMu2
## 5.62781582 6.94848218 0.44669542 0.23875939 5.00285769 6.35768709
## tVar tPi1 deltaMu2 deltaMu1 SI score
## 1.24372793 0.37562482 -0.59079509 -0.62495813 0.08636364 -0.24993485
## nMu1 nMu2 nVar nPi1 tMu1 tMu2
## 4.15867522 6.08978928 0.33359595 0.31023367 4.52041263 6.01810348
## tVar tPi1 deltaMu2 deltaMu1 SI score
## 1.15981495 0.46534080 -0.07168581 0.36173741 0.27142857 -0.23342206
mmp2 %>% dplyr::filter(geneNames %in% geneIdsTop5) %>% select(-geneNames) %>% colMeans
## nMu1 nMu2 nVar nPi1 tMu1 tMu2 tVar
## 1.1387505 1.3839667 0.1979722 0.6066698 1.6940005 4.5215614 0.8482769
## tPi1 deltaMu2 deltaMu1 SI score
## 0.6265991 3.1375947 0.5552500 0.9963636 1.5307278
## nMu1 nMu2 nVar nPi1 tMu1 tMu2
## 0.9038492 1.1136966 0.1742516 0.5999572 1.4754776 4.1858719
## tVar tPi1 deltaMu2 deltaMu1 SI score
## 0.7720942 0.6224035 3.0721753 0.5716283 0.9964286 1.5488232
#calculate the mean of the normal expression values from each group
en_tbl <- assay(en) %>% data.frame %>% rownames_to_column("ensg") %>% as_tibble()
en_tbl %>% filter(ensg %in% names(geneIdsTop5)) %>% select(-ensg) %>% rowMeans() %>% mean() #expression of oncomix genes in adj nml tissue
## [1] 1.227439
en_tbl %>% filter(ensg %in% names(ensg_oexpr)) %>% select(-ensg) %>% rowMeans() %>% mean() # expression of genes mentioned by reviewers in adj nml tissue
## [1] 6.797504
Download and filter data from lung cancer
## eq <- GDCquery(project = "TCGA-LUAD",
## workflow.type = "HTSeq - FPKM",
## data.category = c("Transcriptome Profiling"),
## data.type = c("Gene Expression Quantification"))
## GDCdownload(eq)
## e <- GDCprepare(eq, save=FALSE)
## saveRDS(e, file=here("processed_data", "tcga-exp-luad.rds"))
e_luad <- readRDS(here("processed_data", "tcga-exp-luad.rds"))
Download and filter data from endometrial cancer
## eq <- GDCquery(project = "TCGA-UCEC",
## workflow.type = "HTSeq - FPKM",
## data.category = c("Transcriptome Profiling"),
## data.type = c("Gene Expression Quantification"))
## GDCdownload(eq)
## e <- GDCprepare(eq, save=FALSE)
## saveRDS(e, file=here("processed_data", "tcga-exp-ucec-2.rds"))
e_ucec <- readRDS(here("processed_data", "tcga-exp-ucec-2.rds")) #most recent version, downloaded Nov 3, 2018
Download and filter data from prostate cancer
## eq <- GDCquery(project = "TCGA-PRAD",
## workflow.type = "HTSeq - FPKM",
## data.category = c("Transcriptome Profiling"),
## data.type = c("Gene Expression Quantification"))
## GDCdownload(eq)
## e <- GDCprepare(eq, save=FALSE)
## saveRDS(e, file=here("processed_data", "tcga-exp-prad.rds"))
e_prad <- readRDS(here("processed_data", "tcga-exp-prad.rds")) #most recent version, downloaded Nov 3, 2018
Process for cleaning dataset
#remove FFPE samples
e_luad_temp <- clean_dataset(e_luad, ensg_to_prot)
## [1] 19090 513
## [1] 19090 59
## [1] 16336 512
## [1] 16333 59
e_ucec_temp <- clean_dataset(e_ucec, ensg_to_prot)
## [1] 19090 543
## [1] 19090 35
## [1] 15883 543
## [1] 16208 35
e_prad_temp <- clean_dataset(e_prad, ensg_to_prot)
## [1] 19090 495
## [1] 19090 52
## [1] 16407 495
## [1] 16599 52
e_luad_n <- e_luad_temp$en
e_luad_t <- e_luad_temp$et
e_ucec_n <- e_ucec_temp$en
e_ucec_t <- e_ucec_temp$et
e_prad_n <- e_prad_temp$en
e_prad_t <- e_prad_temp$et
rm(e_luad_temp, e_ucec_temp, e_prad_temp)
Now, apply oncomix to these datasets
if(file.exists(here("processed_data", "mmp_luad.rds"))){
mmp_luad <- readRDS(file = here("processed_data", "mmp_luad.rds"))
} else {
mmp_luad <- mixModelParams(exprNml = assay(e_luad_n), exprTum = assay(e_luad_t))
#saveRDS(mmp_luad, file = here("processed_data", "mmp_luad.rds"))
}
if(file.exists(here("processed_data", "mmp_ucec.rds"))){
mmp_ucec <- readRDS(file = here("processed_data", "mmp_ucec.rds"))
} else {
mmp_ucec <- mixModelParams(exprNml = assay(e_ucec_n), exprTum = assay(e_ucec_t))
#saveRDS(mmp_ucec, file = here("processed_data", "mmp_ucec.rds"))
}
if(file.exists(here("processed_data", "mmp_prad.rds"))){
mmp_prad <- readRDS(file = here("processed_data", "mmp_prad.rds"))
} else {
mmp_prad <- mixModelParams(exprNml = assay(e_prad_n), exprTum = assay(e_prad_t))
#saveRDS(mmp_prad, file = here("processed_data", "mmp_prad.rds"))
}
#can also try this with the same genes as we found in the breast cancer dataset
make plots (top5 from each cancer type, also show CBX2 distribution). Make a table of the characteristics of these genes. Conclusion is that we can identify viable oncogene candidates in other cancer types using oncomix.
mmp_luad_filt <- filt_mmp_pi(mmp_luad, pi_min=0.2)
mmp_ucec_filt <- filt_mmp_pi(mmp_ucec, pi_min=0.2)
mmp_prad_filt <- filt_mmp_pi(mmp_prad, pi_min=0.2)
luad_top5 <- filt_genes_pi_si(mmp_luad, pi_min=0.2, si=0.99) %>% head(5)
ucec_top5 <- filt_genes_pi_si(mmp_ucec, pi_min=0.2, si=0.99) %>% head(5)
prad_top5 <- filt_genes_pi_si(mmp_prad, pi_min=0.2, si=0.99) %>% head(5)
#top5genes #breast cancer #geneIdsTop5
selIdx_thresh
. We can also show the genes both before and after the selectivity index (ie low SI vs high SI).gglayers = list(
theme(axis.text.x = element_text(size=30),
plot.title=element_text(size=30)),
xlim(-0.2,10), xlab(""), ylab("")
)
dict_top5_mmp <- list(luad = list(luad_top5, mmp_luad, mmp_luad_filt, e_luad_n, e_luad_t), ucec = list(ucec_top5, mmp_ucec, mmp_ucec_filt, e_ucec_n, e_ucec_t), prad = list(prad_top5, mmp_prad, mmp_prad_filt, e_prad_n, e_prad_t), brca = list(top5genes, mmp, mmp2, en, et))
for(item in 1:length(dict_top5_mmp)){
dict_item <- dict_top5_mmp[[item]]
top5genes_temp <- dict_item[[1]]
mmp_temp <- dict_item[[2]]
mmp_filt_temp <- dict_item[[3]]
tumor_type <- names(dict_top5_mmp)[item]
exp_nml <- dict_item[[4]]
exp_tum <- dict_item[[5]]
#show the distribution of oncomix scores across genes
plotOncomixScores(mmp_dat=mmp_filt_temp, si=0.99, alph=0.8)
ggsave(filename = here("output", paste0(Sys.Date(),"_", tumor_type,"_oncomix_scores.png")),
width = 7, height=5, units = "in")
#plot CBX2 for each gene
gglayers[["title"]] <- ggtitle("CBX2")
p1 <- plotGeneHistUD(mmp_temp, assay(exp_nml), assay(exp_tum), "ENSG00000173894", linesz=1.5, gglayers)
ggsave(filename = here("output", paste0(Sys.Date(),"_", tumor_type,"_cbx2.png")),
plot = p1, width = 7, height=5, units = "in")
for(i in 1:length(top5genes_temp)){
hgnc_gene_symb <- ensg_to_prot %>%
dplyr::filter(Gene.stable.ID == top5genes_temp[i]) %>%
dplyr::select(HGNC.symbol) %>%
as.character()
gglayers[["title"]] <- ggtitle(hgnc_gene_symb)
plotGeneHistUD(mmp_temp, assay(exp_nml), assay(exp_tum), top5genes_temp[i], linesz=1.5, gglayers)
ggsave(filename = here("output", paste0(Sys.Date(),"_",top5genes_temp[i], "_",tumor_type,"_histog.png")),
width = 7, height=5, units = "in")
}
}
#mmp_luad[luad_top5,]
hgnc_gene_symb_luad <- ensg_to_prot %>%
dplyr::filter(Gene.stable.ID %in% luad_top5)
hgnc_gene_symb_ucec <- ensg_to_prot %>%
dplyr::filter(Gene.stable.ID %in% ucec_top5)
hgnc_gene_symb_prad <- ensg_to_prot %>%
dplyr::filter(Gene.stable.ID %in% prad_top5)
mmp_luad_filt[luad_top5,]
## nMu1 nMu2 nVar nPi1 tMu1
## ENSG00000187134 3.6581369 3.9090573 0.23802598 0.5604862 2.5369871
## ENSG00000104760 0.3413745 0.3993426 0.21713352 0.5239677 1.4728785
## ENSG00000143512 0.1979368 0.2325604 0.03464972 0.5659663 0.9522066
## ENSG00000074211 0.1662333 0.1949561 0.06807578 0.5279768 0.5582768
## ENSG00000203685 0.6825409 1.2098345 0.04345076 0.6649862 0.5615161
## tMu2 tVar tPi1 deltaMu2 deltaMu1 SI
## ENSG00000187134 8.290974 1.9164708 0.7568101 4.381917 -1.1211497 1
## ENSG00000104760 6.931749 2.1473224 0.7424024 6.532406 1.1315039 1
## ENSG00000143512 4.804650 0.8475958 0.7194521 4.572089 0.7542697 1
## ENSG00000074211 3.842905 0.5562381 0.7190085 3.647949 0.3920435 1
## ENSG00000203685 4.048130 0.4560578 0.7993966 2.838296 -0.1210248 1
## score
## ENSG00000187134 3.348570
## ENSG00000104760 3.036447
## ENSG00000143512 2.935574
## ENSG00000074211 2.631592
## ENSG00000203685 2.459812
mmp_ucec_filt[ucec_top5,]
## nMu1 nMu2 nVar nPi1 tMu1
## ENSG00000144407 0.3349394 1.302873 0.080686771 0.7143399 0.3596356
## ENSG00000105464 0.5494817 1.219765 0.024937659 0.7977214 1.6878711
## ENSG00000171956 0.2262360 1.472682 0.048900742 0.7739908 0.4142403
## ENSG00000159263 0.8911819 1.294689 0.026862794 0.4367342 1.1912037
## ENSG00000124140 0.3211505 0.617614 0.009385667 0.7367469 0.6706597
## tMu2 tVar tPi1 deltaMu2 deltaMu1 SI
## ENSG00000144407 3.945496 0.3797388 0.7581404 2.642623 0.02469619 1
## ENSG00000105464 4.819607 0.7371933 0.4965516 3.599842 1.13838940 1
## ENSG00000171956 3.894673 0.5092749 0.7589731 2.421991 0.18800426 1
## ENSG00000159263 3.606586 0.5671638 0.4983658 2.311897 0.30002186 1
## ENSG00000124140 2.316188 0.2271221 0.7722145 1.698574 0.34950917 1
## score
## ENSG00000144407 2.157501
## ENSG00000105464 1.699321
## ENSG00000171956 1.675811
## ENSG00000159263 1.417848
## ENSG00000124140 1.112557
mmp_prad_filt[prad_top5,]
## nMu1 nMu2 nVar nPi1 tMu1 tMu2
## ENSG00000103111 5.0004520 5.947607 0.13591512 0.5119697 5.7448699 8.314026
## ENSG00000127588 0.3401660 1.583877 0.09411245 0.7751049 1.2229182 3.648904
## ENSG00000197558 0.3785834 1.058948 0.03467521 0.7919055 0.9103318 2.115098
## ENSG00000141696 4.5620935 4.940011 0.05277128 0.3735454 5.1985208 6.011682
## ENSG00000142544 2.1021616 2.387614 0.15905386 0.4006381 2.9541089 3.729702
## tVar tPi1 deltaMu2 deltaMu1 SI score
## ENSG00000103111 0.6586718 0.7671649 2.366419 0.7444179 1 0.8274142
## ENSG00000127588 0.5589943 0.7685654 2.065027 0.8827522 1 0.5291679
## ENSG00000197558 0.1851788 0.7901408 1.056150 0.5317484 1 0.3045476
## ENSG00000141696 0.2186459 0.7998280 1.071671 0.6364273 1 0.1638264
## ENSG00000142544 0.1719460 0.6542453 1.342088 0.8519473 1 0.1591408
# How many genes in each group? (SI > and < 0.99)
dim(subset(mmp_luad_filt, SI<.99))
## [1] 3868 12
dim(subset(mmp_luad_filt, SI>.99))
## [1] 581 12
dim(subset(mmp_prad_filt, SI<.99))
## [1] 3241 12
dim(subset(mmp_prad_filt, SI>.99))
## [1] 20 12
dim(subset(mmp_ucec_filt, SI<.99))
## [1] 4233 12
dim(subset(mmp_ucec_filt, SI>.99))
## [1] 224 12
dim(subset(mmp2, SI<.99))
## [1] 3669 13
dim(subset(mmp2, SI>.99))
## [1] 154 13
What’s the CBX2 rank across the 3 cancer types?
#mmp_luad %>% rownames_to_column("ensg") %>% mutate(rank=1:n()) %>% filter(ensg == "ENSG00000173894")
mmp_luad_filt %>% rownames_to_column("ensg") %>% filter(SI > 0.99) %>% mutate(rank=1:n()) %>% filter(ensg == "ENSG00000173894")
## ensg nMu1 nMu2 nVar nPi1 tMu1 tMu2
## 1 ENSG00000173894 1.255722 2.235828 0.1322124 0.6746648 2.558439 4.754407
## tVar tPi1 deltaMu2 deltaMu1 SI score rank
## 1 0.8431715 0.7306803 2.518579 1.302717 1 0.2404778 189
mmp_luad_filt %>% rownames_to_column("ensg") %>% filter(SI > 0.99) %>% mutate(rank=1:n()) %>% dim() #nrows
## [1] 581 14
#mmp_ucec %>% rownames_to_column("ensg") %>% mutate(rank=1:n()) %>% filter(ensg == "ENSG00000173894")
mmp_ucec_filt %>% rownames_to_column("ensg") %>% filter(SI > 0.99) %>% mutate(rank=1:n()) %>% filter(ensg == "ENSG00000173894") #189
## ensg nMu1 nMu2 nVar nPi1 tMu1 tMu2
## 1 ENSG00000173894 2.152284 2.966645 0.1418946 0.643624 2.673769 4.888474
## tVar tPi1 deltaMu2 deltaMu1 SI score rank
## 1 0.7687237 0.2148887 1.921829 0.5214852 1 0.4897253 28
mmp_ucec_filt %>% rownames_to_column("ensg") %>% filter(SI > 0.99) %>% mutate(rank=1:n()) %>% dim() #3868
## [1] 224 14
#mmp_prad %>% rownames_to_column("ensg") %>% mutate(rank=1:n()) %>% filter(ensg == "ENSG00000173894")
mmp_prad_filt %>% rownames_to_column("ensg") %>% filter(SI > 0.99) %>% mutate(rank=1:n()) %>% filter(ensg == "ENSG00000173894")
## [1] ensg nMu1 nMu2 nVar nPi1 tMu1 tMu2
## [8] tVar tPi1 deltaMu2 deltaMu1 SI score rank
## <0 rows> (or 0-length row.names)
mmp_prad_filt %>% rownames_to_column("ensg") %>% filter(SI > 0.99) %>% mutate(rank=1:n()) %>% dim() #20
## [1] 20 14
#mmp %>% rownames_to_column("ensg") %>% mutate(rank=1:n()) %>% filter(ensg == "ENSG00000173894") #404 / 15638;
mmp2 %>% rownames_to_column("ensg") %>% filter(SI > 0.99) %>% mutate(rank=1:n()) %>% filter(ensg == "CBX2") #3 / 139
## ensg nMu1 nMu2 nVar nPi1 tMu1 tMu2 tVar
## 1 CBX2 1.882826 2.346459 0.165679 0.6763193 2.387308 5.05406 0.5561141
## tPi1 deltaMu2 deltaMu1 SI score geneNames rank
## 1 0.5785395 2.707601 0.5044819 1 1.481326 CBX2 3
mmp2 %>% rownames_to_column("ensg") %>% filter(SI > 0.99) %>% mutate(rank=1:n()) %>% dim() #3 / 139
## [1] 154 15
mmp3 <- mmp %>% filter(SI > .99, tPi1 < 0.8, mmp$tPi1 > 0.2,
mmp$nPi1 < 0.8, mmp$nPi1 > 0.2) #rownames_to_column()
top5_mmp <- head(mmp3, 5) %>% as_tibble()
not5_mmp <- mmp3[6:nrow(mmp3),] %>% as_tibble()
top5_mmp
## # A tibble: 5 x 12
## nMu1 nMu2 nVar nPi1 tMu1 tMu2 tVar tPi1 deltaMu2 deltaMu1
## * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.170 0.201 0.0755 0.546 1.39 4.43 1.09 0.647 4.23 1.22
## 2 2.02 2.42 0.460 0.614 2.20 6.35 1.64 0.635 3.93 0.176
## 3 1.88 2.35 0.166 0.676 2.39 5.05 0.556 0.579 2.71 0.504
## 4 0.0944 0.125 0.0295 0.569 0.811 2.59 0.315 0.497 2.47 0.716
## 5 1.53 1.83 0.259 0.629 1.68 4.18 0.639 0.776 2.35 0.158
## # ... with 2 more variables: SI <dbl>, score <dbl>
not5_mmp
## # A tibble: 149 x 12
## nMu1 nMu2 nVar nPi1 tMu1 tMu2 tVar tPi1 deltaMu2 deltaMu1
## * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.387 0.470 0.148 0.600 0.657 2.47 0.297 0.752 2.00 0.270
## 2 0.277 0.346 0.135 0.583 0.794 2.42 0.304 0.537 2.07 0.518
## 3 1.25 2.07 0.291 0.392 1.60 4.75 0.954 0.752 2.67 0.343
## 4 0.895 1.02 0.168 0.375 1.16 2.81 0.277 0.778 1.79 0.269
## 5 0.797 1.23 0.0763 0.628 1.39 3.31 0.367 0.688 2.08 0.597
## 6 1.04 1.14 0.116 0.547 1.33 3.10 0.606 0.750 1.96 0.291
## 7 2.48 3.08 0.130 0.229 2.74 5.12 0.719 0.675 2.05 0.260
## 8 0.834 0.959 0.234 0.603 0.851 2.45 0.334 0.752 1.49 0.0176
## 9 4.49 4.54 0.119 0.450 5.06 6.25 0.187 0.758 1.70 0.576
## 10 0.946 1.52 0.0814 0.581 1.44 3.22 0.329 0.767 1.70 0.491
## # ... with 139 more rows, and 2 more variables: SI <dbl>, score <dbl>
The null hypothesis is defined as there being no significant difference in the oncomix score of the top 5 ranked oncogene candidates (top5_mmp
) relative to the rest of the 134 genes (not5_mmp
) that passed the initial filters (0.2 > \(π_T\) & \(π_N\) > 0.8, selectivity index > 0.99).
The alternative hypothesis is that the oncomix scores of the top 5 ranked oncogene candidates are significantly higher than those genes not ranking in the top 5.
We first test the likelihood of the null hypothesis being true in our observed data:
obs_score_diff <- t.test(top5_mmp$score, not5_mmp$score, alternative = "greater")
obs_score_diff
##
## Welch Two Sample t-test
##
## data: top5_mmp$score and not5_mmp$score
## t = 15.479, df = 6.4548, p-value = 1.204e-06
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 1.487059 Inf
## sample estimates:
## mean of x mean of y
## 1.5307278 -0.1667759
We see that the oncomix scores from the top 5 genes are significantly greater than the oncomix scores of the bottom 134 genes (p = 1.210^{-6}).
Now, to calculate the power of rejecting the null hypothesis, we sample from the (joint) parameter spaces of these two groups that we know are different (not5_mmp
and top5_mmp
). We then recalculate the oncomix scores based on the sampled parameters, and repeat the same statistical test as above on 1000 simulated datasets. Our power is calculated as the % of times that we correctly reject the null hypothesis.
We perform the power calculation by sampling from a multivariate Gaussian (except SI, which is done using bootstrap due to narrow range (0-1) and non-Gaussianity).
n_sims <- 1000
sim_res_multiv <- sapply(1:n_sims, function(x) calc_rand_pval_effect_size_multiv(top5_mmp, not5_mmp))
perc_power_multiv <- 100*sum(sim_res_multiv["pval",] < 0.05)/n_sims
perc_power_multiv #at a p-value of 0.05
## [1] 100
perc_power_multiv_obs_p <- 100*sum(sim_res_multiv["pval",] < obs_score_diff$p.value)/n_sims
perc_power_multiv_obs_p #at p-value of obs_score_diff$p.value
## [1] 72.1
Lastly, we show the power as a function of the p-value
#show power versus p-value
p_val_thresh <- c(seq(1e-6, 0.05, by = 0.000001), 0.05)
power <- sapply(p_val_thresh, function(x) sum(sim_res_multiv["pval",] < x)/n_sims )
df <- tibble(p_val_thresh, power)
ggplot(df, aes(x=p_val_thresh, y=power)) +
geom_point() +
scale_x_log10() +
geom_vline(xintercept=c(obs_score_diff$p.value, 0.05),
color=c("red", "blue")) +
xlab("p-value") + ylab("Power") + theme_bw() +
ggtitle("Power as a function of the p-value \n Red line is observed p-value threshold \n Blue line is p=0.05") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
annotation_logticks(sides="b")
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
#red line indicates the observed p-value threshold in the study
#blue line indicates p=0.05 ()
Below, we document the sources and dates of retrieval when each data file was downloaded from:
TCGA mRNA expression data with clinical annotations were downloaded in November 2018, GDC Version 0.13.0, using TCGAbiolinks
R package. Genes aligned using standard GDC pipelines and annotated to human genome GRCh38.p12. Additional clinical annotations (e.g. % tumor cells, histology, and retrospective collection) were downloaded using the GenomicDataCommons
R package.
TCGA tumor CpG 450k methylation data was downloaded in June 2018, GDC Version 0.12.0, using GenomicDataCommons
and TCGAbiolinks
R packages. CpG sites were lifted over from hg19 to hg38 using the liftover
R package.
GTEx data was downloaded from https://www.gtexportal.org/home/datasets as “GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_median_tpm.gct.gz” in March 2018 and then gunzipped using the following command:
gunzip GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_median_tpm.gct.gz
rutils
tool. ENCFF accession ID’s are listed below for the downloaded files:ENCFF024TEH.bed; ENCFF180XXZ.bed; ENCFF329QYZ.bed; ENCFF541DRZ.bed; ENCFF694ZRC.bed; ENCFF921OMT.bed; ENCFF042AWM.bed; ENCFF184DUF.bed; ENCFF343QQE.bed; ENCFF541HRT.bed; ENCFF708ACK.bed; ENCFF928YTD.bed; ENCFF046BRP.bed; ENCFF190ZXX.bed; ENCFF401IAI.bed; ENCFF569ZCY.bed; ENCFF730UAD.bed; ENCFF932XEU.bed; ENCFF083AZM.bed; ENCFF209WRW.bed; ENCFF408TWV.bed; ENCFF577EMC.bed; ENCFF737BNW.bed; ENCFF942TCG.bed; ENCFF105PFS.bed; ENCFF220RUS.bed; ENCFF432GSK.bed; ENCFF578NMN.bed; ENCFF762CDY.bed; ENCFF947ABI.bed; ENCFF105VHD.bed; ENCFF225VFR.bed; ENCFF434DYI.bed; ENCFF583NFB.bed; ENCFF762MGC.bed; ENCFF993FPN.bed; ENCFF140SFK.bed; ENCFF233RBO.bed; ENCFF441UHA.bed; ENCFF618NVV.bed; ENCFF780WLS.bed; ENCFF150PTQ.bed; ENCFF247VVK.bed; ENCFF456MGR.bed; ENCFF621ZSK.bed; ENCFF784QFH.bed; ENCFF160RLI.bed; ENCFF269RME.bed; ENCFF464QAL.bed; ENCFF625IUE.bed; ENCFF800CDQ.bed; ENCFF161SFU.bed; ENCFF275WAD.bed; ENCFF495PCJ.bed; ENCFF666YGQ.bed; ENCFF838LXI.bed; ENCFF170POB.bed; ENCFF300OKR.bed; ENCFF496RVC.bed; ENCFF679UGD.bed; ENCFF899MQW.bed; ENCFF179TFZ.bed; ENCFF306PBX.bed; ENCFF514BKI.bed; ENCFF687REM.bed; ENCFF907UNK.bed
CNV data were downloaded in January 2018, GDC Version 0.10.0, using the GenomicDataCommons
R package.
Mutation data were downloaded in January 2018, GDC Version 0.10.0, using the GDCquery_Maf
function from the TCGAbiolinks
R package.
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel methods stats graphics grDevices utils
## [8] datasets base
##
## other attached packages:
## [1] scales_1.0.0
## [2] bindrcpp_0.2.2
## [3] janitor_0.4.0.9000
## [4] forcats_0.2.0
## [5] stringr_1.3.1
## [6] dplyr_0.7.7
## [7] purrr_0.2.5
## [8] readr_1.1.1
## [9] tidyr_0.8.1
## [10] tibble_1.4.2
## [11] tidyverse_1.2.1
## [12] readxl_1.0.0
## [13] ggdendro_0.1-20
## [14] RColorBrewer_1.1-2
## [15] ggrepel_0.8.0.9000
## [16] ggplot2_3.1.0
## [17] SummarizedExperiment_1.8.1
## [18] DelayedArray_0.4.1
## [19] matrixStats_0.52.2
## [20] XML_3.98-1.16
## [21] here_0.1
## [22] biomaRt_2.34.2
## [23] GenomicDataCommons_1.2.0
## [24] magrittr_1.5
## [25] rtracklayer_1.38.3
## [26] FDb.InfiniumMethylation.hg19_2.2.0
## [27] org.Hs.eg.db_3.5.0
## [28] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [29] TxDb.Hsapiens.UCSC.hg38.knownGene_3.4.0
## [30] GenomicFeatures_1.30.3
## [31] AnnotationDbi_1.40.0
## [32] GenomicRanges_1.30.3
## [33] GenomeInfoDb_1.14.0
## [34] IRanges_2.12.0
## [35] S4Vectors_0.16.0
## [36] GSA_1.03
## [37] AIMS_1.10.0
## [38] Biobase_2.38.0
## [39] BiocGenerics_0.24.0
## [40] e1071_1.6-8
## [41] TCGAbiolinks_2.9.5
## [42] AUC_0.3.0
## [43] survival_2.41-3
## [44] limma_3.34.9
## [45] glmnet_2.0-13
## [46] foreach_1.4.4
## [47] Matrix_1.2-12
## [48] oncomix_0.99.3
## [49] mclust_5.4
## [50] rmarkdown_1.9.17
##
## loaded via a namespace (and not attached):
## [1] backports_1.1.2 circlize_0.4.4
## [3] aroma.light_3.8.0 plyr_1.8.4
## [5] selectr_0.4-1 ConsensusClusterPlus_1.42.0
## [7] lazyeval_0.2.1 splines_3.4.3
## [9] BiocParallel_1.12.0 sva_3.26.0
## [11] digest_0.6.18 htmltools_0.3.6
## [13] memoise_1.1.0 cluster_2.0.6
## [15] doParallel_1.0.14 ComplexHeatmap_1.17.1
## [17] Biostrings_2.46.0 annotate_1.56.1
## [19] modelr_0.1.1 R.utils_2.7.0
## [21] prettyunits_1.0.2 colorspace_1.3-2
## [23] blob_1.1.1 rvest_0.3.2
## [25] haven_1.1.1 crayon_1.3.4
## [27] RCurl_1.95-4.11 jsonlite_1.5
## [29] genefilter_1.60.0 bindr_0.1.1
## [31] zoo_1.8-4 iterators_1.0.10
## [33] glue_1.3.0 survminer_0.4.3
## [35] gtable_0.2.0 zlibbioc_1.24.0
## [37] XVector_0.18.0 GetoptLong_0.1.6
## [39] shape_1.4.4 abind_1.4-5
## [41] DESeq_1.30.0 GGally_1.3.2
## [43] DBI_1.0.0 edgeR_3.20.9
## [45] ggthemes_4.0.1 Rcpp_0.12.19
## [47] xtable_1.8-3 progress_1.2.0
## [49] cmprsk_2.2-7 foreign_0.8-69
## [51] bit_1.1-12 matlab_1.0.2
## [53] km.ci_0.5-2 httr_1.3.1
## [55] reshape_0.8.7 pkgconfig_2.0.2
## [57] R.methodsS3_1.7.1 utf8_1.1.3
## [59] locfit_1.5-9.1 labeling_0.3
## [61] tidyselect_0.2.5 rlang_0.3.0.1
## [63] reshape2_1.4.3 cellranger_1.1.0
## [65] munsell_0.5.0 tools_3.4.3
## [67] cli_1.0.0 downloader_0.4
## [69] RSQLite_2.1.1 broom_0.4.5
## [71] evaluate_0.10.1 yaml_2.1.19
## [73] knitr_1.20 bit64_0.9-7
## [75] survMisc_0.5.5 EDASeq_2.12.0
## [77] nlme_3.1-131 R.oo_1.22.0
## [79] xml2_1.2.0 rstudioapi_0.7
## [81] compiler_3.4.3 curl_3.1
## [83] geneplotter_1.56.0 stringi_1.2.4
## [85] highr_0.6 lattice_0.20-35
## [87] psych_1.8.4 KMsurv_0.1-5
## [89] pillar_1.1.0 GlobalOptions_0.1.0
## [91] data.table_1.11.8 bitops_1.0-6
## [93] R6_2.3.0 latticeExtra_0.6-28
## [95] hwriter_1.3.2 RMySQL_0.10.15
## [97] ShortRead_1.36.0 gridExtra_2.3
## [99] codetools_0.2-15 MASS_7.3-48
## [101] assertthat_0.2.0 rprojroot_1.3-2
## [103] rjson_0.2.15 withr_2.1.2
## [105] GenomicAlignments_1.14.2 Rsamtools_1.30.0
## [107] mnormt_1.5-5 GenomeInfoDbData_1.0.0
## [109] mgcv_1.8-23 hms_0.4.2
## [111] grid_3.4.3 class_7.3-14
## [113] snakecase_0.8.3.1 ggpubr_0.1.8
## [115] lubridate_1.7.4
The functions found in 2017_12_suppl_scripts.R
are listed below
scatterMixPlot2 <- function (mmParams, selIndThresh = 1, geneLabels = NULL, sizesX, sizesY, ...) {
mmParams <- as.data.frame(mmParams)
oneOverAlpha <- diff(range(mmParams$deltaMu2))
alpha1 <- 1/oneOverAlpha
quants <- c(0.05, 0.5, 0.95)
colorsRed <- RColorBrewer::brewer.pal(n = length(quants),
name = "Reds")
deltaMu2Quant <- stats::quantile(mmParams[, "deltaMu2"],
quants)
deltaMu1Quant <- stats::quantile(1/(abs(mmParams[, "deltaMu1"]) +
alpha1), quants)
deltaMu2 <- deltaMu1 <- NULL
x <- ggplot(data = as.data.frame(mmParams), aes(x = deltaMu2,
y = 1/(abs(deltaMu1) + alpha1))) + theme_classic() +
geom_hline(yintercept = deltaMu1Quant, col = colorsRed,
size = sizesY) + geom_vline(xintercept = deltaMu2Quant,
col = colorsRed, size = sizesX) + geom_point(alpha = 0.5) +
xlab(expression(paste(Delta, mu[2]))) + ylab(expression(paste(frac(1,
paste(Delta, mu[1], " + ", alpha)))))# + ...
if (selIndThresh < 1) {
mmParamsSi <- mmParams[mmParams$SI > selIndThresh, ]
x <- x + geom_point(data = as.data.frame(mmParamsSi),
aes(x = deltaMu2, y = 1/(abs(deltaMu1) + alpha1)),
size = 10, alpha = 0.1, col = colorsRed[length(colorsRed)],
fill = colorsRed[length(colorsRed)]) + ggtitle(bquote(Distribution ~
of ~ Mixture ~ Model ~ Parameters * "," ~ alpha ~
"=" ~ .(round(alpha1, 2)) * ", SI >" ~ .(selIndThresh))) + ...
}
else if (!is.null(geneLabels)) {
mmParamsSi <- mmParams[geneLabels, ]
mmParamsSi$geneLabels <- geneLabels
x <- x + geom_point(data = as.data.frame(mmParamsSi),
aes(x = deltaMu2, y = 1/(abs(deltaMu1) + alpha1)),
size = 10, alpha = 0.1, col = colorsRed[length(colorsRed)],
fill = colorsRed[length(colorsRed)]) + ggrepel::geom_text_repel(data = mmParamsSi,
aes(x = deltaMu2, y = 1/(abs(deltaMu1) + alpha1)),
label = rownames(mmParamsSi)) + ggtitle(bquote(Distribution ~
of ~ Mixture ~ Model ~ Parameters * "," ~ alpha ~
"=" ~ .(round(alpha1, 2)))) + ...
}
else {
x <- x + ggtitle(bquote(Distribution ~ of ~ Mixture ~
Model ~ Parameters * "," ~ alpha ~ "=" ~ .(round(alpha1,
2)))) + ...
}
return(x)
}
#part 9.
classifyTumors <- function(mmp, genes, exprMat){
#get the midpoint between the 2 tumor curves
mmp_diff <- apply(mmp[genes, c("tMu2", "tMu1")], 1, mean)
boolMtrx = t(assay(exprMat)[genes, ] > mmp_diff)
boolMtrx <- ifelse(boolMtrx, 1, 0)
colnames(boolMtrx) <- paste0("cl_", colnames(boolMtrx))
return(boolMtrx)
}
returnDeGenesLimma <- function(expr, design, colOfInt, pval=1, logFC=0, retAll=F){
fit_t <- lmFit(object = expr, design = design)
ebfit_t <- eBayes(fit_t)
if(retAll == T){
topTableRes_t <- topTable(ebfit_t, coef = colOfInt, number = Inf)
return(topTableRes_t)
}
topTableRes_t <- topTable(ebfit_t, coef = colOfInt, number = nrow(assay(expr)), adjust.method = "BH", p.value = pval, lfc = logFC)
return(topTableRes_t)
}
genesetEnrichmentFisher <- function(B, Targ, GS.GS, GS.names){
#calculates the fisher exact t test for enrichment in a particular set of
#target gene entrez IDs (T) among the number of background genes (B). GS.GS
#takes the form "hGMT$genesets", where hGMT is a variable that stores the .gmt
#file object downloaded from the msigdb website that was read in using the
#GMT() function. GS.names takes the form "hGMT$geneset.names"
#1. Initialize an empty dataframe
t <- unlist(fisher.test(matrix(data = c(1,4,6,2), nrow = 2), conf.int = T))
colNam <- c(names(t), "geneset.names", "MatchesBtwTargAndPathway", "NumbGenesInMsigdbPathway", "TargLen", "BackgroundLen")
fisherDf <- data.frame(matrix(NA, nrow=1, ncol=13))
names(fisherDf) <- colNam
TargLen <- length(Targ)
#2. Loop through each geneset and calculate pvalues
for(i in 1:length(GS.GS)){
pathwayEntrezIDs <- GS.GS[[i]] #these are characters
M <- length(intersect(Targ, pathwayEntrezIDs)) #number of overlapping genes
P <- length(pathwayEntrezIDs) #numb of genes in pathway
conting.matrix <- matrix(c(M, TargLen-M, P-M, B-TargLen-P+M), nrow = 2, byrow = TRUE) #create contigency matrix
fisherTest <- fisher.test(conting.matrix, conf.int = T)
fisherDf <- rbind(fisherDf, c(unlist(fisherTest), GS.names[[i]], M, P, TargLen, B))
}
fisherDf = fisherDf[-1,] #remove the row of NA's (in row 1)
fisherDf$bhPval <- p.adjust(fisherDf$p.value, method = "BH")
fisherDf$bonferroni <- p.adjust(fisherDf$p.value, method = "bonferroni")
fisherDf$neg.log10.bhPval <- -log10(as.numeric(fisherDf$bhPval))
fisherDf$dataset <-
unlist(lapply(strsplit(fisherDf$geneset.names, "_"), function(x) x[1]))
fisherDf <- fisherDf[order(fisherDf$bhPval),]
#rank by bhPval pbal
return(fisherDf)
}
gseWrapper <- function(B, Targ, genesets, geneset.names, qval, OR){
#dir can equal "up" or "down" for up or down regulated, respectivtly
#Targ = upregDeGenesEntrez; genesets=hallmarkKeggReactome$genesets;
#geneset.names = hallmarkKeggReactome$geneset.names; qval=qval_pathway; OR=OR_pathway
res_gse <- genesetEnrichmentFisher(B = B, Targ = Targ, GS.GS = genesets, GS.names=geneset.names)
res_gse_filt <- res_gse[as.numeric(res_gse$bhPval) < qval &
as.numeric(res_gse$conf.int1) > OR &
as.numeric(res_gse$conf.int1) > OR,] #`estimate.odds ratio`
return(res_gse_filt)
}
GSA.read.gmt.silent <- function (filename) {
a = scan(filename, what = list("", ""), sep = "\t", quote = NULL,
fill = T, flush = T, multi.line = F)
geneset.names = a[1][[1]]
geneset.descriptions = a[2][[1]]
dd = scan(filename, what = "", sep = "\t", quote = NULL)
nn = length(geneset.names)
n = length(dd)
ox = rep(NA, nn)
ii = 1
for (i in 1:nn) {
#cat(i)
while ((dd[ii] != geneset.names[i]) | (dd[ii + 1] !=
geneset.descriptions[i])) {
ii = ii + 1
}
ox[i] = ii
ii = ii + 1
}
genesets = vector("list", nn)
for (i in 1:(nn - 1)) {
#cat(i, fill = T)
i1 = ox[i] + 2
i2 = ox[i + 1] - 1
geneset.descriptions[i] = dd[ox[i] + 1]
genesets[[i]] = dd[i1:i2]
}
geneset.descriptions[nn] = dd[ox[nn] + 1]
genesets[[nn]] = dd[(ox[nn] + 2):n]
out = list(genesets = genesets, geneset.names = geneset.names,
geneset.descriptions = geneset.descriptions)
class(out) = "GSA.genesets"
return(out)
}
readHmKgRctmepathways <- function(gsea_path){
hallmark <- GSA.read.gmt.silent(paste0(gsea_path,"/h.all.v5.1.entrez.gmt")) #50 genesets
kegg <- GSA.read.gmt.silent(paste0(gsea_path,"/c2.cp.kegg.v5.1.entrez.gmt")) #185 genesets
reactome <- GSA.read.gmt.silent(paste0(gsea_path,"/c2.cp.reactome.v5.1.entrez.gmt")) #673 genesets
hallmarkKeggReactome <- list(genesets = c(hallmark$genesets, kegg$genesets, reactome$genesets), geneset.names =
c(hallmark$geneset.names, kegg$geneset.names, reactome$geneset.names), geneset.descriptions =
c(hallmark$geneset.descriptions, kegg$geneset.descriptions, reactome$geneset.descriptions))
names(hallmarkKeggReactome[[1]]) <- hallmarkKeggReactome$geneset.names
return(hallmarkKeggReactome)
}
forestplot <- function(d, title, xlab=expression(log[10] *"(Odds Ratio)"), ylab="Pathway"){
p <- ggplot(d, aes(x=x, y=y, ymin=ylo, ymax=yhi)) +
geom_pointrange(size = 1.25) +
coord_flip() +
#geom_hline(aes(x=0, yintercept = 1)) +
ylab(xlab) +
xlab(ylab) + #switch because of the coord_flip() above
ggtitle(title) +
scale_y_log10() +
theme_classic() +
geom_text(aes(label=numbMatches)) +
guides(alpha=FALSE) +
theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size = 7), legend.text = element_text(size = 12))
return(p)
}
createForestDF <- function(data, title){
d <- data.frame(x=data$geneset.names, y=as.numeric(data$`estimate.odds ratio`), ylo = as.numeric(data$conf.int1), yhi = as.numeric(data$conf.int2), pval=as.numeric(data$p.value), numbGeneInPway = data$NumbGenesInMsigdbPathway,
numbMatches=data$MatchesBtwTargAndPathway)
d$x <-factor(d$x, levels=d[order(-d$y), "x"])
return(forestplot(d, title, xlab=expression("Odds Ratio("*log[10] *" scale)")))
}
gseWrapper <- function(B, Targ, genesets, geneset.names, qval, OR){
#dir can equal "up" or "down" for up or down regulated, respectivtly
res_gse <- genesetEnrichmentFisher(B = B, Targ = Targ, GS.GS = genesets, GS.names=geneset.names)
res_gse_filt <- res_gse[as.numeric(res_gse$bhPval) < qval & as.numeric(res_gse$`estimate.odds ratio`) > OR & as.numeric(res_gse$conf.int1) > OR,]
return(res_gse_filt)
}
plot.volcano.gsea = function(hallmarkKeggReactome, diffExprDf, geneset.name, log2FC_diffexp, qval_diffexp, ensg_map, brew.color.set1, propSubset = 1){
entrezIdsGeneSet = hallmarkKeggReactome[[1]][[geneset.name]]
diffExprDf$logFC_thresh = diffExprDf$logFC > log2FC_diffexp & diffExprDf$adj.P.Val < qval_diffexp
diffExprDf2 = dplyr::left_join(diffExprDf, ensg_map, by= c("ensgId" = "Gene.stable.ID"))
diffExprDf2$inGeneSet = diffExprDf2$NCBI.gene.ID %in% entrezIdsGeneSet
#if(randSubset){ #randomly downsample the # of genes for visualization purposed
diffExprDf2$propSubset = TRUE
if(propSubset < 1){
diffExprDf2$propSubset <- sample(c(TRUE, FALSE), prob=c(propSubset, 1-propSubset), size=nrow(diffExprDf2), replace=TRUE)
}
diffExprDf2$NCBI.gene.ID %in% sample(entrezIdsGeneSet, ceiling(propSubset*length(entrezIdsGeneSet)), replace=FALSE)
#}
#should make this a threshold for the labels, not the color of dots
colors_set1 = RColorBrewer::brewer.pal("Set1", n = 9) #green is 3, blue is 2
ggplot(diffExprDf2, aes(x = logFC, y = -log(adj.P.Val, 10), label = HGNC.symbol, color = logFC_thresh & inGeneSet)) +
theme_bw() + geom_vline(xintercept = c(-1,1), linetype="dashed", color = "grey") +
geom_hline(yintercept = -log(qval_diffexp,10), linetype="dashed", color = "grey") +
geom_point(aes(alpha = ifelse(logFC_thresh, 0.3, 0.25))) +
ggrepel::geom_text_repel(aes(label=ifelse(inGeneSet & logFC_thresh & propSubset,
as.character(HGNC.symbol),''))) +
scale_color_manual(values = colors_set1[c(9,brew.color.set1)]) +
theme(axis.text = element_text(size=14)) +
labs(size = 14, x = expression(log[2](Fold~Change)), y = (expression(-log[10](q~value)))) +
ggtitle(paste0("Volcano Plot Highlights Upregulated Genes Within the\n", stringr::str_to_title(geneset.name, locale = "en"), " Geneset"))
}
dupMean <- function(segMean, id){
result <- split(segMean, id, drop = F)
result2 <- plyr::laply(result, mean)
names(result2) <- names(result)
return(result2)
}
returnCNVvalues <- function(cnvGr, gene_Gr){
overLaps = subsetByOverlaps(cnvGr, gene_Gr) #1 gene across all pts
#take the average of any two that have the same tcga_id
vectNoDupl <- dupMean(segMean = overLaps$segment_mean, overLaps$tcga_id)
return(vectNoDupl)
}
fillNAwithColMeans <- function(data){
for(i in 1:ncol(data)){
data[is.na(data[,i]), i] <- mean(data[,i], na.rm = TRUE)
}
return(data)
}
logRegElNetLooc <- function(x, y, alpha, retGlmNt=FALSE){
#creates a matrix of coefficients using elastic net regression.
CV <- cv.glmnet(x, y, family="binomial", type.measure = "class", alpha = alpha, nlambda = 100, nfolds = length(y)) #this is Leave one out Cross validation
fit <- glmnet(x, y,family="binomial", alpha = alpha, lambda = CV$lambda.1se)
coeffic <- coef(fit) #get the coefficients from the fit model
if(retGlmNt){
return(fit)
}
return(coeffic)
}
sig2star <- function(s, breaks=c(-Inf, 0.001,0.01,0.05,1),
labels=c("***","**","*","")) {
r <- s
r[] <- as.character(cut(s, breaks=breaks, labels))
r[is.na(r)] <- ""
r
}
sig2star01 <- function(s, breaks=c(-Inf, 0.0001,0.001,0.01,1),
labels=c("***","**","*","")) {
r <- s
r[] <- as.character(cut(s, breaks=breaks, labels))
r[is.na(r)] <- ""
r
}
getExonsInGene <- function(allExons, gene_gr){
exonsInGene <- allExons %>%
subsetByOverlaps(gene_gr) %>% unlist() %>%
IRanges::reduce() %>% sort()
if(unique(strand(exonsInGene)) == "+"){
exonsInGene$Exon_order = paste0("Exon_middle", 0:{length(exonsInGene)-1})
exonsInGene$Exon_order[length(exonsInGene)] = "Exon_last"
exonsInGene$Exon_order[1] = "Exon_first"
} else if(unique(strand(exonsInGene)) == "-"){
exonsInGene$Exon_order = "Exon_middle"
exonsInGene$Exon_order[1] = "Exon_last"
exonsInGene$Exon_order[length(exonsInGene)] = "Exon_first"
}
return(exonsInGene)
}
methStatusAtLoci <- function(geneGr, methSE){
#methSE = Methylation summarized experiment object with GRanges
#object as first column
meth_subst = subsetByOverlaps(methSE, geneGr)
return(meth_subst)
}
findAndAnnotateCpGsInGene <- function(gene_gr, allExons, methSE, genes_gr_prom){
#gene_gr has the coordinates of the gene within which CpGs are to be searched for
#allExons is a txDB object that contains all the exons
#methSE is a genomicranges object that contains all the CpGs
cpgsInGene <- subsetByOverlaps(methSE, gene_gr)
exonsInGene <- getExonsInGene(allExons, gene_gr)
exonIndices <- findOverlaps(cpgsInGene, exonsInGene)
cpgInExon <- cpgsInGene[queryHits(exonIndices)]
cpgInExon$location <- exonsInGene[subjectHits(exonIndices)]$Exon_order
cpgInProm <- subsetByOverlaps(cpgsInGene, genes_gr_prom)
if(length(cpgInProm) > 0) {cpgInProm$location <- "Promoter"}
cpgInIntron <- cpgsInGene[!cpgsInGene$cgid %in% c(cpgInProm$cgid, cpgInExon$cgid)]
if(length(cpgInIntron) > 0) {cpgInIntron$location <- "Intron"}
cpgTot <- sort(c(cpgInExon, cpgInIntron, cpgInProm))
return(cpgTot)
}
range01 <- function(x){(x-min(x, na.rm = TRUE))/(max(x, na.rm = TRUE)-min(x, na.rm=TRUE))}
scatterMixPlot3 <- function (mmParams, selIndThresh = 1, geneLabels = NULL, sizesX, sizesY, ...) {
mmParams <- as.data.frame(mmParams)
oneOverAlpha <- diff(range(mmParams$deltaMu2))
alpha1 <- 1/oneOverAlpha
quants <- c(0.05, 0.5, 0.95)
colorsRed <- RColorBrewer::brewer.pal(n = length(quants),
name = "Reds")
deltaMu2Quant <- stats::quantile(mmParams[, "deltaMu2"],
quants)
deltaMu1Quant <- stats::quantile(1/(abs(mmParams[, "deltaMu1"]) +
alpha1), quants)
deltaMu2 <- deltaMu1 <- NULL
x <- ggplot(data = as.data.frame(mmParams), aes(x = deltaMu2,
y = 1/(abs(deltaMu1) + alpha1))) + theme_classic() +
geom_hline(yintercept = deltaMu1Quant, col = colorsRed,
size = sizesY) + geom_vline(xintercept = deltaMu2Quant,
col = colorsRed, size = sizesX) + geom_point(alpha = 0.5) +
xlab(expression(paste(Delta, mu[2]))) + ylab(expression(paste(frac(1,
paste(Delta, mu[1], " + ", alpha)))))# + ...
if (selIndThresh < 1) {
mmParamsSi <- mmParams[mmParams$SI > selIndThresh, ]
x <- x + geom_point(data = as.data.frame(mmParamsSi),
aes(x = deltaMu2, y = 1/(abs(deltaMu1) + alpha1)),
size = 10, alpha = 0.25, col = colorsRed[length(colorsRed)],
fill = colorsRed[length(colorsRed)]) + ggtitle(bquote(Distribution ~
of ~ Mixture ~ Model ~ Parameters * "," ~ alpha ~
"=" ~ .(round(alpha1, 2)) * ", SI >" ~ .(selIndThresh))) + ...
}
else if (!is.null(geneLabels)) {
mmParamsSi <- mmParams[geneLabels, ]
mmParamsSi$geneLabels <- geneLabels
x <- x + geom_point(data = as.data.frame(mmParamsSi),
aes(x = deltaMu2, y = 1/(abs(deltaMu1) + alpha1)),
size = 10, alpha = 0.25, col = colorsRed[length(colorsRed)],
fill = colorsRed[length(colorsRed)]) + ggrepel::geom_label_repel(data = mmParamsSi,
aes(x = deltaMu2, y = 1/(abs(deltaMu1) + alpha1)),
label = rownames(mmParamsSi), alpha=0.75, color="black") + ggtitle(bquote(Distribution ~
of ~ Mixture ~ Model ~ Parameters * "," ~ alpha ~
"=" ~ .(round(alpha1, 2)))) + ...
}
else {
x <- x + ggtitle(bquote(Distribution ~ of ~ Mixture ~
Model ~ Parameters * "," ~ alpha ~ "=" ~ .(round(alpha1,
2)))) + ...
}
return(x)
}
gg__bar = list(
theme_classic(),
theme(legend.position="top",
axis.ticks = element_blank(),
axis.text = element_text(size=16, face="italic"),
axis.text.x = element_text(angle = 90, hjust=1,vjust=0.5)),
#axis.text.y = element_text(vjust=0.5, hjust = 1)),
scale_y_continuous(expand = c(0,0)))
gg__tile = list(
theme_classic(),
theme(legend.position="top",
axis.ticks = element_blank(),
axis.text = element_text(size=16, face="italic"),
axis.text.x = element_text(angle = 90, hjust=1,vjust=0.5),
axis.line = element_blank()),
coord_fixed()) #,
#axis.text.y = element_text(vjust=0.5, hjust = 1)),
#, scale_y_continuous(expand = c(0,0)))
gg__point = list(
theme_classic(),
theme(legend.position="top",
axis.ticks.x = element_blank(),
axis.text = element_text(size=16, color="black"),
#axis.line = element_blank()
))
#' Cluster and Reorder Rows and Columns
#'
#' Clusters the rows and columns of a numeric matrix using hclust
#'
#' @param matr matrix to be clustered
#' @param dimen Indicates the dimension that the clustering is performed in. One of either "row", "column", or "both".
#'
#' @return returns a matrix of clustered values by either row, column, or both,
clusterAndReorder <- function(matr, dimen="both"){
#dimen can be one of row, column or both
if(dimen == "row"){
hcl <- hclust(dist(matr))
matr.new <- matr[hcl$order,]
return(matr.new)
} else if(dimen == "column"){
hcl <- hclust(dist(t(matr)))
matr.new <- matr[,hcl$order]
return(matr.new)
} else if(dimen == "both"){
hcl.r <- hclust(dist(matr))
hcl.c <- hclust(dist(t(matr)))
matr.new <- matr[hcl.r$order, hcl.c$order]
return(matr.new)
}
}
clusterDendro <- function(matr, dimen="row"){
#dimen can be one of row, column or both
if(dimen == "row"){
hcl <- hclust(dist(matr))
return(hcl)
} else if(dimen == "column"){
hcl <- hclust(dist(t(matr)))
return(hcl)
}
}
plotGeneHistNoProbCrv <- function (mmParams, exprNml, exprTum, isof, ...) {
exprNml <- toMatrix(exprNml)
exprTum <- toMatrix(exprTum)
tidyDf <- as.data.frame(cbind(as.numeric(c(exprTum[isof,
], exprNml[isof, ])), as.factor(c(rep("tumor", ncol(exprTum)),
rep("normal", ncol(exprNml))))), stringsAsFactors = FALSE)
colnames(tidyDf) <- c("expr", "type")
expr <- type <- ..density.. <- NULL
p1 <- ggplot(tidyDf, aes(x = expr, color = as.factor(type),
fill = as.factor(type), group = as.factor(type))) + theme_classic() +
geom_histogram(data = subset(tidyDf, type == 1), fill = "#F8766D",
alpha = 0.2, aes(y = ..density..), binwidth = 0.2) +
geom_histogram(data = subset(tidyDf, type == 2), fill = "#00BFC4",
alpha = 0.2, aes(y = ..density..), binwidth = 0.2) +
geom_rug(alpha = 0.3, show.legend = FALSE) + theme(axis.title.y = element_blank(),
axis.text.y = element_blank(), axis.ticks.y = element_blank(),
legend.position = "none", plot.title = element_text(size = 12),
axis.text = element_text(size = 8), axis.title = element_text(size = 8)) +
ggtitle(paste0(isof, " : SI = ", round(mmParams[isof,
"SI"], 4))) + xlab(expression(Log[2] * "(TPM Reads)")) + ...
print(p1)
}
rescale <- function (x, newrange) {
if (missing(x) | missing(newrange)) {
usage.string <- paste("Usage: rescale(x,newrange)\n",
"\twhere x is a numeric object and newrange is the new min and max\n",
sep = "", collapse = "")
stop(usage.string)
}
if (is.numeric(x) && is.numeric(newrange)) {
xna <- is.na(x)
if (all(xna))
return(x)
if (any(xna))
xrange <- range(x[!xna])
else xrange <- range(x)
if (xrange[1] == xrange[2])
return(x)
mfac <- (newrange[2] - newrange[1])/(xrange[2] - xrange[1])
return(newrange[1] + (x - xrange[1]) * mfac)
}
else {
warning("Only numeric objects can be rescaled")
return(x)
}
}
plotGeneHistUD <- function(mmParams, exprNml, exprTum, isof, linesz=1.5, ...){
exprNml <- toMatrix(exprNml)
exprTum <- toMatrix(exprTum)
tidyDf <- as.data.frame(cbind(as.numeric(c(exprTum[isof,], exprNml[isof,])),
as.factor(c(rep("tumor",ncol(exprTum)), rep("normal",ncol(exprNml))))),
stringsAsFactors=FALSE)
colnames(tidyDf) <- c("expr", "type")
expr <- type <- ..density.. <- NULL # Setting the variables to NULL first
p1 <- ggplot(tidyDf, aes(x=expr, color=as.factor(type),
fill=as.factor(type),
group=as.factor(type))) +
theme_classic() +
geom_histogram(data=subset(tidyDf,type == 1),fill="#F8766D",
alpha=0.2, aes(y=..density..), binwidth = .2) +
geom_histogram(data=subset(tidyDf,type == 2),fill="#00BFC4",
alpha=0.2, aes(y=..density..), binwidth = .2) +
geom_rug(alpha=0.3, show.legend=FALSE) +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
legend.position="none",
plot.title=element_text(size=12),
axis.text=element_text(size=8),
axis.title=element_text(size=8),
axis.line.y=element_blank()) +
ggtitle(paste0(isof, " : SI = ",
round(mmParams[isof,"SI"],4))) +
xlab(expression(Log[2] *"(TPM Reads)")) +
stat_function(fun="dnorm", colour="#F8766D",
args=list(mean=mmParams[isof,"nMu2"],
sd=sqrt(mmParams[isof,"nVar"])), size=linesz) +
stat_function(fun="dnorm", colour="#F8766D",
args=list(mean=mmParams[isof,"nMu1"],
sd=sqrt(mmParams[isof,"nVar"])), size=linesz) +
stat_function(fun="dnorm", colour="#00BFC4",
args=list(mean=mmParams[isof,"tMu2"],
sd=sqrt(mmParams[isof,"tVar"])), size=linesz) +
stat_function(fun="dnorm", colour="#00BFC4",
args=list(mean=mmParams[isof,"tMu1"],
sd=sqrt(mmParams[isof,"tVar"])), size=linesz) + ...
print(p1)
return(p1)
}
plotGeneHist2 <- function (mmParams, exprNml, exprTum, isof, txtSize = 10){
exprNml <- toMatrix(exprNml)
exprTum <- toMatrix(exprTum)
tidyDf <- as.data.frame(cbind(as.numeric(c(exprTum[isof,
], exprNml[isof, ])), as.factor(c(rep("tumor", ncol(exprTum)),
rep("normal", ncol(exprNml))))), stringsAsFactors = FALSE)
colnames(tidyDf) <- c("expr", "type")
expr <- type <- ..density.. <- NULL
p1 <- ggplot(tidyDf, aes(x = expr, color = as.factor(type),
fill = as.factor(type), group = as.factor(type))) + theme_classic() +
geom_histogram(data = subset(tidyDf, type == 1), fill = "#F8766D",
alpha = 0.2, aes(y = ..density..), binwidth = 0.2) +
geom_histogram(data = subset(tidyDf, type == 2), fill = "#00BFC4",
alpha = 0.2, aes(y = ..density..), binwidth = 0.2) +
geom_rug(alpha = 0.3, show.legend = FALSE) + theme(axis.title.y = element_blank(),
axis.text.y = element_blank(), axis.ticks.y = element_blank(),
legend.position = "none", plot.title = element_text(size = 12),
axis.text = element_text(size = txtSize), axis.title = element_text(size = txtSize)) +
ggtitle(paste0(isof, " : SI = ", round(mmParams[isof,
"SI"], 4))) + xlab(expression(Log[2] * "(TPM Reads)")) +
stat_function(fun = "dnorm", colour = "#F8766D", args = list(mean = mmParams[isof,
"nMu2"], sd = sqrt(mmParams[isof, "nVar"]))) + stat_function(fun = "dnorm",
colour = "#F8766D", args = list(mean = mmParams[isof,
"nMu1"], sd = sqrt(mmParams[isof, "nVar"]))) + stat_function(fun = "dnorm",
colour = "#00BFC4", args = list(mean = mmParams[isof,
"tMu2"], sd = sqrt(mmParams[isof, "tVar"]))) + stat_function(fun = "dnorm",
colour = "#00BFC4", args = list(mean = mmParams[isof,
"tMu1"], sd = sqrt(mmParams[isof, "tVar"])))
print(p1)
}
clean_dataset <- function(e, ensg_to_prot){
#returns adj normal and tumor dataframes
#remove ffpe samples
e <- e[,!e$is_ffpe]
#add technical variables
colData(e)$hosp <- substr(rownames(colData(e)), 6,7)
colData(e)$seqPlate <- substr(rownames(colData(e)), 22,25)
###remove all metaData columns that are NA
colData(e) <- colData(e)[,colSums(is.na(colData(e)))<nrow(colData(e))]
##TPM normalization, then log2(tpm+1) transform
eu <- assay(e)
eu <- scale(eu, center=FALSE, scale=colSums(eu)) * 1000000
eu <- log(eu+1, 2)
assay(e) <- eu
#separate tumor and adjacent normal samples
et <- e[,substr(colnames(e),14,15) == "01"]
en <- e[,substr(colnames(e),14,15) == "11"]
#print("ok")
#remove duplicates
et <- et[,!duplicated(substr(colnames(et),1,12))]
en <- en[,!duplicated(substr(colnames(en),1,12))]
#print("ok2")
#drop transcripts that don't map to protein-coding ensembl gene Ids
et <- et[rownames(et) %in% ensg_to_prot$Gene.stable.ID ,]
en <- en[rownames(en) %in% ensg_to_prot$Gene.stable.ID ,]
print(dim(et)); print(dim(en))
#remove patients with NA values
et <- et[,!apply(assay(et), 2, anyNA)]
en <- en[,!apply(assay(en), 2, anyNA)]
#Filter out genes where <20% of patients have a non zero expression value
et <- et[rowSums(assay(et)==0)<=ncol(et)*.20,]
en <- en[rowSums(assay(en)==0)<=ncol(en)*.20,]
print(dim(et)); print(dim(en))
#get the genes in common between the two
genes_in_common <- intersect(rownames(et), rownames(en))
et <- et[genes_in_common,]
en <- en[genes_in_common,]
#print("ok4")
return(list(et=et, en=en))
}
filt_genes_pi_si <- function(m, pi_min, si){
## m is the mmp object
## pi is the minimum proportion of samples per group
## si is the selectivity index
gene_filt_pi_si <- rownames(m)[m$tPi1 < (1-pi_min) & m$tPi1 > pi_min &
m$nPi1 < 0.8 & m$nPi1 > pi_min & m$SI > si]
return(gene_filt_pi_si)
}
filt_mmp_pi <- function(m, pi_min){
## m is the mmp object
## pi is the minimum proportion of samples per group
m_filt <- m[m$tPi1 < (1-pi_min) & m$tPi1 > pi_min &
m$nPi1 < 0.8 & m$nPi1 > pi_min,]
return(m_filt)
}
plotOncomixScores <- function(mmp_dat, si=0.99, alph=0.8){
reds <- brewer.pal(n = 9, name = "Reds")
p <- ggplot(mmp_dat, aes(x=score, y=..density.., fill=SI > si)) +
geom_histogram(data=subset(mmp_dat, SI<si),
aes(x=score, y=..density.., fill=SI > si),
binwidth = 0.1, color="grey", alpha=alph) +
geom_histogram(data=subset(mmp_dat, SI>si),
aes(x=score, y=..density.., fill=SI > si),
binwidth = 0.1, color="grey", alpha=alph) +
theme_classic() +
scale_fill_manual(values=reds[c(3,7)]) +
scale_y_continuous(expand = c(0, 0)) +
theme(text=element_text(size=20, color="black"), legend.position="none") +
ylab("Density") + xlab("Oncomix Score")
print(p)
}
pvalFxn <- function(survDifObj){
p.val <- 1 - pchisq(survDifObj$chisq, length(survDifObj$n) - 1)
return(p.val)
}
#### below are the simulation scripts ####
ret_mean_sd <- function(dat){
dat2 <- mclust::Mclust(data = dat, G = 1, modelNames = "E", verbose = FALSE)
mean <- dat2$parameters$mean
std_dev <- sqrt(dat2$parameters$variance$sigmasq)
return(list("mean"=mean, "std_dev"=std_dev))
}
ret_mean_sd_multiv_Gauss <- function(dat){
dat2 <- mclust::Mclust(data = dat, G = 1, verbose = FALSE)
mean <- dat2$parameters$mean
std_dev <- sqrt(dat2$parameters$variance$sigmasq)
return(list("mean"=mean, "std_dev"=std_dev))
}
#given the parameters, calc the score
score_from_params <- function(SI, deltaMu2, deltaMu1, nVar, tVar){
score <- SI * {(deltaMu2 - deltaMu1) - (nVar + tVar)}
return(score)
}
score_from_params2 <- function(params){
score <- params$SI * {(params$deltaMu2 - params$deltaMu1) - (params$nVar + params$tVar)}
return(score)
}
mean_sd_score_params <- function(dat){
five_vars <- c("deltaMu2", "deltaMu1", "SI", "nVar", "tVar")
res <- lapply(five_vars, function(x) ret_mean_sd(dat[[x]]))
names(res) <- five_vars
return(res)
}
mean_sd_score_params_multiv <- function(dat){
four_vars <- c("deltaMu2", "deltaMu1", "nVar", "tVar") #"SI",
params <- dat %>% dplyr::select(four_vars) %>% mclust::Mclust(G=1, verbose = FALSE)
m <- as.numeric(params$parameters$mean)
v <- params$parameters$variance$sigma[,,1]
si_vals <- dat$SI
return(list(mean=m, cov=v, si_vals=si_vals)) #mean, covariance, si values for resampling
}
rand_draw_gauss <- function(top_params, n_draws){
res <- lapply(top_params, function(x) rnorm(n = n_draws, mean = x$mean, sd = x$std_dev))
return(res)
}
rand_draw_gauss_multiv <- function(params, n_draws){
res <- MASS::mvrnorm(n = n_draws, mu = as.numeric(params$mean), Sigma = params$cov) %>%
as.data.frame()
res$SI <- sample(x = params$si_vals, size = n_draws, replace = TRUE) #resampling for SI, this is non-Gaussian (range 0-1). Also encountered difficulties fitting parameters for Beta distribution
return(res)
}
draw_rand_scores <- function(mmp, n_draws){
mmp %>%
mean_sd_score_params() %>%
rand_draw_gauss(n_draws = n_draws) %>%
score_from_params2()
}
draw_rand_scores_multiv <- function(mmp, n_draws){
sc <- mmp %>%
mean_sd_score_params_multiv() %>%
rand_draw_gauss_multiv(n_draws = n_draws) %>%
score_from_params2()
}
pval_effect_size <- function(top, not){
res <- t.test(top, not, alternative = "greater")
effect_size <- mean(top) - mean(not)
pval <- res$p.value
return(c("pval" = pval, "effect_size" = effect_size))
}
calc_rand_pval_effect_size <- function(top, not){
top_rand_scores <- draw_rand_scores(top, n_draws=nrow(top))
not_rand_scores <- draw_rand_scores(not, n_draws=nrow(not))
pval_effect_size(top_rand_scores, not_rand_scores)
}
calc_rand_pval_effect_size_multiv <- function(top, not){
top_rand_scores <- draw_rand_scores_multiv(top, n_draws=nrow(top))
not_rand_scores <- draw_rand_scores_multiv(not, n_draws=nrow(not))
pval_effect_size(top_rand_scores, not_rand_scores)
}
#try sampling from the joint distribution
read.gct <-
function (file) {
expr = read.table(file, skip = 2, header = TRUE, sep = "\t", quote = "")
rownames(expr) = expr[,1]
checkName = table(expr[,1])
if(max(checkName) > 1) {
stop(paste("Genes in gct file should be unique: ", names(which.max(checkName)), sep = " "))
}
expr = expr[,-c(1,2)]
expr = as.matrix(expr)
return(expr)
}