---
title: "nORF_Analysis"
author: "Matt Neville"
date: "14/05/2020"
output: html_document
---

The following is a document reproducing analyses performed in the manuscript "A platform for curated products from novel Open Reading Frames (nORFs) prompts reinterpretation of disease variants." It uses the nORF files generated in the nORF_Data_Prep.Rmd document.

Command-line steps are given in bash blocks with `eval = F`, so they will not run by default. They are provided for replication purposes; the exact commands needed may differ by machine.

# 1. Setup and Downloads

## 1a. Load required libraries
```{r Libraries, include=FALSE}
library(scales)
library(circlize)
library(ggpubr)
library(ggrepel)
library(rmeta)
library(ggalluvial)
library(magrittr)
library(tidyverse)
```

## 1b. Prepare input/output files
```{bash, eval = F}
# Unpack the bundled CDS annotation, LDSC results and MAPS results.
unzip dataFiles/gencode.v30.annotation.CDS.bed.zip
unzip dataFiles/LDSCresults.zip
unzip dataFiles/MAPSresults.zip

# Output directory for figures and tables; -p keeps a re-run from failing
# when the directory already exists.
mkdir -p plots
```

# 2. Data overview figures

## Figure 3 b+c
```{r Figure3}
# nORF classification table, parsed as four character columns followed by two
# logical columns. Loaded once here because figure3b() and figure3c() both
# read it as a global.
nORF_annotation <- read_tsv("nORFclassification.tsv", col_types = 'ccccll')

# Figure 3b: bar chart of the number of nORFs per annotation class, drawn on
# a y axis that is compressed above `breakpoint`.
# Reads the global `nORF_annotation`; returns the ggplot object.
figure3b <- function() {
  # Axis compression: the part of a count above `breakpoint` is multiplied by
  # `scaling_factor`; everything below is left untouched.
  breakpoint <- 3.5e4
  scaling_factor <- 0.2
  break_end <- 1.05
  trans <- function(x) {
    pmin(x, breakpoint) + scaling_factor * pmax(x - breakpoint, 0)
  }

  # Ticks are placed at trans(yticks) and captioned with ylabels.
  # NOTE(review): the captions above the break (9e4, 9.5e4) differ from the
  # tick values they are attached to (5e4, 7.5e4) - confirm this is intended.
  yticks <- c(0, 5e3, 1e4, 1.5e4, 2e4, 2.5e4, 3e4, 5e4, 7.5e4, 1e5)
  ylabels <- c(0, 5e3, 1e4, 1.5e4, 2e4, 2.5e4, 3e4, 9e4, 9.5e4, 1e5)

  # Vertical extent of the white band that marks the axis break.
  gapLower <- 3.3e4
  gapUpper <- breakpoint * break_end

  # Tabulate the annotation classes; the count in row 6 is folded into row 3.
  classCounts <- as.data.frame(table(nORF_annotation$ORFannotation))
  classCounts$Freq[3] <- classCounts$Freq[3] + classCounts$Freq[6]

  # Rows to display, in display order (positional indices into the table),
  # with their display names and colour groups.
  annotationTable <- classCounts[c(16,17,19, 3,5 ,13, 8,7, 1,9,10,11,12),]
  annotationTable$Freq_t <- trans(annotationTable$Freq)
  annotationTable$Var1 <- c("5'UTR", "5'UTR-altCDS", "5'UTR-intronic", "altCDS", "altCDS-intronic", "3'UTR", "intronic", "intergenic", "antisense", "ncRNA", "nmd", "pseudogene", "retained_intron")
  annotationTable$colorCode <- c("o","o","o","b","b","o","g","g","r","r","r","r","r")
  # Factor levels in row order keep the bars in the order chosen above.
  annotationTable$label <- factor(annotationTable$Var1, levels = annotationTable$Var1)

  ggplot(annotationTable, aes(x = label, y = Freq_t, fill = colorCode)) +
    geom_bar(colour = "black", stat = "identity") +
    # White band plus two dashed lines drawn over the bars at the break.
    geom_rect(aes(xmin = 0.4, xmax = Inf, ymin = gapLower, ymax = gapUpper), fill = 'white') +
    geom_hline(yintercept = gapLower, color = 'gray', linetype = 'dashed') +
    geom_hline(yintercept = gapUpper, color = 'gray', linetype = 'dashed') +
    scale_fill_manual(values = c("#56B4E9", "#009E73", "#E69F00", "#CC6666"), guide = 'none') +
    scale_y_continuous(limits = c(0, NA), breaks = trans(yticks), labels = comma(ylabels),
                       expand = c(0, 0), position = "left") +
    labs(x = NULL, y = "Count") +
    theme_minimal() +
    theme(axis.text.y = element_text(size = 18),
          axis.text.x = element_text(angle = 40, hjust = 1, size = 18),
          axis.title = element_text(size = 20),
          plot.margin = unit(c(0.5,3,0,2),"cm"),
          axis.line.y = element_blank())
}
figure3b()

# Figure 3c: violin + box plots (log10 y axis) of amino-acid lengths for four
# groups of nORFs and for the Swiss-Prot length table.
# Reads the global `nORF_annotation` and two files; returns the ggplot object.
figure3c <- function() {
  # Swiss-Prot lengths: two columns (character, integer), tagged "canonical".
  swissprotLengths <- read_tsv("dataFiles/uniprot_swissprot_lengths_June19.tsv", col_types = "ci")
  colnames(swissprotLengths) <- c("name", "length")
  swissprotLengths <- mutate(swissprotLengths, type = "canonical")

  # nORF id and length are taken from the 1st and 4th "; "-separated fields of
  # the GTF attribute column, then stripped of their key and quotes.
  annotationLookup <- dplyr::select(nORF_annotation, novelORF_ID, ORFannotation)
  norfLengths0 <- read_tsv("noInFrame_38.gtf", col_names = FALSE, col_types = 'ccciicccc')
  norfLengths0 <- separate(norfLengths0, X9, into = c("novelORF_ID", NA, NA, "length"), sep = "; ")
  norfLengths0 <- mutate(
    norfLengths0,
    novelORF_ID = novelORF_ID %>% str_remove("gene_id \"") %>% str_remove("\""),
    length = length %>% str_remove("sorf_length \"") %>% str_remove("\";") %>% as.numeric()
  )
  norfLengths0 <- left_join(norfLengths0, annotationLookup, by = "novelORF_ID")

  # Collapse the detailed annotation into four plotted groups; anything that
  # matches none of them (including unmatched ids) is dropped.
  ncRNAClasses <- c("ncRNA", "pseudogene", "nmd", "bidirectional_promoter_lncRNA", "retained_intron", "antisense")
  altCDSClasses <- c("cds", "cds-intronic", "cds-utr3", "cds-intergenic")
  utrClasses <- c("utr5", "utr5-intronic", "utr5-cds","utr3", "utr3-intergenic", "utr5-intergenic", "utr3-intronic")
  norfLengths <- norfLengths0 %>%
    mutate(type = case_when(
      ORFannotation %in% ncRNAClasses ~ "ncRNA",
      ORFannotation %in% altCDSClasses ~ "altCDS",
      ORFannotation %in% utrClasses ~ "UTR",
      ORFannotation %in% c("intergenic") ~ "intergenic",
      TRUE ~ "other"
    )) %>%
    filter(type != "other")

  lengthTable <- bind_rows(swissprotLengths, norfLengths)
  # Ordered factor fixes the left-to-right order of the violins.
  lengthTable$type <- factor(lengthTable$type,
                             levels = c("UTR", "altCDS", "intergenic", "ncRNA" ,"canonical"),
                             ordered = TRUE)

  ggplot(lengthTable, aes(factor(type), length)) +
    geom_violin(aes(fill = factor(type)), scale = "area") +
    # Narrow box plot on top of each violin; outliers are not drawn.
    geom_boxplot(width = 0.1, outlier.shape = NA) +
    scale_y_continuous(trans = 'log10') +
    scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#CC6666", "#D3D3D3"), guide = 'none') +
    labs(x = NULL, y = "Length (AA)") +
    theme_minimal() +
    theme(axis.text.y = element_text(size = 18),
          axis.text.x = element_text(angle = 40, hjust = 1, size = 18),
          axis.title = element_text(size = 20))
}
figure3c()


```

## Figure 3d: Circos Genomic density plot
```{r circos}
# Figure 3d: genomic density of 194,407 nORFs against 754,731 known
# protein-coding region CDS.
norf <- read.table("noInFrame_38.bed", stringsAsFactors = FALSE)
cds <- read.table("gencode.v30.annotation.CDS.bed", stringsAsFactors = FALSE)

# hg38 ideogram limited to chr1-22, X and Y, with plotType set to labels.
circos.initializeWithIdeogram(
  species = 'hg38',
  chromosome.index = paste0("chr", c(1:22, "X", "Y")),
  plotType = c('labels')
)
# One density track per region set: canonical CDS first, then nORFs.
circos.genomicDensity(cds, col = "#D3D3D3")
circos.genomicDensity(norf, col = "darkblue")
legend(
  x = 1, y = 0.4,
  legend = c("canonical", "nORFs"),
  fill = c("#D3D3D3", "darkblue"),
  cex = 2, bty = "n"
)
```

## Full Figure 3
```{r Fig3}
# Full Figure 3: panels b and c side by side, written to plots/figure3.png.
p3b <- figure3b() + theme(plot.margin = margin(5.5, 8.5, 5.5, 5.5))
p3c <- figure3c()
png('plots/figure3.png', height=6, width=16, units = 'in', res=300)
# Explicit print(): the figure is drawn into the PNG device even when this
# code is source()d or moved into a function, where a bare ggarrange() value
# would not be auto-printed.
print(ggarrange(p3b, p3c, widths = c(2,1), labels = c('b', 'c')))
dev.off()
```

# 3. LDSC

## Random-effect meta-analysis across traits
With Supplementary table 2
```{r}
# Random-effect meta-analysis across traits.
# Trait prefixes of the LDSC result files: each trait is read from
# <folder><trait>.customLF.enrichments and <folder><trait>.customLF.log.
# (A bare `stringsAsFactors=FALSE` assignment used to sit here; it only
# created an unused global variable and has been removed.)
resultFiles <- c(
  "blood_EOSINOPHIL_COUNT", "blood_PLATELET_COUNT", "blood_RBC_DISTRIB_WIDTH",
  "blood_RED_COUNT", "blood_WHITE_COUNT",
  "bmd_HEEL_TSCOREz",
  "body_BALDING1", "body_BMIz", "body_HEIGHTz", "body_WHRadjBMIz",
  "bp_SYSTOLICadjMEDz",
  "cov_EDU_COLLEGE", "cov_SMOKING_STATUS",
  "disease_AID_ALL", "disease_ALLERGY_ECZEMA_DIAGNOSED",
  "disease_ASTHMA_DIAGNOSED", "disease_CARDIOVASCULAR", "disease_T2D",
  "lung_FEV1FVCzSMOKE", "lung_FVCzSMOKE",
  "mental_NEUROTICISM",
  "other_MORNINGPERSON",
  "pigment_SUNBURN",
  "repro_AgeFirstBirth_Female", "repro_MENARCHE_AGE", "repro_MENOPAUSE_AGE",
  "repro_NumberChildrenEverBorn_Pooled"
)

# Random-effects meta-analysis of common-variant enrichment (CVE) across traits.
#
# For every trait this reads `<folder><trait>.customLF.enrichments` and
# `<folder><trait>.customLF.log`, then pools the per-trait values annotation
# by annotation with rmeta::meta.summaries(method = "random").
#
# Args:
#   resultFiles: character vector of trait prefixes (at least one).
#   folder:      path prefix pasted directly in front of each trait name, so
#                it must end with a path separator.
# Returns:
#   data.frame with columns Annotation, Prop_common_snps, CVE, CVE_se and
#   CVE_pval; annotation names and proportions come from the last trait read.
#   CVE_pval is NA for rows whose prop_common_snps equals 1.
metaAnalysisCommon <- function(resultFiles, folder) {
  if (length(resultFiles) == 0) {
    stop("resultFiles must contain at least one trait", call. = FALSE)
  }
  # Constant the per-trait h2 is divided by when scaling the test statistic.
  # NOTE(review): presumably the number of SNPs behind the h2 estimate - confirm.
  M <- 5961159

  perTrait <- lapply(resultFiles, function(trait) {
    data <- read_tsv(paste0(folder, trait, ".customLF.enrichments"),
                     col_names = TRUE, col_types = 'cdddddddddddddddd')
    log <- read.table(paste0(folder, trait, ".customLF.log"), header = FALSE, fill = TRUE)
    # h2 is the 5th field of the log line whose 4th field is "h2:".
    h2g <- as.numeric(as.character(log[which(log$V4 == "h2:"), 5]))
    # Test statistic: CVE minus (1 - CVE * p) / (1 - p), p = prop_common_snps,
    # scaled by h2g / M. Its sd is recovered from the reported p-value.
    enrstat <- (h2g / M) *
      (data$CVE - (1 - data$CVE * data$prop_common_snps) / (1 - data$prop_common_snps))
    list(data = data,
         enr = data$CVE,
         enr_sd = data$CVE_se,
         enrstat = enrstat,
         enrstat_sd = enrstat / qnorm(data$CVE_p / 2))
  })

  # Rows are matched across traits by position, so every file must have the
  # same number of annotation rows.
  nAnnot <- vapply(perTrait, function(res) nrow(res$data), integer(1))
  if (length(unique(nAnnot)) != 1) {
    stop("enrichment files do not all have the same number of annotation rows", call. = FALSE)
  }

  # annotation x trait matrices
  bindTraits <- function(field) do.call(cbind, lapply(perTrait, `[[`, field))
  enr <- bindTraits("enr")
  enr_sd <- bindTraits("enr_sd")
  enrstat <- bindTraits("enrstat")
  enrstat_sd <- bindTraits("enrstat_sd")
  data <- perTrait[[length(perTrait)]]$data

  # Columns: pooled estimate, its standard error, two-sided p-value.
  enr_meta <- matrix(NA_real_, nrow = nrow(enr), ncol = 3)
  for (i in seq_len(nrow(enr))) {
    test1 <- meta.summaries(enr[i, ], enr_sd[i, ], method = "random")
    enr_meta[i, 1:2] <- c(test1$summary, test1$se.summary)
    if (data$prop_common_snps[i] == 1) {
      # Base annotation: the statistic divides by 1 - prop, so no p-value.
      enr_meta[i, 3] <- NA
    } else {
      test2 <- meta.summaries(enrstat[i, ], enrstat_sd[i, ], method = "random")
      enr_meta[i, 3] <- 2 * pnorm(-abs(test2$summary / test2$se.summary))
    }
  }

  # as.numeric(as.character(.)) is kept from the original so that the values
  # written downstream are unchanged.
  out <- data.frame(as.character(data$annotation),
                    as.numeric(as.character(data$prop_common_snps)),
                    as.numeric(as.character(enr_meta[, 1])),
                    as.numeric(as.character(enr_meta[, 2])),
                    as.numeric(as.character(enr_meta[, 3])))
  colnames(out) <- c("Annotation", "Prop_common_snps", "CVE", "CVE_se", "CVE_pval")
  out
}

# Random-effects meta-analysis of low-frequency-variant enrichment (LFVE)
# across traits.
#
# For every trait this reads `<folder><trait>.customLF.enrichments`, then
# pools the per-trait values annotation by annotation with
# rmeta::meta.summaries(method = "random"). The `.customLF.log` files are no
# longer read: the value taken from them was never used in this function.
#
# Args:
#   resultFiles: character vector of trait prefixes (at least one).
#   folder:      path prefix pasted directly in front of each trait name, so
#                it must end with a path separator.
# Returns:
#   data.frame with columns Annotation, Prop_lowfrq_snps, LFVE, LFVE_se and
#   LFVE_pval; annotation names and proportions come from the last trait read.
#   LFVE_pval is NA for rows whose prop_common_snps or prop_lowfrq_snps is 1.
metaAnalysisLowfrq <- function(resultFiles, folder) {
  if (length(resultFiles) == 0) {
    stop("resultFiles must contain at least one trait", call. = FALSE)
  }

  perTrait <- lapply(resultFiles, function(trait) {
    data <- read_tsv(paste0(folder, trait, ".customLF.enrichments"),
                     col_names = TRUE, col_types = 'cdddddddddddddddd')
    # Test statistic: LFVE minus (1 - LFVE * p) / (1 - p), p = prop_lowfrq_snps,
    # times a fixed factor of 3. Its sd is recovered from the reported p-value.
    enrstat <- 3 *
      (data$LFVE - (1 - data$LFVE * data$prop_lowfrq_snps) / (1 - data$prop_lowfrq_snps))
    list(data = data,
         enr = data$LFVE,
         enr_sd = data$LFVE_se,
         enrstat = enrstat,
         enrstat_sd = enrstat / qnorm(data$LFVE_p / 2))
  })

  # Rows are matched across traits by position, so every file must have the
  # same number of annotation rows.
  nAnnot <- vapply(perTrait, function(res) nrow(res$data), integer(1))
  if (length(unique(nAnnot)) != 1) {
    stop("enrichment files do not all have the same number of annotation rows", call. = FALSE)
  }

  # annotation x trait matrices
  bindTraits <- function(field) do.call(cbind, lapply(perTrait, `[[`, field))
  enr <- bindTraits("enr")
  enr_sd <- bindTraits("enr_sd")
  enrstat <- bindTraits("enrstat")
  enrstat_sd <- bindTraits("enrstat_sd")
  data <- perTrait[[length(perTrait)]]$data

  # Columns: pooled estimate, its standard error, two-sided p-value.
  enr_meta <- matrix(NA_real_, nrow = nrow(enr), ncol = 3)
  for (i in seq_len(nrow(enr))) {
    test1 <- meta.summaries(enr[i, ], enr_sd[i, ], method = "random")
    enr_meta[i, 1:2] <- c(test1$summary, test1$se.summary)
    # The statistic above divides by 1 - prop_lowfrq_snps, so a row with
    # prop_lowfrq_snps == 1 cannot be tested.
    # NOTE(review): the original guard tested prop_common_snps only (as in
    # metaAnalysisCommon); it is kept so rows that were NA stay NA.
    untestable <- data$prop_common_snps[i] == 1 ||
      isTRUE(data$prop_lowfrq_snps[i] == 1)
    if (untestable) {
      enr_meta[i, 3] <- NA
    } else {
      test2 <- meta.summaries(enrstat[i, ], enrstat_sd[i, ], method = "random")
      enr_meta[i, 3] <- 2 * pnorm(-abs(test2$summary / test2$se.summary))
    }
  }

  # as.numeric(as.character(.)) is kept from the original so that the values
  # written downstream are unchanged.
  out <- data.frame(as.character(data$annotation),
                    as.numeric(as.character(data$prop_lowfrq_snps)),
                    as.numeric(as.character(enr_meta[, 1])),
                    as.numeric(as.character(enr_meta[, 2])),
                    as.numeric(as.character(enr_meta[, 3])))
  colnames(out) <- c("Annotation", "Prop_lowfrq_snps", "LFVE", "LFVE_se", "LFVE_pval")
  out
}

# Pool the common and low-frequency enrichments over all traits.
common <- metaAnalysisCommon(resultFiles, folder = "dataFiles/LDSCresults/")
lowfrq <- metaAnalysisLowfrq(resultFiles, folder = "dataFiles/LDSCresults/")

# One row per annotation with both sets of columns, plus the LFVE/CVE ratio.
fullMeta <- left_join(common, lowfrq, by = "Annotation")
fullMeta <- mutate(fullMeta, 'LFVE/CVE' = LFVE/CVE)

# Supplementary table 2
write_tsv(fullMeta, "plots/supTable2.txt")
```

## Figure 4
```{r LDSC_Figure}

#COMMON
# Figure 4a: bar chart of the pooled common-variant enrichment (CVE), with
# +/- one standard error bars, for seven annotations taken from `fullMeta`
# by row position. Returns the ggplot object.
figure4a <- function(fullMeta) {
  altTable <- fullMeta[c(68,69,71,70,72,74,73),]
  altTable$Annotation <- c("Transcribed", "Canonical CDS", "5'UTR", "3'UTR", "nORFs", "nORFs_altCDS", "nORFs_noCDS")
  altTable$colorCode <- rep(c("Canonical", "nORFs"), times = c(4, 3))

  # x-axis labels: annotation name plus its percentage of common variants.
  # Factor levels in row order keep the bars in the order chosen above.
  pctCommon <- round(altTable$Prop_common_snps * 100, digits = 2)
  barLabels <- paste0(altTable$Annotation, " (", pctCommon, "%)")
  altTable$commonlabel <- factor(barLabels, levels = barLabels)

  ggplot(altTable, aes(x = commonlabel, y = CVE, fill = colorCode)) +
    geom_bar(stat = "identity", width = .6) +
    geom_errorbar(aes(ymin = CVE - CVE_se, ymax = CVE + CVE_se), width = .2, position = position_dodge(.9)) +
    geom_hline(yintercept = 1, linetype = "dashed") +
    # Placed to the right of the last bar; shown because clipping is off.
    annotate("text", x = 8.25, y = 1, label = "No enrichment", size = 3) +
    scale_fill_manual(values = c("#56B4E9", "#999999"), name = NULL) +
    scale_y_continuous(breaks = c(0,2,4,6,8,10)) +
    coord_cartesian(xlim = c(1, 7), clip = 'off') +
    labs(x = "Annotation (% of common variants)", y = "CVE") +
    theme_minimal() +
    theme(axis.text.y = element_text(size = 10),
          axis.text.x = element_text(angle = 40, hjust = 1, size = 10),
          axis.title = element_text(size = 14),
          axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)),
          axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0)),
          plot.margin = unit(c(0.5,0.5,1.5,1),"cm"))
}
figure4a(fullMeta)

#LOWFRQ
# Figure 4b: bar chart of the pooled low-frequency-variant enrichment (LFVE),
# with +/- one standard error bars, for seven annotations taken from
# `fullMeta` by row position. Returns the ggplot object.
figure4b <- function(fullMeta) {
  altTableLF <- fullMeta[c(68,69,71,70,72,74,73),]
  altTableLF$Annotation <- c("Transcribed", "Canonical CDS", "5'UTR", "3'UTR", "nORFs", "nORFs_altCDS", "nORFs_noCDS")
  altTableLF$colorCode <- rep(c("Canonical", "nORFs"), times = c(4, 3))

  # x-axis labels: annotation name plus its percentage of low-frequency
  # variants. Factor levels in row order keep the bars in the chosen order.
  pctLowfrq <- round(altTableLF$Prop_lowfrq_snps * 100, digits = 2)
  barLabels <- paste0(altTableLF$Annotation, " (", pctLowfrq, "%)")
  altTableLF$lowfrqlabel <- factor(barLabels, levels = barLabels)

  ggplot(altTableLF, aes(x = lowfrqlabel, y = LFVE, fill = colorCode)) +
    geom_bar(stat = "identity", width = .6) +
    geom_errorbar(aes(ymin = LFVE - LFVE_se, ymax = LFVE + LFVE_se), width = .2, position = position_dodge(.9)) +
    geom_hline(yintercept = 1, linetype = "dashed") +
    # Placed to the right of the last bar; shown because clipping is off.
    annotate("text", x = 8.25, y = 1, label = "No enrichment", size = 3) +
    scale_fill_manual(values = c("#56B4E9", "#999999"), name = NULL) +
    coord_cartesian(xlim = c(1, 7), clip = 'off') +
    labs(x = "Annotation (% of low freq variants)", y = "LFVE") +
    theme_minimal() +
    theme(axis.text.y = element_text(size = 10),
          axis.text.x = element_text(angle = 40, hjust = 1, size = 10),
          axis.title = element_text(size = 14),
          axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)),
          axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0)),
          plot.margin = unit(c(0,0.5,1.5,1),"cm"))
}
figure4b(fullMeta)

# Figure 4c: scatter of LFVE against CVE for the same seven annotations, with
# +/- one standard error bars in both directions and dashed guide lines of
# slope 1 to 5 through the origin. Returns the ggplot object.
figure4c <- function(fullMeta) {
  ratioTable <- fullMeta[c(68,69,71,70,72,74,73),]
  ratioTable$Annotation <- c("Transcribed", "Canonical CDS", "5'UTR", "3'UTR", "nORFs", "nORFs_altCDS", "nORFs_noCDS")
  ratioTable$colorCode <- rep(c("Canonical", "nORFs"), times = c(4, 3))

  # One dashed guide line per LFVE/CVE ratio.
  ratioLines <- lapply(c(1, 2, 3, 4, 5), function(ratio) {
    geom_abline(slope = ratio, intercept = 0, linetype = "dashed", colour = "lightgrey")
  })
  # Hand-placed captions for the guide lines.
  ratioCaptions <- Map(
    function(x, y, label) {
      annotate("text", x = x, y = y, label = label, size = 3, colour = "darkgrey")
    },
    c(11, 11, 11, 8.6, 6.7),
    c(12, 23.2, 35, 36, 36),
    c("x1", "x2", "x3", "x4", "x5")
  )

  ggplot(ratioTable, aes(x = CVE, y = LFVE, label = Annotation, color = colorCode, shape = colorCode)) +
    geom_point(size = 3) +
    geom_errorbar(aes(ymin = LFVE - LFVE_se, ymax = LFVE + LFVE_se)) +
    geom_errorbarh(aes(xmin = CVE - CVE_se, xmax = CVE + CVE_se)) +
    ratioLines +
    ratioCaptions +
    geom_text_repel(colour = "black", point.padding = .5) +
    scale_color_manual(values = c("#56B4E9", "#999999"), name = NULL) +
    scale_shape_manual(values = c(16, 17), name = NULL) +
    scale_x_continuous(limits = c(0, NA), breaks = c(0,2,4,6,8,10)) +
    theme_minimal() +
    theme(axis.title = element_text(size = 14),
          axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)),
          axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0)))
}
figure4c(fullMeta)


# Writes Figure 4 (panels a-c stacked vertically) to plots/figure4.png.
# Reads the global `fullMeta`; returns the value of dev.off().
figure4 <- function() {
  p4a <- figure4a(fullMeta)
  p4b <- figure4b(fullMeta)
  p4c <- figure4c(fullMeta)
  png('plots/figure4.png', height=14, width=8, units = 'in', res=300)
  # Inside a function the arranged plot is not auto-printed, so without an
  # explicit print() nothing is drawn into the PNG device.
  print(ggarrange(p4a, p4b, p4c, nrow = 3, ncol = 1, labels = c('a', 'b', 'c')))
  dev.off()
}
figure4()
```

# 4. MAPS

```{r MAPS}

### Code to create figures for gnomAD MAPS analysis

# Colours of the synonymous / missense / stop-gained reference lines.
color_syn <- '#AAAAAA'
color_mis <- '#FF6103'
color_lof <- '#9D1309'

# Reference MAPS values from the exome table: rows flagged protein_coding
# with the given worst consequence.
plainMapsExomes <- read_tsv("dataFiles/MAPSresults/maps_plain_exomes.txt", col_names = TRUE, col_types = 'cldddddd')
syn_exomes_maps <- plainMapsExomes %>%
  filter(protein_coding, worst_csq == 'synonymous_variant') %>%
  pull(maps)
mis_exomes_maps <- plainMapsExomes %>%
  filter(protein_coding, worst_csq == 'missense_variant') %>%
  pull(maps)
nonsense_exomes_maps <- plainMapsExomes %>%
  filter(protein_coding, worst_csq == 'stop_gained') %>%
  pull(maps)
stoplost_exomes_maps <- plainMapsExomes %>%
  filter(protein_coding, worst_csq == 'stop_lost') %>%
  pull(maps)

# The same four reference values from the genome table.
plainMapsGenomes <- read_tsv("dataFiles/MAPSresults/maps_plain_genomes.txt", col_names = TRUE, col_types = 'cldddddd')
syn_genomes_maps <- plainMapsGenomes %>%
  filter(protein_coding, worst_csq == 'synonymous_variant') %>%
  pull(maps)
mis_genomes_maps <- plainMapsGenomes %>%
  filter(protein_coding, worst_csq == 'missense_variant') %>%
  pull(maps)
nonsense_genomes_maps <- plainMapsGenomes %>%
  filter(protein_coding, worst_csq == 'stop_gained') %>%
  pull(maps)
stoplost_genomes_maps <- plainMapsGenomes %>%
  filter(protein_coding, worst_csq == 'stop_lost') %>%
  pull(maps)


# Turn VEP consequence terms into short display labels, e.g.
# "5_prime_UTR_variant" -> "5'UTR". Works element-wise on a character vector.
format_vep_category <- function(category_list) {
  readable <- gsub("_", " ", category_list)           # underscores to spaces
  readable <- gsub(" variant", "", readable)          # drop the " variant" suffix
  readable <- gsub("non coding transcript exon", "ncRNA", readable)
  gsub(" prime ", "'", readable)                      # "5 prime UTR" -> "5'UTR"
}

# Re-aggregate a MAPS table after categories have been merged.
#
# Args:
#   data:          table with singleton_count, expected_singletons and
#                  variant_count columns plus the grouping columns.
#   maps_grouping: character vector naming the columns to group by.
# Returns:
#   One ungrouped row per group with the summed counts, the proportion of
#   singletons (ps), maps, its standard error (maps_sem) and the
#   maps +/- 1.96 * maps_sem bounds.
regroup_maps <- function(data, maps_grouping) {
  data %>%
    # The column names are passed as a character vector. Wrapping them in
    # vars() would first look for a column literally called `maps_grouping`
    # and only then fall back to the variable.
    group_by_at(maps_grouping) %>%
    # Later expressions use the summed counts defined just above them.
    dplyr::summarize(singleton_count = sum(singleton_count),
                     expected_singletons = sum(expected_singletons),
                     variant_count = sum(variant_count),
                     ps = singleton_count / variant_count,
                     maps = (singleton_count - expected_singletons) / variant_count,
                     maps_sem = sqrt(ps * (1 - ps) / variant_count),
                     maps_upper = maps + 1.96 * maps_sem,
                     maps_lower = maps - 1.96 * maps_sem) %>%
    ungroup()
}

# Canonical (worst_csq) consequence terms kept in the MAPS analysis.
canonicalChosenCsq <- c(
  "missense_variant",
  "non_coding_transcript_exon_variant",
  "synonymous_variant",
  "intron_variant",
  "5_prime_UTR_variant",
  "3_prime_UTR_variant",
  "intergenic_variant"
)
# nORF (nORF_csq) consequence terms kept in the MAPS analysis.
nORFsChosenCsq <- c(
  "missense_variant",
  "stop_gained",
  "stop_lost",
  "splice_acceptor_variant",
  "splice_donor_variant",
  "synonymous_variant",
  "intron_variant",
  "intergenic_variant",
  "upstream_gene_variant",
  "downstream_gene_variant"
)

# Load one MAPS result table and optionally merge consequence categories.
#
# Args:
#   data_type:       suffix of the input file name (e.g. 'exomes', 'genomes').
#   type:            which table to read:
#                      "segmented" -> maps_norfs_<data_type>.txt
#                      "nofilter"  -> noCanonicalFilter_<data_type>.txt
#                      "onlynorfs" -> onlynorfs_<data_type>.txt (this table
#                                     has no worst_csq column)
#   group_splice:    merge 'splice donor' and 'splice acceptor' into
#                    'essential splice' and re-aggregate.
#   group_noncoding: merge 'intron', 'intergenic', 'downstream gene' and
#                    'upstream gene' into 'non-coding' and re-aggregate.
# Returns:
#   The table restricted to the globals `canonicalChosenCsq` /
#   `nORFsChosenCsq`, with display labels from format_vep_category(), and
#   with rows of variant_count <= 100 removed.
# An unknown `type` now fails with an explicit error from match.arg()
# instead of "object 'mapsData' not found".
load_maps_data <- function(data_type = 'exomes', type = "segmented", group_splice = TRUE, group_noncoding = TRUE) {
  type <- match.arg(type, c("segmented", "nofilter", "onlynorfs"))

  # "segmented" and "nofilter" have both consequence columns and are handled
  # identically; "onlynorfs" has nORF_csq only.
  has_canonical <- type != "onlynorfs"
  file_prefix <- switch(type,
                        segmented = 'maps_norfs_',
                        nofilter = 'noCanonicalFilter_',
                        onlynorfs = 'onlynorfs_')
  col_types <- if (has_canonical) 'ccididdd' else 'cididdd'
  grouping <- if (has_canonical) c('worst_csq', 'nORF_csq') else 'nORF_csq'

  mapsData <- read_tsv(paste0('dataFiles/MAPSresults/', file_prefix, data_type, '.txt'),
                       col_types = col_types)
  if (has_canonical) {
    mapsData <- mapsData %>%
      filter(worst_csq %in% canonicalChosenCsq) %>%
      mutate(worst_csq = format_vep_category(worst_csq))
  }
  mapsData <- mapsData %>%
    filter(nORF_csq %in% nORFsChosenCsq) %>%
    mutate(nORF_csq = format_vep_category(nORF_csq),
           maps_upper = maps + 1.96 * maps_sem,
           maps_lower = maps - 1.96 * maps_sem)

  # Each merge changes category labels, so counts and MAPS are recomputed
  # per merged category by regroup_maps().
  if (group_splice) {
    mapsData <- mapsData %>%
      mutate(nORF_csq = fct_recode(nORF_csq,
                                   'essential splice' = 'splice donor',
                                   'essential splice' = 'splice acceptor')) %>%
      regroup_maps(grouping)
  }
  if (group_noncoding) {
    mapsData <- mapsData %>%
      mutate(nORF_csq = fct_recode(nORF_csq,
                                   'non-coding' = 'intron',
                                   'non-coding' = 'intergenic',
                                   'non-coding' = 'downstream gene',
                                   'non-coding' = 'upstream gene')) %>%
      regroup_maps(grouping)
  }

  # Keep only categories with more than 100 variants.
  mapsData %>%
    filter(variant_count > 100)
}


# Tables without the canonical filter are used below; the filtered
# equivalents can be loaded with type = "segmented":
#mapsData = load_maps_data(data_type = 'exomes', type = "segmented")
#mapsData = load_maps_data(data_type = 'genomes', type = "segmented")

# Display order of the canonical and nORF consequence labels.
ordering_worst_csq <- c(
  'intergenic', 'intron', "5'UTR", "3'UTR", 'ncRNA', 'synonymous', 'missense'
)
ordering_nORF_csq <- c(
  'non-coding', 'synonymous', 'missense', 'stop lost', 'stop gained'
)

# MAPS tables with both consequence columns put into display order.
exomesData <- load_maps_data(data_type = 'exomes', type = "nofilter") %>%
  mutate(worst_csq = fct_relevel(worst_csq, ordering_worst_csq),
         nORF_csq = fct_relevel(nORF_csq, ordering_nORF_csq))
genomesData <- load_maps_data(data_type = 'genomes', type = "nofilter") %>%
  mutate(worst_csq = fct_relevel(worst_csq, ordering_worst_csq),
         nORF_csq = fct_relevel(nORF_csq, ordering_nORF_csq))

# Variant counts for the exome tile plot. Four intergenic combinations are
# appended with a count of zero before the columns are put into display order.
exomesTileData <- load_maps_data(data_type = 'exomes', type = "nofilter") %>%
  dplyr::select(worst_csq, nORF_csq, variant_count) %>%
  bind_rows(
    tibble(worst_csq = "intergenic",
           nORF_csq = c("missense", "stop lost", "stop gained", "synonymous"),
           variant_count = 0)
  ) %>%
  mutate(worst_csq = fct_relevel(worst_csq, ordering_worst_csq),
         nORF_csq = fct_relevel(nORF_csq, ordering_nORF_csq))
  
# Exome tile plot: variant count for every canonical x nORF consequence pair,
# shown as a number and as a fill colour on a pseudo-log scale.
exomesTiles <- ggplot(exomesTileData, aes(worst_csq, nORF_csq)) +
  geom_tile(aes(fill = variant_count), colour = "black") +
  geom_text(aes(label = comma(variant_count)), size = 3.5) +
  scale_fill_gradient(
    low = "white", high = "#56B4E9",
    trans = pseudo_log_trans(sigma = 100),
    breaks = c(1e+03, 1e+05, 1e+07),
    limits = c(NA, 1.1e+08),
    name = "Variant Count"
  ) +
  labs(title = "Exomes", x = "Canonical Consequence", y = "nORF Consequence") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 40, hjust = 1),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)),
        axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0)),
        plot.title = element_text(size = 15, vjust = 1, hjust = 0.5))
exomesTiles

# Genome tile plot: variant count for every canonical x nORF consequence pair
# present in `genomesData`, shown as a number and as a fill colour on a
# pseudo-log scale.
genomesTiles <- ggplot(genomesData, aes(worst_csq, nORF_csq)) +
  geom_tile(aes(fill = variant_count), colour = "black") +
  geom_text(aes(label = comma(variant_count)), size = 3) +
  scale_fill_gradient(
    low = "white", high = "#56B4E9",
    trans = pseudo_log_trans(sigma = 100),
    breaks = c(1e+03, 1e+05, 1e+07),
    limits = c(NA, 1.1e+08),
    name = "Variant Count"
  ) +
  labs(title = "Genomes", x = "Canonical Consequence", y = "nORF Consequence") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 40, hjust = 1),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0)),
        axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0)),
        plot.title = element_text(size = 15, vjust = 1, hjust = 0.5))
genomesTiles

# Writes the exome and genome tile plots, stacked, to plots/figureTiles.pdf
# and plots/figureTiles.png. Reads the globals `exomesTiles` and
# `genomesTiles`; returns the value of the last dev.off().
figureTiles <- function() {
  pMa <- exomesTiles + theme(plot.margin = unit(c(0.1,0,1.1,0.1), "cm"))
  pMb <- genomesTiles + theme(plot.margin = unit(c(0.1,0.1,0.1,0.1), "cm"))
  # Inside a function the arranged plot is not auto-printed, so each device
  # needs an explicit print() or the file is written without the figure.
  pdf('plots/figureTiles.pdf', height=12, width=8)
  print(ggarrange(pMa, pMb, labels = c('a', 'b'), ncol = 1, nrow = 2))
  dev.off()
  png('plots/figureTiles.png', height=12, width=8, units = 'in', res=300)
  print(ggarrange(pMa, pMb, labels = c('a', 'b'), ncol = 1, nrow = 2))
  dev.off()
}
figureTiles()

#Segmented plot
# Exome MAPS per canonical consequence, one dodged point range per nORF
# consequence. Dashed lines mark the reference MAPS of protein-coding
# missense, synonymous and stop-gained variants.
exomesPlot <- ggplot(
  exomesData,
  aes(x = worst_csq, y = maps, ymin = maps_lower, ymax = maps_upper, color = nORF_csq)
) +
  geom_pointrange(position = position_dodge(width = 0.5)) +
  geom_hline(yintercept = mis_exomes_maps, color = color_mis, linetype = 'dashed') +
  geom_hline(yintercept = syn_exomes_maps, color = color_syn, linetype = 'dashed') +
  geom_hline(yintercept = nonsense_exomes_maps, color = color_lof, linetype = 'dashed') +
  # Captions sit just above their reference line, right-aligned near the
  # first category.
  annotate('text', x = 0.9, y = mis_exomes_maps + 0.007, size = 3, hjust = 1, color = color_mis, label = 'missense') +
  annotate('text', x = 1.025, y = syn_exomes_maps + 0.007, size = 3, hjust = 1, color = color_syn, label = 'synonymous') +
  annotate('text', x = 1.00, y = nonsense_exomes_maps + 0.007, size = 3, hjust = 1, color = color_lof, label = 'stop gained') +
  scale_y_continuous(limits = c(-0.07, 0.16), breaks = c(-0.05, 0, 0.05, 0.1, 0.15)) +
  guides(color = guide_legend(reverse = TRUE)) +
  labs(title = "Exomes", x = "Canonical Consequence", y = "MAPS Score", color = 'nORF Consequence') +
  theme_classic() +
  theme(plot.title = element_text(size = 15, vjust = 1, hjust = 0.5),
        plot.margin = margin(0, 5.5, 0, 5.5))
exomesPlot


# Genome MAPS per canonical consequence, one dodged point range per nORF
# consequence. Dashed lines mark the reference MAPS of protein-coding
# missense, synonymous and stop-gained variants.
genomesPlot <- ggplot(
  genomesData,
  aes(x = worst_csq, y = maps, ymin = maps_lower, ymax = maps_upper, color = nORF_csq)
) +
  geom_pointrange(position = position_dodge(width = 0.5)) +
  geom_hline(yintercept = mis_genomes_maps, color = color_mis, linetype = 'dashed') +
  geom_hline(yintercept = syn_genomes_maps, color = color_syn, linetype = 'dashed') +
  geom_hline(yintercept = nonsense_genomes_maps, color = color_lof, linetype = 'dashed') +
  # Captions sit just above their reference line, right-aligned near the
  # first category.
  annotate('text', x = 0.9, y = mis_genomes_maps + 0.007, size = 3, hjust = 1, color = color_mis, label = 'missense') +
  annotate('text', x = 1.025, y = syn_genomes_maps + 0.007, size = 3, hjust = 1, color = color_syn, label = 'synonymous') +
  annotate('text', x = 1.0, y = nonsense_genomes_maps + 0.007, size = 3, hjust = 1, color = color_lof, label = 'stop gained') +
  scale_y_continuous(limits = c(-0.07, 0.19), breaks = c(-0.05, 0, 0.05, 0.1, 0.15)) +
  guides(color = guide_legend(reverse = TRUE)) +
  labs(title = "Genomes", x = "Canonical Consequence", y = "MAPS Score", color = 'nORF Consequence') +
  theme_classic() +
  theme(plot.title = element_text(size = 15, vjust = 1, hjust = 0.5),
        plot.margin = unit(c(1,0,1,0), "cm"))
genomesPlot

# Writes the exome and genome MAPS plots, stacked, to plots/figureMAPS.pdf
# and plots/figureMAPS.png. Reads the globals `exomesPlot` and `genomesPlot`;
# returns the value of the last dev.off().
figureMAPS <- function() {
  pMa <- exomesPlot + theme(plot.margin = unit(c(0.1,0.1,1,0.1), "cm"))
  pMb <- genomesPlot + theme(plot.margin = unit(c(0.1,0.1,1,0.1), "cm"))
  # Inside a function the arranged plot is not auto-printed, so each device
  # needs an explicit print() or the file is written without the figure.
  pdf('plots/figureMAPS.pdf', height=10, width=11)
  print(ggarrange(pMa, pMb, heights = c(1,1), labels = c('a', 'b'), ncol = 1, nrow = 2))
  dev.off()
  png('plots/figureMAPS.png', height=10, width=13, units = 'in', res=300)
  print(ggarrange(pMa, pMb, heights = c(1,1), labels = c('a', 'b'), ncol = 1, nrow = 2))
  dev.off()
}
figureMAPS()


#Only norfs
# Display order for the nORF-only figures; replaces the earlier ordering and
# adds 'essential splice' at the end.
ordering_nORF_csq <- c(
  'non-coding', 'synonymous', 'missense',
  'stop lost', 'stop gained', 'essential splice'
)

# MAPS by nORF consequence alone, with categories in display order.
exomesOnly <- mutate(
  load_maps_data(data_type = 'exomes', type = "onlynorfs"),
  nORF_csq = fct_relevel(nORF_csq, ordering_nORF_csq)
)
genomesOnly <- mutate(
  load_maps_data(data_type = 'genomes', type = "onlynorfs"),
  nORF_csq = fct_relevel(nORF_csq, ordering_nORF_csq)
)

#Basic plot
# Exome MAPS per nORF consequence, with dashed reference lines for
# protein-coding missense, synonymous and stop-gained variants.
# The point colour is set as a constant on the geoms. Inside aes() the string
# "#E69F00" was treated as data and drawn in the default discrete colour
# rather than in the colour requested.
exomesOnlyPlot <- ggplot(exomesOnly) + aes(x = nORF_csq, y = maps, ymin = maps_lower, ymax = maps_upper) + 
  geom_pointrange(colour = "#E69F00") + 
  geom_point(size = 5, colour = "#E69F00") +
  geom_hline(yintercept = mis_exomes_maps, color = color_mis, linetype = 'dashed') + 
  geom_hline(yintercept = syn_exomes_maps, color = color_syn, linetype = 'dashed') + 
  geom_hline(yintercept = nonsense_exomes_maps, color = color_lof, linetype = 'dashed') + 
  #geom_hline(yintercept = stoplost_exomes_maps, color = color_lof, linetype = 'dashed') + 
  annotate('text', x = 0.78, y = mis_exomes_maps + 0.007, size = 3, hjust = 1, color = color_mis, label = 'missense') + 
  annotate('text', x = 0.87, y = syn_exomes_maps + 0.007, size = 3, hjust = 1, color = color_syn, label = 'synonymous') + 
  annotate('text', x = 0.85, y = nonsense_exomes_maps + 0.007, size = 3, hjust = 1, color = color_lof, label = 'stop gained') + 
  #annotate('text', x = 0.9, y = stoplost_exomes_maps + 0.007, size = 3, hjust = 1, color = color_lof, label = 'stop lost') + 
  theme_classic() +
  ggtitle("Exomes") +
  theme(plot.title=element_text(size=15, vjust=1, hjust = 0.5)) +
  xlab("nORF Consequence") + ylab("MAPS Score") +
  scale_y_continuous(limits = c(-0.07, 0.20), breaks = c(-0.05, 0, 0.05, 0.1, 0.15)) +
  theme(plot.margin = margin(0, 5.5, 0, 5.5)) +
  theme(legend.position = "none")
exomesOnlyPlot

# Genome MAPS per nORF consequence, with dashed reference lines for
# protein-coding missense, synonymous and stop-gained variants.
# The point colour is set as a constant on the geoms. Inside aes() the string
# "#E69F00" was treated as data and drawn in the default discrete colour
# rather than in the colour requested.
genomesOnlyPlot <- ggplot(genomesOnly) + aes(x = nORF_csq, y = maps, ymin = maps_lower, ymax = maps_upper) + 
  geom_pointrange(colour = "#E69F00") + 
  geom_point(size = 5, colour = "#E69F00") +
  geom_hline(yintercept = mis_genomes_maps, color = color_mis, linetype = 'dashed') + 
  geom_hline(yintercept = syn_genomes_maps, color = color_syn, linetype = 'dashed') + 
  geom_hline(yintercept = nonsense_genomes_maps, color = color_lof, linetype = 'dashed') + 
  #geom_hline(yintercept = stoplost_genomes_maps, color = color_lof, linetype = 'dashed') + 
  annotate('text', x = 0.78, y = mis_genomes_maps + 0.007, size = 3, hjust = 1, color = color_mis, label = 'missense') + 
  annotate('text', x = 0.87, y = syn_genomes_maps + 0.007, size = 3, hjust = 1, color = color_syn, label = 'synonymous') + 
  annotate('text', x = 0.85, y = nonsense_genomes_maps + 0.007, size = 3, hjust = 1, color = color_lof, label = 'stop gained') + 
  #annotate('text', x = 0.9, y = stoplost_genomes_maps - 0.007, size = 3, hjust = 1, color = color_lof, label = 'stop lost') + 
  theme_classic() +
  ggtitle("Genomes") +
  theme(plot.title=element_text(size=15, vjust=1, hjust = 0.5)) +
  xlab("nORF Consequence") + ylab("MAPS Score") +
  scale_y_continuous(limits = c(-0.07, 0.20), breaks = c(-0.05, 0, 0.05, 0.1, 0.15)) +
  theme(plot.margin = margin(0, 5.5, 0, 5.5)) +
  theme(legend.position = "none")
genomesOnlyPlot

# Writes the nORF-only exome and genome MAPS plots, stacked, to
# plots/figureMAPS_Sup.pdf and plots/figureMAPS_Sup.png. Reads the globals
# `exomesOnlyPlot` and `genomesOnlyPlot`; returns the value of the last
# dev.off().
figureMAPS_Sup <- function() {
  pMa <- exomesOnlyPlot + theme(plot.margin = unit(c(0.1,0.1,1,0.1), "cm"))
  pMb <- genomesOnlyPlot + theme(plot.margin = unit(c(0.1,0.1,1,0.1), "cm"))
  # Inside a function the arranged plot is not auto-printed, so each device
  # needs an explicit print() or the file is written without the figure.
  pdf('plots/figureMAPS_Sup.pdf', height=10, width=11)
  print(ggarrange(pMa, pMb, heights = c(1,1), labels = c('a', 'b'), ncol = 1, nrow = 2))
  dev.off()
  png('plots/figureMAPS_Sup.png', height=10, width=10, units = 'in', res=300)
  print(ggarrange(pMa, pMb, heights = c(1,1), labels = c('a', 'b'), ncol = 1, nrow = 2))
  dev.off()
}
figureMAPS_Sup()
```


# 5. Variant Analysis

## Figure 6: Reinterpreting COSMIC, HGMD and ClinVar mutations in the context of nORFs
```{r}
#Variant counts

# Convert raw consequence terms into short display labels.
#
# Applied in order: underscores become spaces, " variant" is dropped,
# "non coding transcript exon" is shortened to "ncRNA", and " prime " becomes
# an apostrophe (e.g. "5_prime_UTR_variant" -> "5'UTR").
#
# category_list: character vector of consequence terms.
# Returns a character vector of the same length.
format_vep_category <- function(category_list) {
  labels <- gsub("_", " ", category_list)
  labels <- gsub(" variant", "", labels)
  labels <- gsub("non coding transcript exon", "ncRNA", labels)
  labels <- gsub(" prime ", "'", labels)
  return(labels)
}


# Consequence terms kept on the canonical-annotation side of the comparison
# (matched against column 7 of the *_vep.vcf files).
canonicalChosenCsq <- c(
  "missense_variant",
  "non_coding_transcript_exon_variant",
  "synonymous_variant",
  "intron_variant",
  "5_prime_UTR_variant",
  "3_prime_UTR_variant",
  "intergenic_variant"
)

# Consequence terms kept on the nORF side of the comparison
# (matched against column 7 of the *_norfs.vcf files).
nORFsChosenCsq <- c(
  "stop_gained",
  "stop_lost",
  "frameshift_variant"
)

# Pair each variant's nORF consequence with its canonical consequence.
#
# norfVCF:   path to a tab-separated annotation file for the nORF annotation;
#            lines starting with '#' are skipped, column 1 is used as the
#            variant ID and column 7 as the consequence term.
# vepVCF:    path to the equivalent file for the canonical annotation.
# chosenCsq: nORF consequence terms to keep (defaults to nORFsChosenCsq).
#
# Returns a tibble with columns ID, nORF_csq and vep_csq: one row per
# (nORF row, canonical row) pair sharing an ID, restricted to `chosenCsq` on
# the nORF side and `canonicalChosenCsq` on the canonical side, with both
# consequence columns passed through format_vep_category().
# Despite the name, no tallying is done here; callers summarise the result.
getCounts <- function(norfVCF, vepVCF, chosenCsq = nORFsChosenCsq) {
  nORFs <- read_tsv(norfVCF, comment = "#", col_names = FALSE, col_types = "cccccccccccccc") %>%
    dplyr::select(X1, X7) %>%
    filter(X7 %in% chosenCsq)

  # Only canonical annotations for variants that survived the nORF filter.
  vep <- read_tsv(vepVCF, comment = "#", col_names = FALSE, col_types = "cccccccccccccc") %>%
    dplyr::select(X1, X7) %>%
    filter(X1 %in% nORFs$X1)

  # The nORF side is already restricted to `chosenCsq` above, so the argument
  # is honoured throughout (previously a second filter re-applied the global
  # nORFsChosenCsq regardless of what the caller passed).
  # Filtering on X7.y also drops nORF rows without a canonical match (NA).
  joined <- nORFs %>%
    left_join(vep, by = "X1") %>%
    filter(X7.y %in% canonicalChosenCsq) %>%
    mutate(X7.y = format_vep_category(X7.y)) %>%
    mutate(X7.x = format_vep_category(X7.x))
  colnames(joined) <- c("ID", "nORF_csq", "vep_csq")

  joined
}

# Figure 6a: alluvial plot linking COSMIC primary site, canonical consequence
# and nORF consequence for COSMIC variants.
#
# minSiteCount: primary sites with fewer joined variants than this are pooled
#               into "other" (default 4000, the threshold used previously).
#
# Reads the coding and non-coding COSMIC annotation and subtype files under
# dataFiles/cosmic/ and returns the ggplot object.
figure6a <- function(minSiteCount = 4000) {
  # Variant ID (column 1) and consequence (column 7) of one annotation file.
  readCsq <- function(path) {
    read_tsv(path, comment = "#", col_names = FALSE, col_types = "cccccccccccccc") %>%
      dplyr::select(X1, X7)
  }

  # nORF consequences, non-coding rows first then coding rows.
  nORFs2 <- bind_rows(
    readCsq("dataFiles/cosmic/cosmicNonCoding_norfs.vcf"),
    readCsq("dataFiles/cosmic/cosmicCoding_norfs.vcf")
  ) %>%
    filter(X7 %in% nORFsChosenCsq)

  # Canonical consequences for the same variants.
  vep2 <- bind_rows(
    readCsq("dataFiles/cosmic/cosmicNonCoding_vep.vcf"),
    readCsq("dataFiles/cosmic/cosmicCoding_vep.vcf")
  ) %>%
    filter(X1 %in% nORFs2$X1)

  # Primary site per variant ID; the coding file carries an extra histology
  # column that is dropped.
  subtypes <- read_tsv("dataFiles/cosmic/CosmicCodingSubtypes.tsv", comment = "#",
                       col_names = c("PrimarySite", "PrimaryHistology", "ID"), col_types = "ccc") %>%
    filter(ID %in% nORFs2$X1) %>%
    dplyr::select(PrimarySite, ID)
  subtypes2 <- read_tsv("dataFiles/cosmic/CosmicNonCodingSubtypes.tsv", comment = "#",
                        col_names = c("PrimarySite", "ID"), col_types = "cc") %>%
    filter(ID %in% nORFs2$X1) %>%
    bind_rows(subtypes)

  joined <- nORFs2 %>%
    left_join(vep2, by = "X1") %>%
    filter(X7.y %in% canonicalChosenCsq) %>%
    mutate(X7.y = format_vep_category(X7.y)) %>%
    mutate(X7.x = format_vep_category(X7.x))
  colnames(joined) <- c("ID", "nORF_csq", "vep_csq")

  # "NS" sites are dropped; the comparison also drops variants with no
  # subtype record (NA PrimarySite).
  variantsSubtyped <- joined %>%
    left_join(subtypes2, by = "ID") %>%
    filter(PrimarySite != "NS")

  rareSites <- variantsSubtyped %>%
    group_by(PrimarySite) %>%
    summarise(counts = n()) %>%
    filter(counts < minSiteCount)

  variantsSubtypedCondensed <- variantsSubtyped %>%
    mutate(PrimarySite = ifelse(PrimarySite %in% rareSites$PrimarySite, "other", PrimarySite))

  # str_replace_all: site names can contain several underscores, and
  # str_replace() would only have converted the first one.
  counts <- variantsSubtypedCondensed %>%
    group_by(nORF_csq, vep_csq, PrimarySite) %>%
    summarise(counts = n()) %>%
    mutate(PrimarySite = str_replace(PrimarySite, "haematopoietic_and_lymphoid_tissue", "haematopoietic/lymphoid")) %>%
    mutate(PrimarySite = str_replace_all(PrimarySite, "_", " ")) %>%
    filter(vep_csq != "intergenic")

  # NOTE(review): `label.strata` is believed to be deprecated in newer
  # ggalluvial releases -- confirm against the installed version.
  cosmicPlot <- ggplot(data = counts, aes(axis1 = PrimarySite, axis2 = vep_csq, axis3 = nORF_csq, y = counts)) +
    scale_x_discrete(limits = c("PrimarySite", "vep csq", "nORF csq")) +
    scale_y_continuous(labels = comma) +
    geom_alluvium(aes(fill = nORF_csq)) +
    geom_stratum() + geom_text(stat = "stratum", label.strata = TRUE) +
    geom_label(stat = "stratum", aes(label = ifelse(vep_csq == "intron", vep_csq, NA))) +
    geom_label(stat = "stratum", aes(label = ifelse(PrimarySite == "haematopoietic/lymphoid", PrimarySite, NA))) +
    theme_minimal() +
    ylab("COSMIC variants") +
    guides(fill = guide_legend(title = "nORF csq")) +
    theme(axis.title = element_text(size = 14), axis.text.x = element_text(size = 14),
          axis.text.y = element_text(size = 14))
  return(cosmicPlot)
}
# Keep the plot in the global environment as well as displaying it: the
# combined figure6() reads a global `cosmicPlot`.
cosmicPlot <- figure6a()
cosmicPlot


# Figure 6b: alluvial plot linking source database (HGMD / ClinVar), canonical
# consequence and nORF consequence for disease variants.
# Returns the ggplot object.
figure6b <- function() {
  fromHgmd <- getCounts(
    norfVCF = "dataFiles/hgmd/hgmd_norfs.vcf",
    vepVCF = "dataFiles/hgmd/hgmd_vep.vcf",
    chosenCsq = nORFsChosenCsq
  )
  fromHgmd <- mutate(fromHgmd, source = "hgmd")

  fromClinvar <- getCounts(
    norfVCF = "dataFiles/clinvar/clinvar_norfs.vcf",
    vepVCF = "dataFiles/clinvar/clinvar_vep.vcf",
    chosenCsq = nORFsChosenCsq
  )
  fromClinvar <- mutate(fromClinvar, source = "clinvar")

  # ClinVar rows first, then HGMD rows.
  pooled <- bind_rows(fromClinvar, fromHgmd)

  # One row per (nORF consequence, canonical consequence, source) combination.
  tallies <- pooled %>%
    group_by(nORF_csq, vep_csq, source) %>%
    summarise(counts = n())

  size14 <- element_text(size = 14)

  ggplot(data = tallies, aes(axis1 = source, axis2 = vep_csq, axis3 = nORF_csq, y = counts)) +
    scale_x_discrete(limits = c("source", "vep csq", "nORF csq")) +
    geom_alluvium(aes(fill = nORF_csq)) +
    geom_stratum() +
    geom_text(stat = "stratum", label.strata = TRUE) +
    # Boxed labels for the ncRNA and intron strata only.
    geom_label(stat = "stratum", aes(label = ifelse(vep_csq %in% c("ncRNA", "intron"), vep_csq, NA))) +
    theme_minimal() +
    ylab("Disease variants") +
    guides(fill = guide_legend(title = "nORF csq")) +
    theme(axis.title = size14, axis.text.x = size14, axis.text.y = size14)
}
figure6b()

# Write the combined Figure 6 (panel a: COSMIC, panel b: HGMD + ClinVar) to
# plots/figure6.pdf and plots/figure6.png.
# Returns the value of the final dev.off() call.
figure6 <- function() {
  # Build panel a here rather than reading a global `cosmicPlot`: that name is
  # only a local variable inside figure6a(), so it is not available when the
  # document is knitted in a fresh session.
  p6a <- figure6a()
  p6b <- figure6b()

  # Plot objects are only auto-printed at top level; inside a function the
  # arranged figure must be print()ed or nothing is drawn on the device.
  drawPanels <- function() {
    print(ggarrange(p6a, p6b, nrow = 2, ncol = 1, labels = c("a", "b")))
  }

  pdf("plots/figure6.pdf", height = 18, width = 13)
  drawPanels()
  dev.off()

  png("plots/figure6.png", height = 15, width = 15, units = "in", res = 300)
  drawPanels()
  dev.off()
}
figure6()
```

## Supplementary Tables 3 and 4
```{r variantTables}
# Supplementary Table 3: COSMIC variant counts per
# (canonical consequence, nORF consequence) pair.
# NOTE(review): `variantsSubtypedCondensed` is only created as a local inside
# figure6a() in the code shown above; it has to exist in the global
# environment (e.g. by running the body of figure6a() interactively) before
# this chunk can run -- confirm how it is meant to be provided.
cosmicTableCounts <- variantsSubtypedCondensed %>%
  group_by(vep_csq, nORF_csq) %>%
  summarise(counts = n())
write.table(cosmicTableCounts, "plots/TableS3.txt",
            sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)


# Supplementary Table 4: the same tally for HGMD and ClinVar variants pooled.
hgmdJoined <- getCounts(
  norfVCF = "dataFiles/hgmd/hgmd_norfs.vcf",
  vepVCF = "dataFiles/hgmd/hgmd_vep.vcf",
  chosenCsq = nORFsChosenCsq
)
hgmdJoined <- mutate(hgmdJoined, source = "hgmd")

bothJoined <- getCounts(
  norfVCF = "dataFiles/clinvar/clinvar_norfs.vcf",
  vepVCF = "dataFiles/clinvar/clinvar_vep.vcf",
  chosenCsq = nORFsChosenCsq
)
bothJoined <- mutate(bothJoined, source = "clinvar")
bothJoined <- bind_rows(bothJoined, hgmdJoined)

bothCounts <- bothJoined %>%
  group_by(vep_csq, nORF_csq) %>%
  summarise(counts = n())

write.table(bothCounts, "plots/TableS4.txt",
            sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)


```

## Table 2 (HGMD)
```{r table1}

# Table 2 (HGMD): HGMD "DM" variants whose nORF consequence is stop gained,
# stop lost or a splice acceptor/donor change, while the canonical consequence
# is neither one of those high-impact terms nor one of the protein-altering
# terms excluded below.

# High-impact consequence terms looked for on the nORF side.
chosenCsq <- c("stop_gained", "stop_lost", "splice_acceptor_variant", "splice_donor_variant", "frameshift_variant")

# Column 1 = variant ID, column 7 = consequence term in both annotation files.
nORFs <- read_tsv("dataFiles/hgmd/hgmd_norfs.vcf", comment = "#", col_names = FALSE, col_types = "cccccccccccccc") %>%
  dplyr::select(X1, X7) %>%
  filter(X7 %in% chosenCsq)
vep <- read_tsv("dataFiles/hgmd/hgmd_vep.vcf", comment = "#", col_names = FALSE, col_types = "cccccccccccccc") %>%
  dplyr::select(X1, X7) %>%
  filter(X1 %in% nORFs$X1)

# Keep pairs where the canonical consequence is NOT itself high impact.
joined <- nORFs %>%
  left_join(vep, by = "X1") %>%
  filter(!(X7.y %in% chosenCsq)) %>%
  mutate(X7.y = format_vep_category(X7.y)) %>%
  mutate(X7.x = format_vep_category(X7.x))
colnames(joined) <- c("ID", "nORF_csq", "vep_csq")

# Drop canonical protein-altering consequences and nORF frameshifts.
hgmdCandidates <- joined %>%
  filter(!(vep_csq %in% c("missense", "start lost", "splice region", "inframe insertion", "inframe deletion", "protein altering"))) %>%
  filter(!(nORF_csq %in% c("frameshift")))

hgmdVCF <- read_tsv("dataFiles/hgmd/hgmd2019_hg38.vcf", comment = "#", col_names = FALSE, col_types = "cicccccc")

# Pull PHEN, CLASS and GENE out of the INFO column (X8) and keep CLASS == DM.
# CLASS keeps its trailing ';' from the extraction, hence the "DM;" comparison.
# NOTE(review): the PHEN pattern is greedy and would run to the last '"' in
# INFO if another quoted field followed PHEN -- confirm PHEN is the only
# quoted field.
hgmdVCFfiltered <- hgmdVCF %>%
  filter(X3 %in% hgmdCandidates$ID) %>%
  mutate(PHEN = str_remove(str_extract(X8, 'PHEN=".*"'), "PHEN=")) %>%
  mutate(PHEN = str_remove_all(PHEN, '"')) %>%
  mutate(CLASS = str_remove(str_extract(X8, "CLASS=[^;]*;"), "CLASS=")) %>%
  mutate(GENE = str_remove(str_extract(X8, "GENE=[^;]*;"), "GENE=")) %>%
  mutate(GENE = str_remove(GENE, ";")) %>%
  filter(CLASS == "DM;") %>%
  mutate(ID = X3) %>%
  dplyr::select(c(X1, X2, ID, PHEN, GENE)) %>%
  left_join(hgmdCandidates, by = "ID")

# Export the surviving VCF records so the external annotation step below can
# report which nORF each variant hits.
# fwrite is namespace-qualified because data.table is not among the libraries
# attached in the setup chunk.
find_nORFs <- hgmdVCF %>%
  filter(X3 %in% hgmdVCFfiltered$ID)
data.table::fwrite(find_nORFs, "dataFiles/hgmd/find_nORFs.vcf", col.names = FALSE, quote = FALSE, sep = "\t")

#Ran this code in VEP:
#sudo docker run -d -t -i -v $HOME/vep_data:/opt/vep/.vep ensemblorg/ensembl-vep ./vep -i /opt/vep/.vep/find_nORFs.vcf -o /opt/vep/.vep/found_norfs.vcf --gtf /opt/vep/.vep/norfs_38.gtf.gz --force_overwrite --fasta /opt/vep/.vep/homo_sapiens/97_GRCh38/Homo_sapiens.GRCh38.dna.toplevel.fa.gz

# First stop-gained / stop-lost annotation per variant: ID (X1) and the value
# of column 4, used below as the nORF identifier.
found_norfs <- read_tsv("dataFiles/hgmd/found_norfs.vcf", comment = "#", col_names = FALSE, col_types = "cccccccccccccc") %>%
  dplyr::filter(X7 %in% c("stop_gained", "stop_lost")) %>%
  distinct(X1, .keep_all = TRUE) %>%
  dplyr::select(X1, X4)
colnames(found_norfs) <- c("ID", "nORF")

# The `NA.3` column of the nORF database is reported as "Length(AA)".
hgmdnORFseqs <- readRDS("dataFiles/nORFsDB1.3.rds") %>%
  dplyr::filter(id %in% found_norfs$nORF) %>%
  dplyr::select(id, NA.3)
colnames(hgmdnORFseqs) <- c("nORF", "Length(AA)")

table1 <- hgmdVCFfiltered %>%
  left_join(found_norfs, by = "ID") %>%
  left_join(hgmdnORFseqs, by = "nORF") %>%
  dplyr::select(c(X1, X2, ID, PHEN, GENE, vep_csq, nORF, nORF_csq, "Length(AA)"))
colnames(table1)[1:2] <- c("Chr", "Pos")
write_tsv(table1, "plots/table2.tsv")
```

## Table 3 (ClinVar)
```{r table2}

# Table 3 (ClinVar): pathogenic / likely pathogenic ClinVar variants whose
# nORF consequence is stop gained, stop lost or a splice acceptor/donor
# change, while the canonical consequence is neither one of those high-impact
# terms nor one of the protein-altering terms excluded below.

# High-impact consequence terms looked for on the nORF side.
chosenCsq <- c("stop_gained", "stop_lost", "splice_acceptor_variant", "splice_donor_variant", "frameshift_variant")

# Column 1 = variant ID, column 7 = consequence term in both annotation files.
nORFs <- read_tsv("dataFiles/clinvar/clinvar_norfs.vcf", comment = "#", col_names = FALSE, col_types = "cccccccccccccc") %>%
  dplyr::select(X1, X7) %>%
  filter(X7 %in% chosenCsq)
vep <- read_tsv("dataFiles/clinvar/clinvar_vep.vcf", comment = "#", col_names = FALSE, col_types = "cccccccccccccc") %>%
  dplyr::select(X1, X7) %>%
  filter(X1 %in% nORFs$X1)

# Keep pairs where the canonical consequence is NOT itself high impact.
joined <- nORFs %>%
  left_join(vep, by = "X1") %>%
  filter(!(X7.y %in% chosenCsq)) %>%
  mutate(X7.y = format_vep_category(X7.y)) %>%
  mutate(X7.x = format_vep_category(X7.x))
colnames(joined) <- c("ID", "nORF_csq", "vep_csq")

# Drop canonical protein-altering consequences and nORF frameshifts.
clinvarCandidates <- joined %>%
  filter(!(vep_csq %in% c("missense", "start lost", "splice region", "inframe insertion", "inframe deletion", "protein altering"))) %>%
  filter(!(nORF_csq %in% c("frameshift")))

clinvarVCF <- read_tsv("dataFiles/clinvar/clinvar_20190708.vcf", comment = "#", col_names = FALSE, col_types = "cccccccccccccc")

# Pull the disease name (CLNDN), gene (GENEINFO) and clinical significance
# (CLNSIG) out of the INFO column (X8).
# PHEN clean-up: after dropping the "not_provided" placeholder, the '|'
# separator it leaves behind is removed (a doubled separator is collapsed,
# a leading or trailing one is stripped). The previous pattern "|" was an
# empty regex alternation and removed nothing, leaving dangling pipes.
# CLNSIG keeps its trailing ';' from the extraction, hence the values below.
# NOTE(review): record "624401" is excluded by hand -- reason not visible here.
clinvarVCFfiltered <- clinvarVCF %>%
  filter(X3 %in% clinvarCandidates$ID) %>%
  filter(X3 != "624401") %>%
  mutate(PHEN = str_remove(str_extract(X8, "CLNDN=[^;]*;"), "CLNDN=")) %>%
  mutate(PHEN = str_remove(PHEN, ";")) %>%
  mutate(PHEN = str_remove(PHEN, "not_provided")) %>%
  mutate(PHEN = str_replace(PHEN, fixed("||"), "|")) %>%
  mutate(PHEN = str_remove_all(PHEN, "^\\||\\|$")) %>%
  mutate(GENE = str_remove(str_extract(X8, "GENEINFO=[^;]*:"), "GENEINFO=")) %>%
  mutate(GENE = str_remove(GENE, ":")) %>%
  mutate(CLNSIG = str_remove(str_extract(X8, "CLNSIG=[^;]*;"), "CLNSIG=")) %>%
  filter(CLNSIG %in% c("Pathogenic/Likely_pathogenic;", "Pathogenic;", "Likely_pathogenic;")) %>%
  mutate(ID = X3) %>%
  dplyr::select(c(X1, X2, ID, PHEN, GENE)) %>%
  left_join(clinvarCandidates, by = "ID")
# Manual correction of the gene symbol in row 7.
# NOTE(review): this is position-dependent and will hit a different record if
# the input files or filters change -- confirm the intended variant.
clinvarVCFfiltered$GENE[7] <- "HBB"

# Export the surviving VCF records so the external annotation step below can
# report which nORF each variant hits.
find_nORFsClin <- clinvarVCF %>%
  filter(X3 %in% clinvarVCFfiltered$ID)
write_tsv(find_nORFsClin, "dataFiles/clinvar/find_nORFsClin.vcf", col_names = FALSE)

#Ran this code in VEP:
#sudo docker run -d -t -i -v $HOME/vep_data:/opt/vep/.vep ensemblorg/ensembl-vep ./vep -i /opt/vep/.vep/find_nORFsClin.vcf -o /opt/vep/.vep/found_norfsClin.vcf --gtf /opt/vep/.vep/norfs_38.gtf.gz --force_overwrite --fasta /opt/vep/.vep/homo_sapiens/97_GRCh38/Homo_sapiens.GRCh38.dna.toplevel.fa.gz

# First stop-gained / stop-lost annotation per variant: ID (X1) and the value
# of column 4, used below as the nORF identifier.
found_norfsClin <- read_tsv("dataFiles/clinvar/found_norfsClin.vcf", comment = "#", col_names = FALSE, col_types = "cccccccccccccc") %>%
  dplyr::filter(X7 %in% c("stop_gained", "stop_lost")) %>%
  distinct(X1, .keep_all = TRUE) %>%
  dplyr::select(X1, X4)
colnames(found_norfsClin) <- c("ID", "nORF")


# The `NA.3` column of the nORF database is reported as "Length(AA)".
clin_nORFseqs <- readRDS("dataFiles/nORFsDB1.3.rds") %>%
  dplyr::filter(id %in% found_norfsClin$nORF) %>%
  dplyr::select(id, NA.3)
colnames(clin_nORFseqs) <- c("nORF", "Length(AA)")

table2 <- clinvarVCFfiltered %>%
  left_join(found_norfsClin, by = "ID") %>%
  left_join(clin_nORFseqs, by = "nORF") %>%
  dplyr::select(c(X1, X2, ID, PHEN, GENE, vep_csq, nORF, nORF_csq, "Length(AA)"))
colnames(table2)[1:2] <- c("Chr", "Pos")
write_tsv(table2, "plots/table3.tsv")
```