### packages ###
library(ggplot2)
library(dplyr)
library(stringr)
library(fmsb)
library(grid)
library(gridExtra)
library(varhandle)
library(glue)

## Datafile
FINAL_IMPACT_Solid_ViralDataset_pubready <- read.delim(
  "~/FINAL_IMPACT_Solid_ViralDataset_pubready.txt",
  stringsAsFactors = FALSE
)

## Viral hits with harmonised column names; one assay is excluded by ID.
tmp78 <- FINAL_IMPACT_Solid_ViralDataset_pubready %>%
  rename(
    tax_name = Name,
    readcount = Virus_Read_Count,
    species_name = Taxonomy_Species,
    genus_name = Taxonomy_Genus
  ) %>%
  mutate(type = "virus") %>%
  filter(DMP_ASSAY_ID != "P-0020610-T05-IM6") %>%
  select(
    DMP_ASSAY_ID, readcount, species_name, genus_name, GeneralTumorType,
    DetailedTumorType, tax_name, SampleType, type
  )

## Number of samples per tumor type (denominator for the odds ratios).
# NOTE(review): `Solid_samples_DB` is not created anywhere in this script;
# it must already exist in the session -- confirm where it is loaded.
tumor_type_quantities <- Solid_samples_DB %>%
  group_by(GeneralTumorType) %>%
  tally()

## determine frequency of specific virus type for odds ratio analysis
## Readcount adjusted based on validation for HPV and EBV
# NOTE(review): `tax_name`, `readcount` and `GeneralTumorType.x` are referenced
# on the raw data frame here, but the renamed columns only exist in `tmp78`
# (and `.x` suffixes only appear after a join). Confirm the intended input.
virus_type_quantities <- FINAL_IMPACT_Solid_ViralDataset_pubready %>%
  filter(
    (tax_name != "Human gammaherpesvirus 4" & readcount > 1) |
      (tax_name == "Human gammaherpesvirus 4" & readcount > 70)
  ) %>%
  select(DMP_ASSAY_ID, GeneralTumorType.x, readcount, tax_name) %>%
  distinct(tax_name, DMP_ASSAY_ID) %>%
  group_by(tax_name) %>%
  tally()

# merge dataset with appropriate filters used above and create classes needed
# for odds ratios
# NOTE(review): `filtered_virus` is not created in this script -- confirm its
# origin and that it supplies `tax_name`, `readcount` and `GeneralTumorType`.
# NOTE(review): 48148 is a hard-coded total; NegativeClassNegative is computed
# as 48148 - TotCases -- confirm against the cohort size.
OddsMerged <- FINAL_IMPACT_Solid_ViralDataset_pubready %>%
  left_join(unfactor(filtered_virus), c("DMP_ASSAY_ID" = "DMP_ASSAY_ID")) %>%
  filter(
    (tax_name != "Human gammaherpesvirus 4" & readcount > 1) |
      (tax_name == "Human gammaherpesvirus 4" & readcount > 70)
  ) %>%
  select(DMP_ASSAY_ID, GeneralTumorType.x, readcount, tax_name) %>%
  distinct(tax_name, DMP_ASSAY_ID, .keep_all = TRUE) %>%
  group_by(tax_name, GeneralTumorType.x) %>%
  tally() %>%
  ungroup() %>%
  left_join(tumor_type_quantities, c("GeneralTumorType.x" = "GeneralTumorType")) %>%
  rename(
    PosCases = n.x,
    TotCases = n.y,
    GeneralTumorType = GeneralTumorType.x
  ) %>%
  left_join(virus_type_quantities, c("tax_name" = "tax_name")) %>%
  rename(PositiveClassPositive = PosCases) %>%
  mutate(PositiveClassNegative = TotCases - PositiveClassPositive) %>%
  mutate(NegativeClassPositive = n - PositiveClassPositive) %>%
  mutate(NegativeClassNegative = 48148 - TotCases) %>%
  mutate(X = paste(GeneralTumorType, tax_name, sep = "_")) %>%
  # a zero cell is replaced by 0.5 so the odds ratio stays finite
  mutate(
    NegativeClassPositive = ifelse(
      NegativeClassPositive == 0, 0.5, NegativeClassPositive
    )
  ) %>%
  rename(Taxonomy_ID_Label = tax_name) %>%
  select(
    GeneralTumorType, Taxonomy_ID_Label, X,
    PositiveClassPositive, PositiveClassNegative,
    NegativeClassPositive, NegativeClassNegative
  )

# convert factors to strings
OddsMerged[, 1:3] <- unfactor(OddsMerged[, 1:3])

### statistics ###
## calculate odds ratios
## (the count columns are passed as whole vectors, one element per row)
k <- oddsratio(
  OddsMerged$PositiveClassPositive,
  OddsMerged$PositiveClassNegative,
  OddsMerged$NegativeClassPositive,
  OddsMerged$NegativeClassNegative,
  conf.level = 0.95,
  p.calc.by.independence = FALSE
)

## Merging odds ratio with original dataframe.
## k$conf.int is split in half: first half -> lower bounds, second half ->
## upper bounds. seq_len() keeps the split valid when there are zero rows.
n_ci <- length(k$conf.int) / 2
OddsToPlot <- data.frame(
  OddsMerged[, 1:7],
  k$estimate,
  k$conf.int[seq_len(n_ci)],
  k$conf.int[n_ci + seq_len(n_ci)],
  k$p.value
)
names(OddsToPlot) <- c(
  "GeneralTumorType", "Taxonomy_ID_Label", "X_label",
  "PositiveClassPositive", "PositiveClassNegative",
  "NegativeClassPositive", "NegativeClassNegative",
  "OddsRatio", "OddsRatio95CIlow", "OddsRatio95CIhigh", "pvalue"
)

## Sort for significant TaxonomyIDS and filter for significant associations
## removed all associations that are uncommon. Must have >2 examples
## removed family level (e.g. Herpesviridae, Human papillomavirus) categories
## as they are uninformative
## Bonferroni adjusted for 82 hypotheses
## arrange by the virus
## Filter confidence interval that crosses 1
## (fix: the pipe between the first two filter() calls was missing)
OddstoPlot10 <- OddsToPlot %>%
  filter(PositiveClassPositive > 2) %>%
  filter(Taxonomy_ID_Label != "Herpesviridae") %>%
  filter(Taxonomy_ID_Label != "root") %>%
  filter(Taxonomy_ID_Label != "Human papillomavirus") %>%
  mutate_if(is.character, str_replace_all, pattern = "_", replacement = " - ") %>%
  filter(pvalue < (0.05 / 82)) %>%
  arrange(Taxonomy_ID_Label) %>%
  filter(OddsRatio95CIlow > 1)
row.names(OddstoPlot10) <- NULL

## dataframe for making forest plot with GGplot
## (built before rounding, so the plot uses unrounded values;
## yAxis counts down so the first row is drawn at the top)
df10 <- data.frame(
  yAxis = rev(seq_len(nrow(OddstoPlot10))),
  boxOdds = OddstoPlot10$OddsRatio,
  boxCILow = OddstoPlot10$OddsRatio95CIlow,
  boxCIHigh = OddstoPlot10$OddsRatio95CIhigh
)

### plotting fig3a ###
## basic ggplot
K <- ggplot(df10, aes(x = boxOdds, y = yAxis))

## strings for reporting odds ratios and CI
OddstoPlot10$OddsRatio <- round(OddstoPlot10$OddsRatio, digits = 1)
OddstoPlot10$OddsRatio95CIlow <- round(OddstoPlot10$OddsRatio95CIlow, digits = 2)
OddstoPlot10$OddsRatio95CIhigh <- round(OddstoPlot10$OddsRatio95CIhigh, digits = 2)

## add text for plotting to dataframe
OddstoPlot10 <- OddstoPlot10 %>%
  mutate(
    concate = glue(
      '{OddsRatio}, 95% CI ({OddsRatio95CIlow} - {OddsRatio95CIhigh})'
    )
  )

## adding bells and whistles.
## Forest plot layers on top of the base plot `K` (fig3a).
Autoplot <- K +
  geom_vline(aes(xintercept = 1), size = .25, linetype = "dashed") +
  geom_errorbarh(
    aes(xmax = boxCIHigh, xmin = boxCILow),
    size = .5, height = .2, color = "gray50"
  ) +
  geom_point(size = 3.5, color = "deepskyblue") +
  theme_bw() +
  theme(
    panel.grid.minor = element_blank(),
    plot.margin = unit(c(0, -11, 0, -6), "cm"),
    axis.text.x = element_text(size = 15),
    axis.title.x = element_text(size = 20)
  ) +
  scale_x_continuous(breaks = c(0.01, 0.05, 0.1, 0.50, 5, 10, 50)) +
  coord_trans(x = "log10") +
  ylab("") +
  xlab("Odds ratio (log scale)") +
  scale_y_continuous(
    labels = NULL,
    breaks = rev(seq_len(nrow(OddstoPlot10)))
  )

## grid arrange to separate virus from tumor type and add the odds ratio results
## columns 1, 2, 12 = GeneralTumorType, Taxonomy_ID_Label, concate
tab_base_data <- OddstoPlot10[, c(1, 2, 12)]
tab_base <- ggplot(tab_base_data, aes(y = df10$yAxis)) +
  ylab(NULL) +
  xlab("") +
  theme(
    plot.title = NULL,
    plot.margin = NULL,
    ## need text to be printed so it stays aligned with figure but white so
    ## it's invisible
    axis.text.x = element_text(color = "white"),
    axis.line = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "none",
    panel.background = element_blank(),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.background = element_blank()
  )

tab1 <- tab_base + geom_text(aes(x = 1, label = GeneralTumorType), hjust = 0)
tab2 <- tab_base + geom_text(aes(x = 1, label = Taxonomy_ID_Label), hjust = 0)
tab3 <- tab_base + geom_text(aes(x = 1, label = concate), hjust = 0)

## one row, seven cells: tab1 | tab1 | tab2 | tab2 | Autoplot | tab3 | tab3
lay <- matrix(c(1, 1, 2, 2, 3, 4, 4), nrow = 1)
c_autoplot <- grid.arrange(tab1, tab2, Autoplot, tab3, layout_matrix = lay)

### Saving ###
ggsave("Fig3a.pdf", plot = c_autoplot, dpi = 300, units = "cm",
       width = 80, height = 30)


#### HHV6 Calculations and plots
## packages
library(dplyr)
library(tidyr)
library(ggplot2)
library(formattable)
library(varhandle)

## datasets
NORMALS_HHV6 <- read.delim("~/NORMALS_HHV6.txt", stringsAsFactors = FALSE)
FINAL_IMPACT_Solid_ViralDataset_pubready <- read.delim(
  "~/FINAL_IMPACT_Solid_ViralDataset_pubready.txt",
  stringsAsFactors = FALSE
)

## merging normal and tumor dataframes for HHV6
## Both sides: sum HHV6/Roseolovirus reads per sample, truncate the sample ID
## to its first 8 characters, and keep one row per truncated ID.
hhv6_pattern <- "Human betaherpesvirus 6|Roseolovirus"

hhv6_normal_reads <- NORMALS_HHV6 %>%
  filter(str_detect(Name, hhv6_pattern)) %>%
  group_by(Sample) %>%
  summarise(Readcount.N = sum(Read_Count)) %>%
  ungroup() %>%
  mutate(Sample = substr(Sample, 1, 8)) %>%
  group_by(Sample) %>%
  slice(1) %>%
  ungroup()

hhv6_tumor_reads <- FINAL_IMPACT_Solid_ViralDataset_pubready %>%
  filter(str_detect(Name, hhv6_pattern)) %>%
  group_by(Sample, GeneralTumorType) %>%
  summarise(Readcount.T = sum(Virus_Read_Count)) %>%
  ungroup() %>%
  mutate(Sample = substr(Sample, 1, 8)) %>%
  group_by(Sample) %>%
  slice(1) %>%
  ungroup()

## keep every tumor sample; a missing normal read count becomes 0
HHV6_readcounts2 <- hhv6_normal_reads %>%
  right_join(hhv6_tumor_reads, by = "Sample") %>%
  rename(case = Sample) %>%
  select(case, Readcount.N, Readcount.T, GeneralTumorType) %>%
  mutate(Readcount.N = ifelse(is.na(Readcount.N), 0, Readcount.N)) %>%
  filter(Readcount.T > 1)

### Plotting fig 3b
## Zero normal read counts are plotted at 1 and labelled 0 on the log axis.
## NOTE(review): limits = c(-1, ...) on a log10 scale yields NaN for the lower
## limit -- confirm this is intended.
HHV6 <- HHV6_readcounts2 %>%
  mutate(virus = "Human herpesvirus 6") %>%
  filter(Readcount.T > 1) %>%
  mutate(Readcount.N = ifelse(Readcount.N == 0, 1, Readcount.N)) %>%
  mutate(No_reads.N = ifelse(Readcount.N == 1, "square", "circle")) %>%
  ggplot(aes(x = Readcount.N, y = Readcount.T,
             col = No_reads.N, shape = No_reads.N)) +
  geom_point(aes(size = 8), position = "jitter") +
  scale_x_log10(limits = c(-1, 500), breaks = c(1, 10, 100),
                labels = c(0, 10, 100)) +
  xlab("HHV6 Readcount in Blood (Log10 Transformed)") +
  ylab("HHV6 Readcount in Tumor (Log10 Transformed)") +
  scale_color_manual(values = c("lightsteelblue4", "lightsteelblue")) +
  # scale_shape_manual(c(19,15)) +
  scale_y_log10(limits = c(-1, 300)) +
  theme(legend.position = "none", axis.title = element_text(size = 25))

## fix: the plot was piped into ggsave() with `%>%`, which binds tighter than
## `+` and passed only the theme object; it also named the fig3c plot object.
ggsave("fig3b.pdf", plot = HHV6, dpi = 300, units = "cm",
       width = 40, height = 20)

## Tumor type quantities for fig3c (tumor types with more than 150 samples)
# NOTE(review): `Solid_samples_DB` is not created in this script -- confirm
# where it is loaded.
Tumor_type_freq <- Solid_samples_DB %>%
  distinct(DMP_ASSAY_ID, .keep_all = TRUE) %>%
  group_by(GeneralTumorType) %>%
  tally() %>%
  filter(n > 150)

## Normal with HHV6 by tumor type
## (fraction of each tumor type with tumor reads but zero normal reads)
HHV6_n <- HHV6_readcounts2 %>%
  mutate(virus = "Human herpesvirus 6") %>%
  filter(Readcount.T > 1) %>%
  group_by(Readcount.N, GeneralTumorType) %>%
  tally(name = "ATumor_only") %>%
  filter(Readcount.N == 0) %>%
  merge(Tumor_type_freq, by = "GeneralTumorType") %>%
  transmute(GeneralTumorType, Readcount.N, ATumor_only = ATumor_only / n)

## Tumor with HHV6 by tumor type (more than 3 positive cases required)
HHV6_t <- HHV6_readcounts2 %>%
  mutate(virus = "Human herpesvirus 6") %>%
  filter(Readcount.T > 1) %>%
  group_by(GeneralTumorType) %>%
  tally(name = "tumor") %>%
  filter(tumor > 3) %>%
  merge(Tumor_type_freq, by = "GeneralTumorType") %>%
  transmute(GeneralTumorType, tumor = tumor / n)

## merge tumor and normal by tumor type data
HHV6_t_n <- merge(HHV6_n, HHV6_t, by = "GeneralTumorType", all.y = TRUE)

## fix: the pipe before mutate(ciHHV6 = ...) was missing.
## The gathered columns (originally positions 3 and 7) are named explicitly.
HHV6_t_n_plot <- HHV6_t_n %>%
  mutate(
    percent_n_neg = ATumor_only / tumor,
    percent_n_neg = scales::percent(percent_n_neg)
  ) %>%
  mutate(
    signif = ifelse(
      ATumor_only / tumor > 0.61,
      "Majority_Tumor_only_HHV6",
      "Majority_ciHHV6"
    )
  ) %>%
  mutate(ciHHV6 = tumor - ATumor_only) %>%
  gather(HHV6_status, cases, ATumor_only, ciHHV6)

## plotting fig3c
plotHHV6 <- HHV6_t_n_plot %>%
  ggplot(aes(x = reorder(GeneralTumorType, -tumor, sum),
             y = cases, fill = HHV6_status)) +
  geom_bar(stat = "identity") +
  xlab("Tumor Type") +
  ylab("HHV6 Frequency") +
  scale_x_discrete() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  scale_fill_manual(
    name = "HHV6 Status",
    labels = c("Tumor Only (Somatic)", "Chromosomally Integrated (ciHHV6)"),
    values = c("lightsteelblue", "lightsteelblue4")
  ) +
  theme(axis.text = element_text(size = 20),
        axis.title = element_text(size = 25)) +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%"))

## fix: same `%>% ggsave()` precedence bug as fig3b; the wrong plot object
## (`HHV6`) was saved and the file name was misspelled "fic3c.pdf".
ggsave("fig3c.pdf", plot = plotHHV6, dpi = 300, units = "cm",
       width = 40, height = 40)


#### Mutation enrichment analysis
## packages
library(ggplot2)
library(dplyr)
library(stringr)
library(fmsb)
library(EnhancedVolcano)
library(ggrepel)

#' Build the odds-ratio table for one enrichment dataset.
#'
#' Keeps the first 12 columns of `counts`, assigns the standard column names,
#' applies the Haldane-Anscombe correction (0 -> 0.5) to the four count
#' columns only, and appends the odds ratio, its 95% CI and the p-value
#' returned by fmsb::oddsratio().
#'
#' @param counts Data frame with at least 12 columns in the order listed in
#'   the names vector below.
#' @param second_col Name given to the second column ("Cytoband" for the
#'   mutation tables, "CNA_type" for the copy-number table).
#' @return `counts` (12 columns) plus OddsRatio, OddsRatio95CIlow,
#'   OddsRatio95CIhigh and pvalue.
build_odds_table <- function(counts, second_col = "Cytoband") {
  stopifnot(is.data.frame(counts), ncol(counts) >= 12L)

  counts <- counts[, seq_len(12L)]
  names(counts) <- c(
    "Gene", second_col, "NegativeClassPositive", "percentnegclasspos",
    "NegativeClassNegative", "PositiveClassPositive", "percentposclasspos",
    "PositiveClassNegative", "Logratio", "oldp", "qval", "enrichedin"
  )

  ## Haldane-Anscombe correction, restricted to the contingency-table cells so
  ## that zero p-values / log ratios are not rewritten to 0.5.
  count_cols <- c(
    "PositiveClassPositive", "PositiveClassNegative",
    "NegativeClassPositive", "NegativeClassNegative"
  )
  counts[count_cols] <- lapply(
    counts[count_cols],
    function(x) replace(x, which(x == 0), 0.5)
  )

  ## calculate oddsratio (count columns passed as whole vectors)
  k <- oddsratio(
    counts$PositiveClassPositive,
    counts$PositiveClassNegative,
    counts$NegativeClassPositive,
    counts$NegativeClassNegative,
    conf.level = 0.95,
    p.calc.by.independence = FALSE
  )

  ## k$conf.int: first half = lower bounds, second half = upper bounds
  n_ci <- length(k$conf.int) / 2
  data.frame(
    counts,
    OddsRatio = k$estimate,
    OddsRatio95CIlow = k$conf.int[seq_len(n_ci)],
    OddsRatio95CIhigh = k$conf.int[n_ci + seq_len(n_ci)],
    pvalue = k$p.value
  )
}

## fix: the three datasets were previously read back-to-back into the same
## variable, so every figure below used the last (EBV) table. Each dataset is
## now loaded immediately before its own figure.
# NOTE(review): the volcano x-axis uses the raw "OddsRatio" column but is
# labelled "Log2 Odds Ratio" -- confirm which quantity is intended.

### HPV mutations (fig4a)
Oddsmuts <- read.delim("~/OddsHPVmutations.txt", stringsAsFactors = FALSE)
OddsToPlotmut <- build_odds_table(Oddsmuts)

## volcano
HPVmut <- EnhancedVolcano(
  OddsToPlotmut,
  lab = OddsToPlotmut$Gene,
  col = c('gray48', 'gray53', 'gray63', 'red3'),
  DrawConnectors = TRUE,
  widthConnectors = 0.5,
  x = "OddsRatio",
  y = "oldp",
  xlab = "Log2 Odds Ratio",
  pCutoff = 0.001,
  FCcutoff = 2,
  pLabellingCutoff = 0.001,
  transcriptPointSize = 6.0,
  transcriptLabSize = 7,
  legend = c('No Change', 'Odds Ratio > 2', 'p-value < 0.001',
             'Odds Ratio > 2 & p-value < 0.001'),
  legendPosition = "bottom"
)
ggsave("fig4a.pdf", plot = HPVmut, dpi = 300, units = "cm",
       width = 20, height = 20)

### HPV CVR -- HPV copy number (fig4b)
Oddsmuts <- read.delim("~/HPV_CNA_volcano_data.txt", stringsAsFactors = FALSE)

## CDKN2B rows are dropped and CDKN2A is relabelled as CDKN2A/B; the plot
## label combines gene and CNA type.
OddsToPlotmut <- build_odds_table(Oddsmuts, second_col = "CNA_type") %>%
  filter(Gene != "CDKN2B") %>%
  mutate(var1 = str_replace(Gene, "CDKN2A", "CDKN2A/B")) %>%
  mutate(GeneCNA = paste(var1, CNA_type, sep = " "))

## volcano
HPVCNA <- EnhancedVolcano(
  OddsToPlotmut,
  lab = OddsToPlotmut$GeneCNA,
  col = c('gray48', 'gray53', 'gray63', 'red3'),
  DrawConnectors = TRUE,
  widthConnectors = 0.5,
  x = "OddsRatio",
  y = "oldp",
  xlab = "Log2 Odds Ratio",
  pCutoff = 0.001,
  pLabellingCutoff = 0.001,
  transcriptPointSize = 6.0,
  transcriptLabSize = 3,
  legend = c('No Change', 'Odds Ratio > 2', 'p-value < 0.001',
             'Odds Ratio > 2 & p-value < 0.001'),
  legendPosition = "bottom"
)
## (a stray reference to the undefined object `volcEBV3` was removed here)
ggsave("fig4b.pdf", plot = HPVCNA, dpi = 300, units = "cm",
       width = 20, height = 20)

### EBV (fig4c)
Oddsmuts <- read.delim("~/EBV_volcano_data.txt", stringsAsFactors = FALSE)
OddsToPlotmut <- build_odds_table(Oddsmuts)

## volcano
volcEBV1 <- EnhancedVolcano(
  OddsToPlotmut,
  lab = OddsToPlotmut$Gene,
  col = c('gray48', 'gray53', 'gray63', 'red3'),
  DrawConnectors = TRUE,
  widthConnectors = 0.5,
  x = "OddsRatio",
  y = "oldp",
  xlab = "Log2 Odds Ratio",
  pCutoff = 0.001,
  pLabellingCutoff = 0.001,
  transcriptPointSize = 6.0,
  transcriptLabSize = 7,
  legend = c('No Change', 'Odds Ratio > 2', 'p-value < 0.001',
             'Odds Ratio > 2 & p-value < 0.001'),
  legendPosition = "bottom"
)
ggsave("fig4c.pdf", plot = volcEBV1, dpi = 300, units = "cm",
       width = 20, height = 20)


### TMB violins
library(ggplot2)
library(reshape2)
library(ggpubr)

## only columns 14-17 of the TMB file are used
datahpv <- read.delim("~/HPV_TMB.txt", stringsAsFactors = FALSE)[14:17]

## fix: the plot was never assigned, yet ggsave() below expects `newboxplot`.
## Rows whose log2(TMB) is not finite (e.g. TMB == 0) are dropped.
# NOTE(review): `factorz` is taken from the third column of `datahpv` by
# position -- presumably the virus-status label; confirm.
newboxplot <- datahpv %>%
  unfactor(.) %>%
  mutate(factorz = as.factor(datahpv[, 3])) %>%
  filter(str_detect(Comparison, "All Tumor Types")) %>%
  mutate(logTMB = log(TMB, base = 2)) %>%
  filter(is.finite(logTMB)) %>%
  mutate(orders = ifelse(str_detect(factorz, "EBV"), 1, 2)) %>%
  ggplot(aes(x = reorder(factorz, -orders), y = logTMB)) +
  geom_violin(width = 0.6, fill = "slateblue1") +
  geom_boxplot(width = 0.06) +
  labs(y = "log2TMB (mutation per MB)", x = "Virus Status") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 20, color = "black", hjust = 0.5,
                               vjust = 0.5, angle = 90),
    axis.text.y = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.title.x = element_text(size = 18),
    legend.text = element_text(size = 20),
    legend.title = element_text(size = 20)
  ) +
  stat_compare_means(
    method = "t.test", # unpaired
    comparisons = list(
      c("HPV Positive", "HPV Negative"),
      c("EBV Positive", "EBV Negative")
    )
  )

ggsave("fig4d.pdf", plot = newboxplot, dpi = 300, units = "cm",
       width = 40, height = 40)