#BACTERIAL EXTRACELLULAR VESICLES OF THE FETAL ENVIRONMENT PROJECT, proteomic data analysis #Updated 20.09.2022 Anna Kaisanlahti, Reunanen group #Copyright © 2022 Anna Kaisanlahti #Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files #(“Kaisanlahti_AMBI_proteomics.R”), to deal in the Software without restriction, including without limitation the rights to use, #copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is #furnished to do so, subject to the following conditions: #The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. #THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY #CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# The following packages are needed in the analysis. Only the packages that
# are missing from this R installation are installed, so re-running the script
# does not reinstall everything. "stats" ships with R and cannot be installed
# from a repository; "ggplot2" and "gplots" are attached below and are
# therefore part of the list as well.
required_packages <- c(
  "data.table", "tidyverse", "tidyr", "gtools", "RColorBrewer", "cowplot",
  "compositions", "ggplot2", "ggVennDiagram", "gplots"
)
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Import R packages to your R session
library("data.table")
library("tidyverse")
library("tidyr")
library("gtools")
library("stats")
library("RColorBrewer")
library("cowplot")
library("compositions")
library("ggplot2")
library("ggVennDiagram")
library("gplots")

# SECTION 1.0, PREPARING THE DATA ----

# Read into R the 'proteins' files from PEAKS export files for both swissprot
# and trEMBL results.
# Change the folder path to your protein files.
swiss_prot_results_PEAKS_file <- read.csv(
  "swiss\\proteins.csv",
  na.strings = c("")
)
trEMBL_results_PEAKS_file <- read.csv(
  "trEMBL\\proteins.csv",
  na.strings = c("")
)

# Separate UniProt accession number and ID to separate columns
separated_data_swiss <- separate(
  swiss_prot_results_PEAKS_file,
  col = Accession,
  into = c("UniProt_ID", "UniProt_accession"),
  sep = '\\|'
)
separated_data_trEMBL <- separate(
  trEMBL_results_PEAKS_file,
  col = Accession,
  into = c("UniProt_ID", "UniProt_accession"),
  sep = '\\|'
)

# Subsets the accession column and the per-sample coverage columns and drops
# the score (X.10lgP) and overall coverage (Coverage....) columns.
# Switch `columns` according to your data.
select_sample_columns <- function(separated_data, columns = 4:59) {
  sample_columns <- separated_data[, columns]
  sample_columns$X.10lgP <- NULL
  sample_columns$Coverage.... <- NULL
  sample_columns
}

# Creates one two-column table (first column + one sample column) per sample.
# seq_len() keeps this safe when there are no sample columns at all.
split_by_sample <- function(sample_columns) {
  lapply(
    seq_len(ncol(sample_columns) - 1),
    function(i) sample_columns[, c(1, i + 1)]
  )
}

# Filters out the rows with 0 values (proteins not present in the sample or
# coverage <1%). which() also drops rows whose value is NA.
keep_detected <- function(sample_list) {
  lapply(sample_list, function(tbl) tbl[which(tbl[[2]] > 0), , drop = FALSE])
}

# Separate samples to individual tables in swiss prot data
sample_columns_swiss <- select_sample_columns(separated_data_swiss)
sample_list_swiss <- split_by_sample(sample_columns_swiss)
sample_list_filtered_swiss <- keep_detected(sample_list_swiss)

# Sample names are taken from the sample columns themselves (these carry the
# names of the original result file), so no second index range has to be kept
# in sync with the one used in select_sample_columns().
sample_names <- names(sample_columns_swiss)[-1]

# Subset the UniProt accessions as a list named by sample
protein_IDs_swiss <- lapply(
  sample_list_filtered_swiss, "[[", "UniProt_accession"
)
names(protein_IDs_swiss) <- sample_names

# Separate samples to individual tables in trEMBL data
sample_columns_trEMBL <- select_sample_columns(separated_data_trEMBL)
sample_list_trEMBL <- split_by_sample(sample_columns_trEMBL)
sample_list_filtered_trEMBL <- keep_detected(sample_list_trEMBL)

# The trEMBL lists are named with the swiss prot sample names, as before, so
# that both exports of one sample share the same file name suffix.
protein_IDs_trEMBL <- lapply(
  sample_list_filtered_trEMBL, "[[", "UniProt_accession"
)
names(protein_IDs_trEMBL) <- sample_names

# Export the protein ID lists.
# NOTE: setwd() changes the working directory for the rest of the session.
# paste0() is used so that the file names do not contain a space after the
# "swiss_" / "trEMBL_" prefix.
setwd("your_output_folder")
invisible(mapply(
  write.table, protein_IDs_swiss,
  file = paste0("swiss_", sample_names),
  MoreArgs = list(row.names = FALSE, col.names = FALSE, quote = FALSE)
))
invisible(mapply(
  write.table, protein_IDs_trEMBL,
  file = paste0("trEMBL_", sample_names),
  MoreArgs = list(row.names = FALSE, col.names = FALSE, quote = FALSE)
))

# SECTION 2.0, protein data fetching, filtration and calculating statistics ----

# AT THIS STEP MOVE TO BASH AND DO THE FOLLOWING STEPS:
# 1. Concatenate swissProt and trEMBL results files of each sample
# 2. Retrieve protein info from UniProt using the exported and combined result
#    files, using ID mapping tool from:
#    https://www.uniprot.org/id-mapping

# List fetched UniProt protein data files for importing
uniprot_data_files <- list.files(
  path = "your_output_folder\\sample_data_from_uniprot",
  full.names = TRUE
)

# Fix file list order to numerical
uniprot_files <- mixedsort(uniprot_data_files)

# Create names for the files from file list
sample_file_names <- lapply(uniprot_files, basename)

# Import data and rename files
uniprot_data_unfiltered <- lapply(
  uniprot_files, read.csv2,
  sep = "\t", header = TRUE, check.names = FALSE,
  na.strings = c("", "NA"), stringsAsFactors = TRUE
)
names(uniprot_data_unfiltered) <- sample_file_names

# Positions of the amniotic fluid (AM) and fecal (FE) files in the sorted file
# list (check the sample names and numbers from PEAKS if needed). The last
# file of each group is treated as the negative control of that group.
AM_sample_index <- 1:27
FE_sample_index <- 28:53
stopifnot(
  length(uniprot_data_unfiltered) >= max(AM_sample_index, FE_sample_index)
)

# Subset fecal and amniotic fluid samples to own data lists
AM_samples_unfiltered <- uniprot_data_unfiltered[AM_sample_index]
FE_samples_unfiltered <- uniprot_data_unfiltered[FE_sample_index]

# Check out empty samples (= negative control) by taking the Entry columns of
# the control samples
AM_control_position <- length(AM_samples_unfiltered)
FE_control_position <- length(FE_samples_unfiltered)
empty_AM <- AM_samples_unfiltered[[AM_control_position]]$Entry
empty_FE <- FE_samples_unfiltered[[FE_control_position]]$Entry

# Export negative controls if needed
setwd("your_output_folder")
write.table(
  empty_AM,
  file = "empty_AM_proteins",
  row.names = FALSE, col.names = FALSE, quote = FALSE
)
write.table(
  empty_FE,
  file = "empty_FE_proteins",
  row.names = FALSE, col.names = FALSE, quote = FALSE
)

# Removes, from every sample table, the rows whose Entry is also present in
# the negative control. The returned list is unnamed.
remove_control_hits <- function(samples, control_entries) {
  lapply(unname(samples), function(tbl) {
    tbl[!(tbl[["Entry"]] %in% control_entries), , drop = FALSE]
  })
}

# Filter out the protein IDs present in negative control samples
AM_samples <- remove_control_hits(AM_samples_unfiltered, empty_AM)
FE_samples <- remove_control_hits(FE_samples_unfiltered, empty_FE)

# Delete negative control samples from the data files lists
AM_samples[AM_control_position] <- NULL
FE_samples[FE_control_position] <- NULL

# Separate bacterial and human proteins in samples
# Split every sample table into bacterial and human (eukaryotic) proteins.

# Returns, for every sample table, the rows whose 8th column equals `kingdom`.
# NOTE(review): column 8 is presumably the taxonomic superkingdom column of
# the UniProt export - confirm against your files. Rows with NA in that column
# are dropped. The returned list is unnamed.
filter_by_kingdom <- function(samples, kingdom) {
  lapply(unname(samples), function(tbl) {
    tbl[which(tbl[[8]] == kingdom), , drop = FALSE]
  })
}

AM_samples_bacteria <- filter_by_kingdom(AM_samples, "Bacteria")
AM_samples_human <- filter_by_kingdom(AM_samples, "Eukaryota")
FE_samples_bacteria <- filter_by_kingdom(FE_samples, "Bacteria")
FE_samples_human <- filter_by_kingdom(FE_samples, "Eukaryota")

# Collects the unique values of one column over all sample tables.
collect_unique <- function(samples, column) {
  unique(unlist(lapply(samples, "[[", column)))
}

# List unique protein IDs in each group
AM_bacterial_proteins_all <- collect_unique(AM_samples_bacteria, "Entry")
AM_human_proteins_all <- collect_unique(AM_samples_human, "Entry")
FE_bacterial_proteins_all <- collect_unique(FE_samples_bacteria, "Entry")
FE_human_proteins_all <- collect_unique(FE_samples_human, "Entry")

# Writes a plain list of values, one per line, without header or quotes.
export_id_list <- function(ids, file) {
  write.table(
    ids,
    file = file,
    row.names = FALSE, col.names = FALSE, quote = FALSE
  )
}

# Writes a data table as tab-delimited text with a header row.
export_tab_delimited <- function(tbl, file) {
  write.table(
    tbl,
    file = file,
    row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t"
  )
}

# Export lists of unique proteins.
# NOTE: setwd() changes the working directory for the rest of the session.
setwd("your_output_folder")
export_id_list(AM_bacterial_proteins_all, "AM_bacterial_proteins_all")
export_id_list(AM_human_proteins_all, "AM_human_proteins_all")
export_id_list(FE_bacterial_proteins_all, "FE_bacterial_proteins_all")
export_id_list(FE_human_proteins_all, "FE_human_proteins_all")

# Calculate protein counts from each sample group
AM_bacterial_proteins_all_count <- length(AM_bacterial_proteins_all)
AM_human_proteins_all_count <- length(AM_human_proteins_all)
FE_bacterial_proteins_all_count <- length(FE_bacterial_proteins_all)
FE_human_proteins_all_count <- length(FE_human_proteins_all)

# Number of proteins (rows) in every sample table of a list.
proteins_per_sample <- function(samples) {
  vapply(samples, nrow, integer(1))
}

# Calculate mean and range of bacterial and human proteins in individual
# samples
# mean
AM_samples_bacteria_mean <- round(mean(proteins_per_sample(AM_samples_bacteria)))
AM_samples_human_mean <- round(mean(proteins_per_sample(AM_samples_human)))
FE_samples_bacteria_mean <- round(mean(proteins_per_sample(FE_samples_bacteria)))
FE_samples_human_mean <- round(mean(proteins_per_sample(FE_samples_human)))
# range
AM_samples_bacteria_range <- range(proteins_per_sample(AM_samples_bacteria))
AM_samples_human_range <- range(proteins_per_sample(AM_samples_human))
FE_samples_bacteria_range <- range(proteins_per_sample(FE_samples_bacteria))
FE_samples_human_range <- range(proteins_per_sample(FE_samples_human))

# SECTION 3.0, SAMPLE DISTRIBUTION, TAXONOMY AND FUNCTION OF AMNIOTIC FLUID
# AND FECAL EV BACTERIAL PROTEINS ----

# SAMPLE DISTRIBUTION CHARTS

# Builds a presence/absence table: the first column (named `id_column`) holds
# `all_ids`, followed by one 0/1 column per sample (c1, c2, ...) telling
# whether the protein ID occurs in that sample. The number of columns follows
# the number of samples instead of being written out by hand.
presence_table <- function(all_ids, sample_ids, id_column) {
  flags <- lapply(sample_ids, function(ids) as.integer(all_ids %in% ids))
  names(flags) <- paste0("c", seq_along(flags))
  out <- data.frame(all_ids)
  names(out) <- id_column
  if (length(flags) > 0) {
    out[names(flags)] <- flags
  }
  out
}

# AM, BACTERIAL PROTEINS
# Subset 'Entry' columns from each data frame in AM bacterial data frame list
AM_samples_bacteria_entry_columns <- lapply(AM_samples_bacteria, "[[", "Entry")
protein_distribution_AM_bacteria <- presence_table(
  AM_bacterial_proteins_all,
  AM_samples_bacteria_entry_columns,
  "AM_bacterial_proteins_all"
)
# Calculate rowsums (number of samples containing the protein) and append the
# result as a new column
rowsums_column_AM_bacteria <- rowSums(protein_distribution_AM_bacteria[-1])
protein_distribution_AM_bacteria <- cbind(
  protein_distribution_AM_bacteria, rowsums_column_AM_bacteria
)
# Sort the data table by rowsums, most widely shared proteins first
protein_distribution_AM_bacteria <- protein_distribution_AM_bacteria[
  order(-protein_distribution_AM_bacteria$rowsums_column_AM_bacteria),
]
export_tab_delimited(
  protein_distribution_AM_bacteria,
  "your_output_folder\\AM_bacteria_protein_distribution"
)

# AM, HUMAN PROTEINS
AM_samples_human_entry_columns <- lapply(AM_samples_human, "[[", "Entry")
protein_distribution_AM_human <- presence_table(
  AM_human_proteins_all,
  AM_samples_human_entry_columns,
  "AM_human_proteins_all"
)
rowsums_column_AM_human <- rowSums(protein_distribution_AM_human[-1])
protein_distribution_AM_human <- cbind(
  protein_distribution_AM_human, rowsums_column_AM_human
)
protein_distribution_AM_human <- protein_distribution_AM_human[
  order(-protein_distribution_AM_human$rowsums_column_AM_human),
]
export_tab_delimited(
  protein_distribution_AM_human,
  "your_output_folder\\AM_human_protein_distribution"
)

# Fecal samples, bacterial proteins
FE_samples_bacteria_entry_columns <- lapply(FE_samples_bacteria, "[[", "Entry")
protein_distribution_FE_bacteria <- presence_table(
  FE_bacterial_proteins_all,
  FE_samples_bacteria_entry_columns,
  "FE_bacterial_proteins_all"
)
rowsums_column_FE_bacteria <- rowSums(protein_distribution_FE_bacteria[-1])
protein_distribution_FE_bacteria <- cbind(
  protein_distribution_FE_bacteria, rowsums_column_FE_bacteria
)
protein_distribution_FE_bacteria <- protein_distribution_FE_bacteria[
  order(-protein_distribution_FE_bacteria$rowsums_column_FE_bacteria),
]
export_tab_delimited(
  protein_distribution_FE_bacteria,
  "your_output_folder\\FE_bacteria_protein_distribution"
)

# Fecal samples, human proteins
FE_samples_human_entry_columns <- lapply(FE_samples_human, "[[", "Entry")
protein_distribution_FE_human <- presence_table(
  FE_human_proteins_all,
  FE_samples_human_entry_columns,
  "FE_human_proteins_all"
)
rowsums_column_FE_human <- rowSums(protein_distribution_FE_human[-1])
protein_distribution_FE_human <- cbind(
  protein_distribution_FE_human, rowsums_column_FE_human
)
protein_distribution_FE_human <- protein_distribution_FE_human[
  order(-protein_distribution_FE_human$rowsums_column_FE_human),
]
export_tab_delimited(
  protein_distribution_FE_human,
  "your_output_folder\\FE_human_protein_distribution"
)

# Analyze taxonomy of the bacterial proteins
phylum_column <- "Taxonomic lineage (PHYLUM)"

# List unique phyla present in the samples
phyla_list_AM <- collect_unique(AM_samples_bacteria, phylum_column)
phyla_list_FE <- collect_unique(FE_samples_bacteria, phylum_column)

# Gather phyla present in individual samples in the sample groups
phylum_columns_AM <- lapply(AM_samples_bacteria, "[[", phylum_column)
phylum_columns_FE <- lapply(FE_samples_bacteria, "[[", phylum_column)
# One-column data frames of the phylum values; not needed for the counting
# below, kept so that these objects stay available to later code.
phyla_count_AM <- lapply(phylum_columns_AM, as.data.frame)
phyla_count_FE <- lapply(phylum_columns_FE, as.data.frame)

# Counts proteins per phylum in one sample and sorts by descending count.
# The columns are named PHYLUM and Proteins directly, instead of renaming an
# automatically generated column name afterwards.
count_phyla <- function(phyla) {
  counts <- as.data.frame(table(PHYLUM = phyla), responseName = "Proteins")
  arrange(counts, desc(Proteins))
}
phyla_count_tables_AM <- lapply(phylum_columns_AM, count_phyla)
phyla_count_tables_FE <- lapply(phylum_columns_FE, count_phyla)

# Prepare initial data frame where to merge phyla count tables
initial_table_AM <- data.frame(
  PHYLUM = phyla_list_AM,
  test = rep.int(0, length(phyla_list_AM)),
  row.names = NULL
)
initial_table_FE <- data.frame(
  PHYLUM = phyla_list_FE,
  test = rep.int(0, length(phyla_list_FE)),
  row.names = NULL
)

# Put the initial help table first in the list of merged data frames so that
# every phylum of the group gets a row
initial_table_and_phylum_count_tables_list_AM <- c(
  list(initial_table_AM), phyla_count_tables_AM
)
initial_table_and_phylum_count_tables_list_FE <- c(
  list(initial_table_FE), phyla_count_tables_FE
)

# Merge sample phyla columns. Only `by` is passed on: `all`, `all.x` and
# `all.y` are arguments of merge(), not of left_join().
combined_phyla_table_AM <- reduce(
  initial_table_and_phylum_count_tables_list_AM, left_join,
  by = "PHYLUM"
)
combined_phyla_table_FE <- reduce(
  initial_table_and_phylum_count_tables_list_FE, left_join,
  by = "PHYLUM"
)

# Getting rid of initial data frame column
combined_phyla_table_AM$test <- NULL
combined_phyla_table_FE$test <- NULL

# Switching NA values to zeros in all other columns except PHYLUM (the first
# column), whatever the number of samples
combined_phyla_table_AM[-1][is.na(combined_phyla_table_AM[-1])] <- 0
combined_phyla_table_FE[-1][is.na(combined_phyla_table_FE[-1])] <- 0

# Remove the row of proteins without a phylum (PHYLUM is NA)
combined_phyla_table_AM <- combined_phyla_table_AM[
  !is.na(combined_phyla_table_AM$PHYLUM),
]
combined_phyla_table_FE <- combined_phyla_table_FE[
  !is.na(combined_phyla_table_FE$PHYLUM),
]

# Setting sample names for column names
names(combined_phyla_table_AM) <- c(
  'PHYLUM', 'AM01', 'AM02', 'AM03A', 'AM03B', 'AM04', 'AM05', 'AM06', 'AM07',
  'AM08', 'AM09', 'AM10', 'AM11', 'AM12', 'AM13', 'AM14', 'AM15', 'AM16',
  'AM17', 'AM18', 'AM19', 'AM20', 'AM21', 'AM22', 'AM23', 'AM24', 'AM25'
)
names(combined_phyla_table_FE) <- c(
  'PHYLUM', 'FE01', 'FE02', 'FE03', 'FE04', 'FE05', 'FE06', 'FE07', 'FE08',
  'FE09', 'FE10', 'FE11', 'FE12', 'FE13', 'FE14', 'FE15', 'FE16', 'FE17',
  'FE18', 'FE19', 'FE20', 'FE21', 'FE22', 'FE23', 'FE24', 'FE25'
)

# Export phyla tables
export_tab_delimited(
  combined_phyla_table_AM, "your_output_folder\\phyla_map_table_AM"
)
export_tab_delimited(
  combined_phyla_table_FE, "your_output_folder\\phyla_map_table_FE"
)

# DRAWING HEATMAP PRESENTATION OF BACTERIAL PROTEIN TAXONOMY

# Combine phyla tables generated for amniotic fluid and fecal bacterial
# proteins
combined_heatmap_AM_FE <- merge(
  combined_phyla_table_AM, combined_phyla_table_FE,
  by = "PHYLUM", all = TRUE
)

# Set NA values to 0
combined_heatmap_AM_FE[is.na(combined_heatmap_AM_FE)] <- 0

# Set out the filtering by indicating how many protein hits are needed in
# total across all samples
combined_heatmap_AM_FE_filtered <- filter(
  combined_heatmap_AM_FE,
  rowSums(combined_heatmap_AM_FE[, -1]) >= 10
)

# Add pseudo count +0.5 and transform hit values to log10. PHYLUM is moved to
# the row names during the numeric transformation and restored afterwards.
rownames(combined_heatmap_AM_FE_filtered) <- combined_heatmap_AM_FE_filtered[, 1]
combined_heatmap_AM_FE_filtered[, 1] <- NULL
combined_heatmap_AM_FE_filtered <- combined_heatmap_AM_FE_filtered + 0.5
combined_heatmap_AM_FE_filtered <- log10(
  as.matrix(combined_heatmap_AM_FE_filtered)
)
combined_heatmap_AM_FE_filtered <- as.data.frame(combined_heatmap_AM_FE_filtered)
combined_heatmap_AM_FE_filtered$PHYLUM <- rownames(combined_heatmap_AM_FE_filtered)

# Transform data table to appropriate form for ggplot2
combined_heatmap_AM_FE_modified <- pivot_longer(
  combined_heatmap_AM_FE_filtered, !PHYLUM,
  names_to = "sample", values_to = "proteins"
)

# FUNCTIONAL CHARACTERISTICS OF AMNIOTIC FLUID AND FECAL EV BACTERIAL PROTEINS

# Extracts one column from every sample table as a one-column data frame.
# The returned list is unnamed.
select_column <- function(samples, column) {
  lapply(unname(samples), function(tbl) tbl[column])
}

# Process GO, biological process columns and make data table with counts
# Amniotic fluid
AM_BP_columns <- select_column(
  AM_samples_bacteria, "Gene ontology (biological process)"
)
AM_BP_columns_renamed <- lapply(AM_BP_columns, function(tbl) {
  rename(tbl, GO_BP = "Gene ontology (biological process)")
})
# One row per GO term: multiple terms of a protein are separated by "; "
AM_BP_columns_fixed <- lapply(AM_BP_columns_renamed, function(tbl) {
  separate_rows(tbl, GO_BP, sep = "; ")
})
AM_BP_table <- as.data.frame(table(unlist(AM_BP_columns_fixed)))
# Fecal samples
FE_BP_columns <- select_column(
  FE_samples_bacteria, "Gene ontology (biological process)"
)
FE_BP_columns_renamed <- lapply(FE_BP_columns, function(tbl) {
  rename(tbl, GO_BP = "Gene ontology (biological process)")
})
FE_BP_columns_fixed <- lapply(FE_BP_columns_renamed, function(tbl) {
  separate_rows(tbl, GO_BP, sep = "; ")
})
FE_BP_table <- as.data.frame(table(unlist(FE_BP_columns_fixed)))

# Calculate % for how many proteins GOs exist. The biological process lengths
# are used here; the molecular function lengths are only defined further down.
# Amniotic fluid
AM_BP_NAs <- sum(is.na(unlist(AM_BP_columns_renamed)))
AM_BP_length <- length(unlist(AM_BP_columns_renamed))
AM_BP_percent <- (AM_BP_length - AM_BP_NAs) / AM_BP_length * 100
# Fecal samples
FE_BP_NAs <- sum(is.na(unlist(FE_BP_columns_renamed)))
FE_BP_length <- length(unlist(FE_BP_columns_renamed))
FE_BP_percent <- (FE_BP_length - FE_BP_NAs) / FE_BP_length * 100

# Export GO biological process lists for sample groups
export_tab_delimited(AM_BP_table, "your_output_folder\\AM_BP")
export_tab_delimited(FE_BP_table, "your_output_folder\\FE_BP")

# The GO annotations of the identified proteins were manually traced to the GO
# terms next to root "biological process" using
# https://www.ebi.ac.uk/QuickGO/annotations
# The manual annotation data was written to files "AM_BP_hierarchy_df.txt" and
# "FE_BP_hierarchy_df.txt"

# Import manually annotated GO data tables
AM_BP_hierarchy_df <- read.csv2(
  "your_output_folder\\AM_BP_hierarchy_df.txt",
  sep = "\t", header = TRUE, check.names = FALSE, na.strings = c("", "NA")
)
FE_BP_hierarchy_df <- read.csv2(
  "your_output_folder\\FE_BP_hierarchy_df.txt",
  sep = "\t", header = TRUE, check.names = FALSE, na.strings = c("", "NA")
)

# Print unique GO terms present in the manually annotated GO data
# (taken from columns 3 to 5 of the annotation tables)
# Amniotic fluid
AM_uniq_BP <- unique(as.vector(as.matrix(AM_BP_hierarchy_df[, 3:5])))
AM_uniq_BP <- AM_uniq_BP[!AM_uniq_BP %in% NA]
# Fecal samples
FE_uniq_BP <- unique(as.vector(as.matrix(FE_BP_hierarchy_df[, 3:5])))
FE_uniq_BP <- FE_uniq_BP[!FE_uniq_BP %in% NA]
AM_FE_uniq_BP <- unique(c(AM_uniq_BP, FE_uniq_BP))

# Sums column 2 of a manually annotated GO table over the rows where `term`
# appears in any of the columns level_2A, level_2B or level_2C.
# NOTE(review): column 2 is presumably the count of the annotated GO term -
# confirm against the annotation files.
sum_level2_counts <- function(hierarchy_df, term) {
  levels_2 <- hierarchy_df[c("level_2A", "level_2B", "level_2C")]
  hit <- rowSums(levels_2 == term, na.rm = TRUE) > 0
  as.numeric(sum(hierarchy_df[hit, 2]))
}

# Calculate biological process GO counts for every term of AM_FE_uniq_BP, in
# that order, so that each count is guaranteed to belong to the term it is
# labelled with in the table below.
AM_GOs_BP <- vapply(
  AM_FE_uniq_BP,
  function(term) sum_level2_counts(AM_BP_hierarchy_df, term),
  numeric(1),
  USE.NAMES = FALSE
)
FE_GOs_BP <- vapply(
  AM_FE_uniq_BP,
  function(term) sum_level2_counts(FE_BP_hierarchy_df, term),
  numeric(1),
  USE.NAMES = FALSE
)

# Create data table for GO counts
AM_FE_BP_hierarchy_counts_table <- data.frame(
  AM_FE_uniq_BP, AM_GOs_BP, FE_GOs_BP
)

# Add pseudocount to GO counts and convert counts to log10. The GO term is
# moved to the row names during the transformation and restored afterwards.
rownames(AM_FE_BP_hierarchy_counts_table) <- AM_FE_BP_hierarchy_counts_table[, 1]
AM_FE_BP_hierarchy_counts_table[, 1] <- NULL
AM_FE_BP_hierarchy_counts_table <- AM_FE_BP_hierarchy_counts_table + 0.5
AM_FE_BP_hierarchy_counts_table <- log10(
  as.matrix(AM_FE_BP_hierarchy_counts_table)
)
AM_FE_BP_hierarchy_counts_table <- as.data.frame(AM_FE_BP_hierarchy_counts_table)
AM_FE_BP_hierarchy_counts_table$AM_FE_uniq_BP <- rownames(AM_FE_BP_hierarchy_counts_table)

# Split long GO name to 2 rows
AM_FE_BP_hierarchy_counts_table$AM_FE_uniq_BP[
  AM_FE_BP_hierarchy_counts_table$AM_FE_uniq_BP ==
    'biological process involved in interspecies interaction between organisms'
] <- 'biological process involved in \n interspecies interaction between organisms'

# Transform data for ggplot
AM_FE_BP_hierarchy_counts_table_mofidied <- pivot_longer(
  AM_FE_BP_hierarchy_counts_table, !AM_FE_uniq_BP,
  names_to = "group", values_to = "counts"
)

# Process GO molecular function columns and make data table with counts
# Amniotic fluid
AM_MF_columns <- select_column(
  AM_samples_bacteria, "Gene ontology (molecular function)"
)
AM_MF_columns_renamed <- lapply(AM_MF_columns, function(tbl) {
  rename(tbl, GO_MF = "Gene ontology (molecular function)")
})
AM_MF_columns_fixed <- lapply(AM_MF_columns_renamed, function(tbl) {
  separate_rows(tbl, GO_MF, sep = "; ")
})
AM_MF_table <- as.data.frame(table(unlist(AM_MF_columns_fixed)))
# Fecal samples
FE_MF_columns <- select_column(
  FE_samples_bacteria, "Gene ontology (molecular function)"
)
FE_MF_columns_renamed <- lapply(FE_MF_columns, function(tbl) {
  rename(tbl, GO_MF = "Gene ontology (molecular function)")
})
FE_MF_columns_fixed <- lapply(FE_MF_columns_renamed, function(tbl) {
  separate_rows(tbl, GO_MF, sep = "; ")
})
FE_MF_table <- as.data.frame(table(unlist(FE_MF_columns_fixed)))

# Calculate % for how many proteins GOs exist
# Amniotic fluid
AM_MF_NAs <- sum(is.na(unlist(AM_MF_columns_renamed)))
AM_MF_length <- length(unlist(AM_MF_columns_renamed))
AM_MF_percent <- (AM_MF_length - AM_MF_NAs) / AM_MF_length * 100
# Fecal samples
FE_MF_NAs <- sum(is.na(unlist(FE_MF_columns_renamed)))
FE_MF_length <- length(unlist(FE_MF_columns_renamed))
FE_MF_percent <- (FE_MF_length - FE_MF_NAs) / FE_MF_length * 100

# Export GO molecular function lists for sample groups
export_tab_delimited(AM_MF_table, "your_output_folder\\AM_MF")
export_tab_delimited(FE_MF_table, "your_output_folder\\FE_MF")

# The GO annotations of the identified proteins were manually traced to the GO
# terms next to root "molecular function" using
# https://www.ebi.ac.uk/QuickGO/annotations
# The manual annotation data was written to files "AM_MF_hierarchy_df.txt" and
# "FE_MF_hierarchy_df.txt"

# Import manually annotated GO data tables. The same output folder placeholder
# is used as for the biological process tables; change it to your own folder.
AM_MF_hierarchy_df <- read.csv2(
  "your_output_folder\\AM_MF_hierarchy_df.txt",
  sep = "\t", header = TRUE, check.names = FALSE, na.strings = c("", "NA")
)
FE_MF_hierarchy_df <- read.csv2(
  "your_output_folder\\FE_MF_hierarchy_df.txt",
  sep = "\t", header = TRUE, check.names = FALSE, na.strings = c("", "NA")
)

# Print unique GO terms present in the manually annotated GO data
# Amniotic fluid
# (the right-hand side of this assignment is on the line that follows)
AM_uniq_MF <-
unique(as.vector(as.matrix(AM_MF_hierarchy_df[ , 3:4]))) AM_uniq_MF <- AM_uniq_MF[! AM_uniq_MF %in% NA] #Fecal samples FE_uniq_MF <- unique(as.vector(as.matrix(FE_MF_hierarchy_df[ , 3:4]))) FE_uniq_MF <- FE_uniq_MF[! FE_uniq_MF %in% NA] AM_FE_uniq_MF <- unique(c(AM_uniq_MF, FE_uniq_MF)) #Calculate molecular function GO counts for amniotic fluid samples AM_GOs_MF <- c( sum(AM_MF_hierarchy_df[which(AM_MF_hierarchy_df$level_2A == "catalytic activity" | AM_MF_hierarchy_df$level_2B == "catalytic activity"), 2]), sum(AM_MF_hierarchy_df[which(AM_MF_hierarchy_df$level_2A == "binding" | AM_MF_hierarchy_df$level_2B == "binding"), 2]), sum(AM_MF_hierarchy_df[which(AM_MF_hierarchy_df$level_2A == "transporter activity" | AM_MF_hierarchy_df$level_2B == "transporter activity"), 2]), sum(AM_MF_hierarchy_df[which(AM_MF_hierarchy_df$level_2A == "structural molecule activity" |AM_MF_hierarchy_df$level_2B == "structural molecule activity"), 2]), sum(AM_MF_hierarchy_df[which(AM_MF_hierarchy_df$level_2A == "toxin activity" | AM_MF_hierarchy_df$level_2B == "toxin activity"), 2]), sum(AM_MF_hierarchy_df[which(AM_MF_hierarchy_df$level_2A == "cargo receptor activity" | AM_MF_hierarchy_df$level_2B == "cargo receptor activity"), 2]), sum(AM_MF_hierarchy_df[which(AM_MF_hierarchy_df$level_2A == "ATP-dependent activity" | AM_MF_hierarchy_df$level_2B == "ATP-dependent activity"), 2]), sum(AM_MF_hierarchy_df[which(AM_MF_hierarchy_df$level_2A == "translation regulator activity" | AM_MF_hierarchy_df$level_2B == "translation regulator activity"), 2]), sum(AM_MF_hierarchy_df[which(AM_MF_hierarchy_df$level_2A == "molecular adaptor activity" | AM_MF_hierarchy_df$level_2B == "molecular adaptor activity"), 2])) #Calculate molecular function GO counts for fecal samples FE_GOs_MF <- c( sum(FE_MF_hierarchy_df[which(FE_MF_hierarchy_df$level_2A == "catalytic activity" | FE_MF_hierarchy_df$level_2B == "catalytic activity"), 2]), sum(FE_MF_hierarchy_df[which(FE_MF_hierarchy_df$level_2A == "binding" | 
FE_MF_hierarchy_df$level_2B == "binding"), 2]), sum(FE_MF_hierarchy_df[which(FE_MF_hierarchy_df$level_2A == "transporter activity" | FE_MF_hierarchy_df$level_2B == "transporter activity"), 2]), sum(FE_MF_hierarchy_df[which(FE_MF_hierarchy_df$level_2A == "structural molecule activity" |FE_MF_hierarchy_df$level_2B == "structural molecule activity"), 2]), sum(FE_MF_hierarchy_df[which(FE_MF_hierarchy_df$level_2A == "toxin activity" | FE_MF_hierarchy_df$level_2B == "toxin activity"), 2]), sum(FE_MF_hierarchy_df[which(FE_MF_hierarchy_df$level_2A == "cargo receptor activity" | FE_MF_hierarchy_df$level_2B == "cargo receptor activity"), 2]), sum(FE_MF_hierarchy_df[which(FE_MF_hierarchy_df$level_2A == "ATP-dependent activity" | FE_MF_hierarchy_df$level_2B == "ATP-dependent activity"), 2]), sum(FE_MF_hierarchy_df[which(FE_MF_hierarchy_df$level_2A == "translation regulator activity" | FE_MF_hierarchy_df$level_2B == "translation regulator activity"), 2]), sum(FE_MF_hierarchy_df[which(FE_MF_hierarchy_df$level_2A == "molecular adaptor activity" | FE_MF_hierarchy_df$level_2B == "molecular adaptor activity"), 2])) #Create data table for GO counts AM_FE_MF_hierarchy_counts_table <- data.frame(AM_FE_uniq_MF,AM_GOs_MF, FE_GOs_MF) ##Add pseudocount to GO counts and convert counts to log10 rownames(AM_FE_MF_hierarchy_counts_table) <- AM_FE_MF_hierarchy_counts_table[,1] AM_FE_MF_hierarchy_counts_table[,1] <- NULL AM_FE_MF_hierarchy_counts_table <- AM_FE_MF_hierarchy_counts_table + 0.5 AM_FE_MF_hierarchy_counts_table <- log10((as.matrix(AM_FE_MF_hierarchy_counts_table))) AM_FE_MF_hierarchy_counts_table <- as.data.frame(AM_FE_MF_hierarchy_counts_table) AM_FE_MF_hierarchy_counts_table$AM_FE_uniq_MF <- rownames(AM_FE_MF_hierarchy_counts_table) #Transform data for ggplot AM_FE_MF_hierarchy_counts_table_mofidied <- pivot_longer(AM_FE_MF_hierarchy_counts_table, !AM_FE_uniq_MF, names_to = "group", values_to = "counts") #Draw multipanel figure with cowplot #Amniotic fluid and fecal EV bacterial 
protein phylum heatmap and GOs #Draw heatmap for AM samples and store it in a variable library(grid) AM_lab <- textGrob("AM EVs", gp=gpar(fontsize=8)) FE_lab <- textGrob("FE EVs", gp=gpar(fontsize=8)) AM_FE_heatmap <- ggplot(combined_heatmap_AM_FE_modified, aes(x=sample,y=reorder(PHYLUM, proteins), fill= proteins)) + geom_tile() + annotation_custom(AM_lab, xmin=15,xmax=15,ymin=-2,ymax=-2) + annotation_custom(FE_lab, xmin=37.5,xmax=37.5,ymin=-2,ymax=-2) + coord_cartesian(clip="off") + scale_fill_distiller(palette = "RdPu") + ylab("PHYLUM") + guides(fill = guide_colourbar(barwidth = 0.5, barheight = 3, title = "log10")) + theme(axis.title.x = element_blank(), panel.background = element_blank(), axis.text.x = element_text(size=6, angle=90, hjust = 1.5, face = "bold"), axis.ticks.x = element_blank(), axis.title.y = element_text(size=8), axis.text.y = element_text(size=8, face = "bold"), plot.margin = unit(c(0,0,0.2,0), "cm")) #Draw heatmap for AM+FE GO biological process and store it in a variable AM_FE_GOs_BP <- ggplot(AM_FE_BP_hierarchy_counts_table_mofidied, aes(x=group, y=reorder(AM_FE_uniq_BP, counts), fill= counts)) + geom_tile() + scale_x_discrete(labels = c("AM EVs", "FE EVs")) + scale_fill_distiller(palette = "RdPu") + guides(fill = guide_colourbar(barwidth = 0.5, barheight = 3, title = "log10")) + theme(axis.title.x=element_blank(), panel.background=element_blank(), axis.ticks.x=element_blank(), axis.text.x=element_text(angle=90, size=8,face = "bold"), axis.title.y=element_blank(), axis.text.y = element_text(size=8, face = "bold"), plot.margin = unit(c(0.5,4,0,0), "cm")) #Draw heatmap for AM+FE GO molecular function and store it in a variable AM_FE_GOs_MF <- ggplot(AM_FE_MF_hierarchy_counts_table_mofidied, aes(x=group, y=reorder(AM_FE_uniq_MF, counts), fill= counts)) + geom_tile() + scale_x_discrete(labels = c("AM EVs", "FE EVs")) + scale_fill_distiller(palette = "RdPu") + guides(fill = guide_colourbar(barwidth = 0.5, barheight = 3, title = "log10")) + 
theme(axis.title.x=element_blank(), panel.background=element_blank(), axis.ticks.x=element_blank(), axis.text.x=element_text(angle=90, size=8, face = "bold"), axis.title.y=element_blank(), axis.text.y = element_text(size=8, face = "bold"), plot.margin = unit(c(0.5,6,0,0), "cm")) #Plot the figures in 2 rows top_row <- plot_grid(AM_FE_heatmap, labels = c("A"), label_size = 14) bottom_row <- plot_grid(AM_FE_GOs_BP, AM_FE_GOs_MF, labels = c("B", "C"), label_size = 14) tiff(file="your_output_folder\\AM_FE_characterization_multipanel.tiff", res = 300, width = 10, height = 4, units = 'in') plot_grid(top_row, bottom_row, ncol = 1) dev.off() #Section 4.0, characterize the bacterial proteins identified in amniotic fluid and fecal samples, overlap, GOs and taxonomy #Export protein IDs that overlap in amniotic fluid and fecal sample groups #Bacteria bacterial_protein_IDs_overlapping_AM_FE <- intersect(as.character(AM_bacterial_proteins_all), as.character(FE_bacterial_proteins_all)) write.table(bacterial_protein_IDs_FEping_AM_FE, file = "your_output_folder\\overlap_AM_FE_bacteria", row.names = FALSE, col.names = TRUE, quote=FALSE, sep = "\t") #Human human_protein_IDs_overlapping_AM_FE <- intersect(as.character(AM_human_proteins_all), as.character(FE_human_proteins_all)) write.table(human_protein_IDs_FEping_AM_FE, file = "your_output_folder\\overlap_AM_FE_human", row.names = FALSE, col.names = TRUE, quote=FALSE, sep = "\t") #Fetch overlapping bacterial protein data from UniProt using the exported overlap protein IDs with ID mapping tool from: #https://www.uniprot.org/id-mapping #Import overlapping data table exported from UniProt ID retrieval website overlap_bacterial_proteins_AM_FE <- read.csv2("your_output_folder\\overlap_AM_FE_bacterial_data.txt", na.strings=c(""), sep = "\t", stringsAsFactors = FALSE) #Process GO, biological process columns and make data table with counts overlap_bacterial_proteins_AM_FE_BP_column <-subset(overlap_bacterial_proteins_AM_FE, select = 
Gene.Ontology..biological.process.) overlap_bacterial_proteins_AM_FE_BP_column <- rename(overlap_bacterial_proteins_AM_FE_BP_column, GO_BP = "Gene.Ontology..biological.process.") overlap_bacterial_proteins_AM_FE_BP_fixed <- separate_rows(overlap_bacterial_proteins_AM_FE_BP_column, GO_BP, sep = "; ") overlap_bacterial_proteins_GO_BP_table <- as.data.frame(table(overlap_bacterial_proteins_AM_FE_BP_fixed)) #Calculate % for how many proteins GOs exist overlap_BP_NAs <- sum(is.na(overlap_bacterial_proteins_AM_FE_BP_column)) overlap_BP_length <- length(unlist(overlap_bacterial_proteins_AM_FE_BP_column)) overlap_BP_percent <- (overlap_MF_length-overlap_MF_NAs)/overlap_BP_length * 100 #The GO annotations of the identified proteins were manually traced to the GO terms next to root "biological process" using #https://www.ebi.ac.uk/QuickGO/annotations #The manual annotation data was written to file "overlap_BP_hierarchy_df.txt" #Import manually annotated GO data table overlap_BP_hierarchy_df <- read.csv2("your_output_file\\overlap_BP_hierarchy_df.txt", sep ="\t", header = TRUE, check.names = FALSE, na.strings=c("","NA")) #Print unique GO terms present in the manually annotated GO data overlap_uniq_BP <- unique(as.vector(as.matrix(overlap_BP_hierarchy_df[ , 3:6]))) overlap_uniq_BP <- overlap_uniq_BP[! 
overlap_uniq_BP %in% NA] #Calculate biological process GO counts overlap_GOs_BP <- c( sum(overlap_BP_hierarchy_df[which(overlap_BP_hierarchy_df$level_2A == "response to stimulus" | overlap_BP_hierarchy_df$level_2B == "response to stimulus" | overlap_BP_hierarchy_df$level_2C == "response to stimulus" | overlap_BP_hierarchy_df$level_2D == "response to stimulus"), 2]), sum(overlap_BP_hierarchy_df[which(overlap_BP_hierarchy_df$level_2A == "cellular process" | overlap_BP_hierarchy_df$level_2B == "cellular process" | overlap_BP_hierarchy_df$level_2C == "cellular process"| overlap_BP_hierarchy_df$level_2D == "cellular process"), 2]), sum(overlap_BP_hierarchy_df[which(overlap_BP_hierarchy_df$level_2A == "biological process involved in interspecies interaction between organisms" | overlap_BP_hierarchy_df$level_2B == "biological process involved in interspecies interaction between organisms" | overlap_BP_hierarchy_df$level_2C == "biological process involved in interspecies interaction between organisms"| overlap_BP_hierarchy_df$level_2D == "biological process involved in interspecies interaction between organisms"), 2]), sum(overlap_BP_hierarchy_df[which(overlap_BP_hierarchy_df$level_2A == "metabolic process" | overlap_BP_hierarchy_df$level_2B == "metabolic process" | overlap_BP_hierarchy_df$level_2C == "metabolic process"| overlap_BP_hierarchy_df$level_2D == "metabolic process"), 2]), sum(overlap_BP_hierarchy_df[which(overlap_BP_hierarchy_df$level_2A == "localization" | overlap_BP_hierarchy_df$level_2B == "localization" | overlap_BP_hierarchy_df$level_2C == "localization"| overlap_BP_hierarchy_df$level_2D == "localization"), 2]), sum(overlap_BP_hierarchy_df[which(overlap_BP_hierarchy_df$level_2A == "developmental process" | overlap_BP_hierarchy_df$level_2B == "developmental process" | overlap_BP_hierarchy_df$level_2C == "developmental process"| overlap_BP_hierarchy_df$level_2D == "developmental process"), 2]), 
sum(overlap_BP_hierarchy_df[which(overlap_BP_hierarchy_df$level_2A == "biological regulation" | overlap_BP_hierarchy_df$level_2B == "biological regulation" | overlap_BP_hierarchy_df$level_2C == "biological regulation"| overlap_BP_hierarchy_df$level_2D == "biological regulation"), 2]), sum(overlap_BP_hierarchy_df[which(overlap_BP_hierarchy_df$level_2A == "signaling" | overlap_BP_hierarchy_df$level_2B == "signaling" | overlap_BP_hierarchy_df$level_2C == "signaling"| overlap_BP_hierarchy_df$level_2D == "signaling"), 2])) #Create data table for GO counts overlap_BP_hierarchy_counts_table <- data.frame(overlap_uniq_BP, overlap_GOs_BP) #Add GOs terms for 3 proteins missing from the current UniProt release #Note the difference in database versions when analyzing proteomics data and fetching protein data from UniProt website overlap_BP_hierarchy_counts_table$overlap_GOs_BP <- c(1,51,1,53,8,3,1,1) #Modify GO term name 'biological process involved in interspecies interaction between organisms' to be in 2 rows overlap_BP_hierarchy_counts_table$overlap_uniq_BP[overlap_BP_hierarchy_counts_table$overlap_uniq_BP == 'biological process involved in interspecies interaction between organisms'] <- 'biological process involved in \n interspecies interaction between organisms' #Overlap GOs, molecular function #Process GO, molecular function columns and make data table with counts overlap_bacterial_proteins_AM_FE_MF_column <-subset(overlap_bacterial_proteins_AM_FE, select = Gene.Ontology..molecular.function.) 
overlap_bacterial_proteins_AM_FE_MF_column <- rename(overlap_bacterial_proteins_AM_FE_MF_column,
                                                     GO_MF = "Gene.Ontology..molecular.function.")
overlap_bacterial_proteins_AM_FE_MF_fixed <- separate_rows(overlap_bacterial_proteins_AM_FE_MF_column,
                                                           GO_MF, sep = "; ")
overlap_bacterial_proteins_GO_MF_table <- as.data.frame(table(overlap_bacterial_proteins_AM_FE_MF_fixed))

#Calculate % for how many proteins GOs exist
overlap_MF_NAs <- sum(is.na(overlap_bacterial_proteins_AM_FE_MF_column))
overlap_MF_length <- length(unlist(overlap_bacterial_proteins_AM_FE_MF_column))
overlap_MF_percent <- (overlap_MF_length - overlap_MF_NAs) / overlap_MF_length * 100

#The GO annotations of the identified proteins were manually traced to the GO terms next to root "molecular function" using
#https://www.ebi.ac.uk/QuickGO/annotations
#The manual annotation data was written to file "overlap_MF_hierarchy_df.txt"

#Import manually annotated GO data table
overlap_MF_hierarchy_df <- read.csv2("your_output_folder\\overlap_MF_hierarchy_df.txt", sep = "\t",
                                     header = TRUE, check.names = FALSE, na.strings = c("", "NA"))

#Print unique GO terms present in the manually annotated GO data
overlap_uniq_MF <- unique(as.vector(as.matrix(overlap_MF_hierarchy_df[, 3:4])))
overlap_uniq_MF <- overlap_uniq_MF[!is.na(overlap_uniq_MF)]

#Calculate molecular function GO counts
#For each term, sum column 2 over the rows where the term occurs in level_2A or level_2B
#NOTE(review): the terms below and the corrected counts further down are paired with
#overlap_uniq_MF by position - confirm that overlap_uniq_MF lists the terms in this order
overlap_MF_terms <- c("catalytic activity",
                      "binding",
                      "transporter activity",
                      "structural molecule activity",
                      "toxin activity",
                      "translation regulator activity",
                      "ATP-dependent activity")
overlap_GOs_MF <- vapply(overlap_MF_terms, function(term) {
  term_hits <- overlap_MF_hierarchy_df$level_2A == term | overlap_MF_hierarchy_df$level_2B == term
  sum(overlap_MF_hierarchy_df[which(term_hits), 2])
}, numeric(1), USE.NAMES = FALSE)

#Create data table for GO counts
overlap_MF_hierarchy_counts_table <- data.frame(overlap_uniq_MF, overlap_GOs_MF)

#Add GOs terms for 3 proteins missing from the current UniProt release
#Note the difference in database versions when analyzing proteomics data and fetching protein data from UniProt website
overlap_MF_hierarchy_counts_table$overlap_GOs_MF <- c(44, 50, 3, 15, 1, 1, 4)

#Overlap protein bacteria taxonomy
#From the overlapping protein identifications, taxonomy of bacterial proteins is checked manually
#from the UniProt ID mapping data table fetched with unique protein IDs in the sample groups
#https://www.uniprot.org/id-mapping

#Phyla level taxonomy data of bacterial proteins
overlap_phylum <- c("Firmicutes",
                    "Actinobacteria",
                    "Proteobacteria",
                    "Spirochaetes",
                    "Acidobacteria",
                    "Cyanobacteria",
                    "Bacteroidetes")
overlap_phylum_freq <- c(19, 9, 22, 1, 1, 7, 19)

#Construct data frame from the data above
overlap_taxa_df <- data.frame(overlap_phylum, overlap_phylum_freq)

#Venn diagrams with ggVennDiagram
#Draw Venn diagram for amniotic fluid and fecal samples bacterial proteins to a variable
x_bac <- list(as.character(AM_bacterial_proteins_all), as.character(FE_bacterial_proteins_all))
names(x_bac) <- c("AM EVs", "FE EVs")
venn_bac <- ggVennDiagram(x_bac, label_alpha = 0, set_size = 5, label_size = 5) +
  scale_fill_gradient(low = "pink1", high = "pink4") +
  scale_color_manual(values = c("black", "black"))

#Draw Venn diagram for amniotic fluid and fecal samples human proteins to a variable
x_human <- list(as.character(AM_human_proteins_all), as.character(FE_human_proteins_all))
names(x_human) <- c("AM EVs", "FE EVs")
venn_human <- ggVennDiagram(x_human, label_alpha = 0, set_size = 5, label_size = 5) +
  scale_fill_gradient(low = "pink1", high = "pink4") +
  scale_color_manual(values = c("black", "black"))

#Draw overlap, taxonomy and GO figures
#Draw overlap taxonomy column plot to variable
overlap_taxa_colblot <- ggplot(data = overlap_taxa_df,
                               aes(x = reorder(overlap_phylum, overlap_phylum_freq),
                                   y = overlap_phylum_freq)) +
  geom_col(fill = "grey") +
  coord_flip() +
  ylab("number of hits") +
  theme(panel.background = element_blank(),
        axis.title.y = element_blank(),
        axis.title.x = element_text(size = 11),
        axis.text.y = element_text(size = 11, face = "bold"),
        axis.text.x = element_text(size = 11, face = "bold"),
        plot.margin = unit(c(0.5, 0, 0, 1.5), "cm"))

#Draw overlap GO BP column plot to variable
overlap_GO_BP_colplot <- ggplot(data = overlap_BP_hierarchy_counts_table,
                                aes(x = reorder(overlap_uniq_BP, overlap_GOs_BP),
                                    y = overlap_GOs_BP)) +
  geom_col(fill = "grey") +
  coord_flip() +
  ylab("number of hits") +
  theme(panel.background = element_blank(),
        axis.title.y = element_blank(),
        axis.title.x = element_text(size = 11),
        axis.text.y = element_text(size = 11, face = "bold"),
        axis.text.x = element_text(size = 11, face = "bold"),
        plot.margin = unit(c(0.5, 0, 0, 0), "cm"))

#Draw overlap GO MF column plot to variable
overlap_GO_MF_colplot <- ggplot(data = overlap_MF_hierarchy_counts_table,
                                aes(x = reorder(overlap_uniq_MF, overlap_GOs_MF),
                                    y = overlap_GOs_MF)) +
  geom_col(fill = "grey") +
  coord_flip() +
  ylab("number of hits") +
  theme(panel.background = element_blank(),
        axis.title.y = element_blank(),
        axis.title.x = element_text(size = 11),
        axis.text.y = element_text(size = 11, face = "bold"),
        axis.text.x = element_text(size = 11, face = "bold"),
        plot.margin = unit(c(0.5, 0, 0, 0), "cm"))

#Draw multipanel figure with cowplot with the figures above
top_row <- plot_grid(venn_bac, venn_human, labels = c("A", "B"), label_size = 20, ncol = 2)
bottom_row <- plot_grid(overlap_taxa_colblot, overlap_GO_BP_colplot, overlap_GO_MF_colplot,
                        labels = c("C", "D", "E"), label_size = 20, ncol = 3,
                        rel_widths = c(0.8, 1.3, 1.1))
tiff(file = "your_output_folder\\overlap_multipanel.tiff",
     res = 300, width = 13, height = 5, units = 'in')
#Explicit print() so the figure is also drawn when the script is run with source()
print(plot_grid(top_row, bottom_row, ncol = 1))
dev.off()