#BACTERIAL EXTRACELLULAR VESICLES OF THE FETAL ENVIRONMENT PROJECT, proteomic data analysis
#Updated 20.09.2022 Anna Kaisanlahti, Reunanen group

#Copyright © 2022 Anna Kaisanlahti

#Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files 
#(“Kaisanlahti_AMBI_proteomics.R”), to deal in the Software without restriction, including without limitation the rights to use, 
#copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 
#furnished to do so, subject to the following conditions:
  
#The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

#THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
#MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
#CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#The following packages are needed in the analysis; any that are missing from your R installation are installed here
#Packages that are already installed are left untouched, so re-running the script does not reinstall everything
#"stats" is part of base R and cannot be installed from CRAN, so it is not listed
#"ggplot2" and "gplots" are listed because they are loaded with library() in this script
required_packages <- c(
  "data.table",
  "tidyverse",
  "tidyr",
  "gtools",
  "RColorBrewer",
  "cowplot",
  "compositions",
  "ggplot2",
  "ggVennDiagram",
  "gplots"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

#Import R packages to your R session
library("data.table") 
library("tidyverse")
library("tidyr")
library("gtools")
library("stats")
library("RColorBrewer")
library("cowplot")
library("compositions")
library("ggplot2")
library("ggVennDiagram")
library("gplots")

#SECTION 1.0, PREPARING THE DATA
#Load the 'proteins' tables exported from PEAKS, one for the swissprot results and one for the trEMBL results
#Change the two paths below to point to your own protein files; empty cells are read in as NA
swiss_prot_results_PEAKS_file <- read.csv(file = "swiss\\proteins.csv", na.strings = "")

trEMBL_results_PEAKS_file <- read.csv(file = "trEMBL\\proteins.csv", na.strings = "")

#Split the Accession column at the '|' character into the columns UniProt_ID and UniProt_accession
separated_data_swiss <- swiss_prot_results_PEAKS_file %>%
  separate(col = Accession, into = c("UniProt_ID", "UniProt_accession"), sep = "\\|")
separated_data_trEMBL <- trEMBL_results_PEAKS_file %>%
  separate(col = Accession, into = c("UniProt_ID", "UniProt_accession"), sep = "\\|")

#Separate samples to individual files in swiss prot data
#Keep the identifier column and the per-sample coverage columns, remove score and overall coverage columns
#Switch column indexes according to your data
sample_columns_swiss <- separated_data_swiss[ , 4:59]
sample_columns_swiss$X.10lgP <- NULL
sample_columns_swiss$Coverage.... <- NULL

#Create one two-column data frame (identifier column + one sample coverage column) per sample
#The sample columns are counted with ncol(), so this also works when only a single sample column is left
sample_list_swiss <- lapply(seq_len(ncol(sample_columns_swiss) - 1), function(i) {
  sample_columns_swiss[, c(1, i + 1)]
})

#Filter out the rows with 0 values (proteins not present in the sample or coverage <1%)
#Rows with a missing (NA) coverage value are dropped as well
sample_list_filtered_swiss <- lapply(sample_list_swiss, function(sample_df) {
  coverage <- sample_df[[2]]
  sample_df[!is.na(coverage) & coverage > 0, , drop = FALSE]
})

#Subset the uniprot_IDs as a list and name them according to sample names in original result file
protein_IDs_swiss <- lapply(sample_list_filtered_swiss, "[[", "UniProt_accession")
names(protein_IDs_swiss) <- names(swiss_prot_results_PEAKS_file[6:58])

#Separate samples to individual files in trEMBL prot data
#Keep the identifier column and the per-sample coverage columns, remove score and overall coverage columns
sample_columns_trEMBL <- separated_data_trEMBL[ , 4:59]
sample_columns_trEMBL$X.10lgP <- NULL
sample_columns_trEMBL$Coverage.... <- NULL

#Create one two-column data frame (identifier column + one sample coverage column) per sample
#The sample columns are counted with ncol(), so this also works when only a single sample column is left
sample_list_trEMBL <- lapply(seq_len(ncol(sample_columns_trEMBL) - 1), function(i) {
  sample_columns_trEMBL[, c(1, i + 1)]
})

#Filter out the rows with 0 values (proteins not present in the sample or coverage <1%)
#Rows with a missing (NA) coverage value are dropped as well
sample_list_filtered_trEMBL <- lapply(sample_list_trEMBL, function(sample_df) {
  coverage <- sample_df[[2]]
  sample_df[!is.na(coverage) & coverage > 0, , drop = FALSE]
})

#Subset the uniprot_IDs as a list and name them according to sample names in original result file
#NOTE(review): the sample names are taken from the swissprot export; this assumes both exports
#list the same samples in the same order - confirm
protein_IDs_trEMBL <- lapply(sample_list_filtered_trEMBL, "[[", "UniProt_accession")
names(protein_IDs_trEMBL) <- names(swiss_prot_results_PEAKS_file[6:58])

#Export the protein ID lists, one file per sample and database, into the output folder
#paste() joins with its default separator (a single space), so the file names look like "swiss_ <sample name>"
setwd("your_output_folder")
mapply(
  write.table,
  protein_IDs_swiss,
  file = paste("swiss_", names(swiss_prot_results_PEAKS_file[6:58])),
  MoreArgs = list(row.names = FALSE, col.names = FALSE, quote = FALSE)
)
mapply(
  write.table,
  protein_IDs_trEMBL,
  file = paste("trEMBL_", names(swiss_prot_results_PEAKS_file[6:58])),
  MoreArgs = list(row.names = FALSE, col.names = FALSE, quote = FALSE)
)

#SECTION 2.0, protein data fetching, filtration and calculating statistics

#AT THIS STEP MOVE TO BASH AND DO THE FOLLOWING STEPS:
#1. Concatenate the swissProt and trEMBL result files of each sample
#2. Retrieve protein info from UniProt using the exported and combined result files, using the ID mapping tool from:
#https://www.uniprot.org/id-mapping

#Collect the full paths of the protein data files fetched from UniProt
uniprot_data_files <- list.files("your_output_folder\\sample_data_from_uniprot", full.names = TRUE)
#Sort the paths so that embedded numbers are ordered numerically
uniprot_files <- mixedsort(uniprot_data_files)

#Use the bare file names (without folder) as names for the imported tables
sample_file_names <- lapply(uniprot_files, basename)

#Read every tab-separated file into a data frame and label it with its file name
uniprot_data_unfiltered <- setNames(
  lapply(uniprot_files, function(path) {
    read.csv2(
      path,
      sep = "\t",
      header = TRUE,
      check.names = FALSE,
      na.strings = c("", "NA"),
      stringsAsFactors = TRUE
    )
  }),
  sample_file_names
)

#Split the imported tables into amniotic fluid (AM) and fecal (FE) groups by position
#(check the sample names and numbers from PEAKS if needed)
AM_samples_unfiltered <- uniprot_data_unfiltered[1:27]
FE_samples_unfiltered <- uniprot_data_unfiltered[28:53]

#The last table of each group is the empty sample (=negative control); take its Entry column
#(AM group element 27 and FE group element 26 are elements 27 and 53 of the full list)
empty_AM <- AM_samples_unfiltered[[27]]$Entry
empty_FE <- FE_samples_unfiltered[[26]]$Entry

#Export negative controls if needed
setwd("your_output_folder")
write.table(empty_AM, file = "empty_AM_proteins", row.names = FALSE, col.names = FALSE, quote = FALSE)
write.table(empty_FE, file = "empty_FE_proteins", row.names = FALSE, col.names = FALSE, quote = FALSE)

#Remove from every sample the proteins whose Entry also occurs in the group's negative control sample
#unname() keeps the result an unnamed list
AM_samples <- lapply(unname(AM_samples_unfiltered), function(sample_df) {
  in_control <- sample_df$Entry %in% AM_samples_unfiltered[[27]]$Entry
  sample_df[!in_control, , drop = FALSE]
})
FE_samples <- lapply(unname(FE_samples_unfiltered), function(sample_df) {
  in_control <- sample_df$Entry %in% FE_samples_unfiltered[[26]]$Entry
  sample_df[!in_control, , drop = FALSE]
})

#Drop the negative control samples themselves from the sample lists
AM_samples[27] <- NULL
FE_samples[26] <- NULL

#Separate bacterial and human proteins in samples
#The 8th column of each sample table is compared to "Bacteria" / "Eukaryota"; rows where it is NA are dropped
AM_samples_bacteria <- lapply(AM_samples, function(sample_df) {
  sample_df[which(sample_df[[8]] == "Bacteria"), , drop = FALSE]
})
AM_samples_human <- lapply(AM_samples, function(sample_df) {
  sample_df[which(sample_df[[8]] == "Eukaryota"), , drop = FALSE]
})
FE_samples_bacteria <- lapply(FE_samples, function(sample_df) {
  sample_df[which(sample_df[[8]] == "Bacteria"), , drop = FALSE]
})
FE_samples_human <- lapply(FE_samples, function(sample_df) {
  sample_df[which(sample_df[[8]] == "Eukaryota"), , drop = FALSE]
})

#Pool the Entry columns of all samples in a group and keep each protein ID once
AM_bacterial_proteins_all <- unique(unlist(lapply(AM_samples_bacteria, function(sample_df) sample_df[["Entry"]])))
AM_human_proteins_all <- unique(unlist(lapply(AM_samples_human, function(sample_df) sample_df[["Entry"]])))
FE_bacterial_proteins_all <- unique(unlist(lapply(FE_samples_bacteria, function(sample_df) sample_df[["Entry"]])))
FE_human_proteins_all <- unique(unlist(lapply(FE_samples_human, function(sample_df) sample_df[["Entry"]])))

#Write the unique protein ID lists to the output folder, one ID per line
setwd("your_output_folder")
write.table(AM_bacterial_proteins_all, file = "AM_bacterial_proteins_all",
            row.names = FALSE, col.names = FALSE, quote = FALSE)
write.table(AM_human_proteins_all, file = "AM_human_proteins_all",
            row.names = FALSE, col.names = FALSE, quote = FALSE)
write.table(FE_bacterial_proteins_all, file = "FE_bacterial_proteins_all",
            row.names = FALSE, col.names = FALSE, quote = FALSE)
write.table(FE_human_proteins_all, file = "FE_human_proteins_all",
            row.names = FALSE, col.names = FALSE, quote = FALSE)

#Calculate protein counts from each sample group (number of unique protein IDs)
AM_bacterial_proteins_all_count <- length(AM_bacterial_proteins_all)
AM_human_proteins_all_count <- length(AM_human_proteins_all)
FE_bacterial_proteins_all_count <- length(FE_bacterial_proteins_all)
FE_human_proteins_all_count <- length(FE_human_proteins_all)

#Calculate mean and range of bacterial and human proteins in individual samples
#The number of proteins in a sample is the number of rows of its table
#mean (rounded to a whole number)
AM_samples_bacteria_mean <- AM_samples_bacteria %>% lapply(nrow) %>% unlist() %>% mean() %>% round()
AM_samples_human_mean <- AM_samples_human %>% lapply(nrow) %>% unlist() %>% mean() %>% round()
FE_samples_bacteria_mean <- FE_samples_bacteria %>% lapply(nrow) %>% unlist() %>% mean() %>% round()
FE_samples_human_mean <- FE_samples_human %>% lapply(nrow) %>% unlist() %>% mean() %>% round()
#range (smallest and largest per-sample protein count)
AM_samples_bacteria_range <- AM_samples_bacteria %>% lapply(nrow) %>% unlist() %>% range()
AM_samples_human_range <- AM_samples_human %>% lapply(nrow) %>% unlist() %>% range()
FE_samples_bacteria_range <- FE_samples_bacteria %>% lapply(nrow) %>% unlist() %>% range()
FE_samples_human_range <- FE_samples_human %>% lapply(nrow) %>% unlist() %>% range()

#SECTION 3.0, SAMPLE DISTRIBUTION, TAXONOMY AND FUNCTION OF AMNIOTIC FLUID AND FECAL
#EV BACTERIAL PROTEINS

#SAMPLE DISTRIBUTION CHARTS
#Subset 'entry' columns from each data frame in AM bacterial data frame list
AM_samples_bacteria_entry_columns <- lapply(AM_samples_bacteria, "[[", "Entry")

#AM bacterial proteins distribution chart
#One row per unique protein and one 0/1 column per sample (c1, c2, ...); 1 = the protein is present in that sample
#The columns are generated from the sample list itself, so the number of samples is not hard-coded
presence_columns_AM_bacteria <- lapply(AM_samples_bacteria_entry_columns, function(sample_entries) {
  as.integer(AM_bacterial_proteins_all %in% sample_entries)
})
names(presence_columns_AM_bacteria) <- paste0("c", seq_along(presence_columns_AM_bacteria))
protein_distribution_AM_bacteria <- data.frame(AM_bacterial_proteins_all, presence_columns_AM_bacteria)

#Calculate rowsums (number of samples containing the protein) over all sample columns
#(everything except the first column) and append the result as a new column
rowsums_column_AM_bacteria <- rowSums(protein_distribution_AM_bacteria[-1])
protein_distribution_AM_bacteria <- cbind(protein_distribution_AM_bacteria, rowsums_column_AM_bacteria)

#Sort the data table by rowsums (proteins found in most samples first) and export it as a tab-separated file
protein_distribution_AM_bacteria <- protein_distribution_AM_bacteria[order(-protein_distribution_AM_bacteria$rowsums_column_AM_bacteria),]
write.table(protein_distribution_AM_bacteria, file = "your_output_folder\\AM_bacteria_protein_distribution", row.names = FALSE, col.names = TRUE, quote=FALSE, sep = "\t")

#AM, HUMAN PROTEINS
#Subset 'entry' columns from each data frame in AM human data frame list
AM_samples_human_entry_columns <- lapply(AM_samples_human, "[[", "Entry")

#AM samples, human proteins distribution chart
#One row per unique protein and one 0/1 column per sample (c1, c2, ...); 1 = the protein is present in that sample
#The columns are generated from the sample list itself, so the number of samples is not hard-coded
presence_columns_AM_human <- lapply(AM_samples_human_entry_columns, function(sample_entries) {
  as.integer(AM_human_proteins_all %in% sample_entries)
})
names(presence_columns_AM_human) <- paste0("c", seq_along(presence_columns_AM_human))
protein_distribution_AM_human <- data.frame(AM_human_proteins_all, presence_columns_AM_human)

#Calculate rowsums (number of samples containing the protein) over all sample columns
#(everything except the first column) and append the result as a new column
rowsums_column_AM_human <- rowSums(protein_distribution_AM_human[-1])
protein_distribution_AM_human <- cbind(protein_distribution_AM_human, rowsums_column_AM_human)

#Sort the data table by rowsums (proteins found in most samples first) and export it as a tab-separated file
protein_distribution_AM_human <- protein_distribution_AM_human[order(-protein_distribution_AM_human$rowsums_column_AM_human),]
write.table(protein_distribution_AM_human, file = "your_output_folder\\AM_human_protein_distribution", row.names = FALSE, col.names = TRUE, quote=FALSE, sep = "\t")

#Fecal samples, bacterial proteins
#Subset 'entry' columns from each data frame in FE bacterial data frame list
FE_samples_bacteria_entry_columns <- lapply(FE_samples_bacteria, "[[", "Entry")

#FE bacterial proteins distribution chart
#One row per unique protein and one 0/1 column per sample (c1, c2, ...); 1 = the protein is present in that sample
#The columns are generated from the sample list itself, so the number of samples is not hard-coded
presence_columns_FE_bacteria <- lapply(FE_samples_bacteria_entry_columns, function(sample_entries) {
  as.integer(FE_bacterial_proteins_all %in% sample_entries)
})
names(presence_columns_FE_bacteria) <- paste0("c", seq_along(presence_columns_FE_bacteria))
protein_distribution_FE_bacteria <- data.frame(FE_bacterial_proteins_all, presence_columns_FE_bacteria)

#Calculate rowsums (number of samples containing the protein) over all sample columns
#(everything except the first column) and append the result as a new column
rowsums_column_FE_bacteria <- rowSums(protein_distribution_FE_bacteria[-1])
protein_distribution_FE_bacteria <- cbind(protein_distribution_FE_bacteria, rowsums_column_FE_bacteria)

#Sort the data table by rowsums (proteins found in most samples first) and export it as a tab-separated file
protein_distribution_FE_bacteria <- protein_distribution_FE_bacteria[order(-protein_distribution_FE_bacteria$rowsums_column_FE_bacteria),]
write.table(protein_distribution_FE_bacteria, file = "your_output_folder\\FE_bacteria_protein_distribution", row.names = FALSE, col.names = TRUE, quote=FALSE, sep = "\t")

#Fecal samples, human proteins
#Subset 'entry' columns from each data frame in FE human data frame list
FE_samples_human_entry_columns <- lapply(FE_samples_human, "[[", "Entry")

#FE human proteins distribution chart
#One row per unique protein and one 0/1 column per sample (c1, c2, ...); 1 = the protein is present in that sample
#The columns are generated from the sample list itself, so the number of samples is not hard-coded
presence_columns_FE_human <- lapply(FE_samples_human_entry_columns, function(sample_entries) {
  as.integer(FE_human_proteins_all %in% sample_entries)
})
names(presence_columns_FE_human) <- paste0("c", seq_along(presence_columns_FE_human))
protein_distribution_FE_human <- data.frame(FE_human_proteins_all, presence_columns_FE_human)

#Calculate rowsums (number of samples containing the protein) over all sample columns
#(everything except the first column) and append the result as a new column
rowsums_column_FE_human <- rowSums(protein_distribution_FE_human[-1])
protein_distribution_FE_human <- cbind(protein_distribution_FE_human, rowsums_column_FE_human)

#Sort the data table by rowsums (proteins found in most samples first) and export it as a tab-separated file
protein_distribution_FE_human <- protein_distribution_FE_human[order(-protein_distribution_FE_human$rowsums_column_FE_human),]
write.table(protein_distribution_FE_human, file = "your_output_folder\\FE_human_protein_distribution", row.names = FALSE, col.names = TRUE, quote=FALSE, sep = "\t")

#Analyze taxonomy of the bacterial proteins
#List unique phyla present in the samples
#(unique values of the "Taxonomic lineage (PHYLUM)" column, pooled over all samples of a group)
phyla_list_AM <- unique(unlist(lapply(AM_samples_bacteria, "[[", "Taxonomic lineage (PHYLUM)")))
phyla_list_FE <- unique(unlist(lapply(FE_samples_bacteria, "[[", "Taxonomic lineage (PHYLUM)")))

#Gather phyla present in individual samples in the sample groups
#Per sample: take the phylum column, wrap it in a data frame, tabulate it with table(),
#and convert the resulting table back to a data frame (one row per phylum plus a Freq column)
phylum_columns_AM <- lapply(AM_samples_bacteria, "[[", "Taxonomic lineage (PHYLUM)")
phylum_columns_FE <- lapply(FE_samples_bacteria, "[[", "Taxonomic lineage (PHYLUM)")
phyla_count_AM <- lapply(phylum_columns_AM, as.data.frame)
phyla_count_FE <- lapply(phylum_columns_FE, as.data.frame)
phyla_count_tables_AM <- lapply(phyla_count_AM, table)
phyla_count_tables_FE <- lapply(phyla_count_FE, table)
phyla_count_tables_AM <- lapply(phyla_count_tables_AM, as.data.frame)
phyla_count_tables_FE <- lapply(phyla_count_tables_FE, as.data.frame)

#Rename columns and data frames
#Column "X..i.." becomes "PHYLUM" and column "Freq" becomes "Proteins" in every count table
#NOTE(review): the rename looks for a column literally called "X..i.."; presumably this is the name R
#generates because as.data.frame() is called through lapply() above - confirm before restructuring those calls
#With skip_absent = TRUE a column name that is not found is skipped instead of raising an error
phyla_count_tables_AM <- lapply(phyla_count_tables_AM, setnames, c("X..i..", "Freq"), c("PHYLUM", "Proteins"), skip_absent = TRUE)
phyla_count_tables_FE <- lapply(phyla_count_tables_FE, setnames, c("X..i..", "Freq"), c("PHYLUM", "Proteins"), skip_absent = TRUE)

#Sort every per-sample phylum table so that the phyla with the most proteins come first
phyla_count_tables_AM <- lapply(phyla_count_tables_AM, function(count_table) {
  arrange(count_table, desc(Proteins))
})
phyla_count_tables_FE <- lapply(phyla_count_tables_FE, function(count_table) {
  arrange(count_table, desc(Proteins))
})

#Prepare initial data frame where to merge phyla count tables
#It holds every phylum of the sample group once, plus a placeholder column "test" that is removed again below
initial_table_AM <- data.frame(PHYLUM = phyla_list_AM, test = rep.int(0, length(phyla_list_AM)), row.names = NULL)
initial_table_FE <- data.frame(PHYLUM = phyla_list_FE, test = rep.int(0, length(phyla_list_FE)), row.names = NULL)

#Adding initial help table to the front of the list of count tables to match row numbers
initial_table_and_phylum_count_tables_list_AM <- append(phyla_count_tables_AM, list(initial_table_AM), 0)
initial_table_and_phylum_count_tables_list_FE <- append(phyla_count_tables_FE, list(initial_table_FE), 0)

#Merge sample phyla columns
#Every count table is left-joined onto the help table by PHYLUM, one sample after the other; because the
#help table lists all phyla, no phylum is lost. Only "by" is passed on to left_join(): the merge()-style
#arguments all/all.x/all.y are not left_join() arguments and are rejected as unused by recent dplyr versions
combined_phyla_table_AM <- reduce(initial_table_and_phylum_count_tables_list_AM, left_join, by = "PHYLUM")
combined_phyla_table_FE <- reduce(initial_table_and_phylum_count_tables_list_FE, left_join, by = "PHYLUM")

#Getting rid of initial data frame column
combined_phyla_table_AM$test <- NULL
combined_phyla_table_FE$test <- NULL

#Replace NA with 0 in the sample columns (columns 2-27 for AM, 2-26 for FE); the PHYLUM column is left as it is
combined_phyla_table_AM[, 2:27] <- replace(
  combined_phyla_table_AM[, 2:27],
  is.na(combined_phyla_table_AM[, 2:27]),
  0
)
combined_phyla_table_FE[, 2:26] <- replace(
  combined_phyla_table_FE[, 2:26],
  is.na(combined_phyla_table_FE[, 2:26]),
  0
)

#Drop the row whose PHYLUM is NA
combined_phyla_table_AM <- combined_phyla_table_AM[!is.na(combined_phyla_table_AM$PHYLUM), , drop = FALSE]
combined_phyla_table_FE <- combined_phyla_table_FE[!is.na(combined_phyla_table_FE$PHYLUM), , drop = FALSE]

#Setting sample names for column names
#AM: PHYLUM, AM01, AM02, AM03A, AM03B, then AM04 ... AM25 (27 names in total)
names(combined_phyla_table_AM) <- c(
  "PHYLUM",
  "AM01", "AM02", "AM03A", "AM03B",
  sprintf("AM%02d", 4:25)
)

#FE: PHYLUM, then FE01 ... FE25 (26 names in total)
names(combined_phyla_table_FE) <- c("PHYLUM", sprintf("FE%02d", 1:25))

#Export phyla tables as tab-separated files with a header row
write.table(
  combined_phyla_table_AM,
  file = "your_output_folder\\phyla_map_table_AM",
  row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t"
)
write.table(
  combined_phyla_table_FE,
  file = "your_output_folder\\phyla_map_table_FE",
  row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t"
)

#DRAWING HEATMAP PRESENTATION OF BACTERIAL PROTEIN TAXONOMY
#Join the amniotic fluid and fecal phyla tables on PHYLUM, keeping phyla that occur in only one of them
combined_heatmap_AM_FE <- merge(
  x = combined_phyla_table_AM,
  y = combined_phyla_table_FE,
  by = "PHYLUM",
  all = TRUE
)
#Phyla missing from one group get 0 instead of NA
combined_heatmap_AM_FE[is.na(combined_heatmap_AM_FE)] <- 0
#Keep only phyla with at least 10 protein hits in total across all samples
combined_heatmap_AM_FE_filtered <- combined_heatmap_AM_FE %>%
  filter(rowSums(combined_heatmap_AM_FE[, -1]) >= 10)

#Move PHYLUM into the row names so that only numeric columns remain,
#then add the pseudo count +0.5 and transform the hit values to log10 in one step
rownames(combined_heatmap_AM_FE_filtered) <- combined_heatmap_AM_FE_filtered[, 1]
combined_heatmap_AM_FE_filtered[, 1] <- NULL
combined_heatmap_AM_FE_filtered <- as.data.frame(log10(as.matrix(combined_heatmap_AM_FE_filtered + 0.5)))
#Restore PHYLUM as a regular column (now the last one)
combined_heatmap_AM_FE_filtered$PHYLUM <- rownames(combined_heatmap_AM_FE_filtered)

#Reshape to long format for ggplot2: one row per phylum and sample, value in column "proteins"
combined_heatmap_AM_FE_modified <- pivot_longer(
  combined_heatmap_AM_FE_filtered,
  cols = !PHYLUM,
  names_to = "sample",
  values_to = "proteins"
)

#FUNCTIONAL CHARACTERISTICS OF AMNIOTIC FLUID AND FECAL EV BACTERIAL PROTEINS

#Process GO, biological process columns and make data table with counts
#Steps per sample: keep only the GO biological process column, rename it to GO_BP,
#split cells holding several terms separated by "; " into one row per term;
#finally count how often each term occurs over all samples of the group
#Amniotic fluid
AM_BP_columns <- lapply(AM_samples_bacteria, function(sample_df) {
  sample_df[, "Gene ontology (biological process)", drop = FALSE]
})
AM_BP_columns_renamed <- lapply(AM_BP_columns, function(bp_column) {
  rename(bp_column, GO_BP = "Gene ontology (biological process)")
})
AM_BP_columns_fixed <- lapply(AM_BP_columns_renamed, function(bp_column) {
  separate_rows(bp_column, GO_BP, sep = "; ")
})
AM_BP_table <- as.data.frame(table(unlist(AM_BP_columns_fixed)))
#Fecal samples
FE_BP_columns <- lapply(FE_samples_bacteria, function(sample_df) {
  sample_df[, "Gene ontology (biological process)", drop = FALSE]
})
FE_BP_columns_renamed <- lapply(FE_BP_columns, function(bp_column) {
  rename(bp_column, GO_BP = "Gene ontology (biological process)")
})
FE_BP_columns_fixed <- lapply(FE_BP_columns_renamed, function(bp_column) {
  separate_rows(bp_column, GO_BP, sep = "; ")
})
FE_BP_table <- as.data.frame(table(unlist(FE_BP_columns_fixed)))

#Calculate % for how many proteins GOs exist
#percent = (all protein entries - entries with a missing (NA) GO biological process annotation) / all protein entries * 100
#Both the numerator and the denominator use the biological process (BP) length calculated here
#Amniotic fluid
AM_BP_NAs <- sum(is.na(unlist(AM_BP_columns_renamed)))
AM_BP_length <- length(unlist(AM_BP_columns_renamed))
AM_BP_percent <- (AM_BP_length - AM_BP_NAs) / AM_BP_length * 100
#Fecal samples
FE_BP_NAs <- sum(is.na(unlist(FE_BP_columns_renamed)))
FE_BP_length <- length(unlist(FE_BP_columns_renamed))
FE_BP_percent <- (FE_BP_length - FE_BP_NAs) / FE_BP_length * 100

#Export GO biological process lists for sample groups
write.table(AM_BP_table, file = "your_output_folder\\AM_BP", row.names = FALSE, col.names = TRUE, quote=FALSE, sep = "\t")
write.table(FE_BP_table, file = "your_output_folder\\FE_BP", row.names = FALSE, col.names = TRUE, quote=FALSE, sep = "\t")

#The GO annotations of the identified proteins were manually traced to the GO terms next to root "biological process" using
#https://www.ebi.ac.uk/QuickGO/annotations
#The manual annotation data was written to files "AM_BP_hierarchy_df.txt" and "FE_BP_hierarchy_df.txt"

#Read the manually annotated GO tables (tab-separated, with header; empty cells and "NA" become NA)
AM_BP_hierarchy_df <- read.csv2(
  file = "your_output_folder\\AM_BP_hierarchy_df.txt",
  sep = "\t", header = TRUE, check.names = FALSE, na.strings = c("", "NA")
)
FE_BP_hierarchy_df <- read.csv2(
  file = "your_output_folder\\FE_BP_hierarchy_df.txt",
  sep = "\t", header = TRUE, check.names = FALSE, na.strings = c("", "NA")
)

#Collect the distinct GO terms found in columns 3-5 of the manually annotated tables, without NA
#Amniotic fluid
AM_uniq_BP <- unique(as.vector(as.matrix(AM_BP_hierarchy_df[, 3:5])))
AM_uniq_BP <- AM_uniq_BP[!is.na(AM_uniq_BP)]
#Fecal samples
FE_uniq_BP <- unique(as.vector(as.matrix(FE_BP_hierarchy_df[, 3:5])))
FE_uniq_BP <- FE_uniq_BP[!is.na(FE_uniq_BP)]
#Terms of both sample groups combined, each term once
AM_FE_uniq_BP <- unique(c(AM_uniq_BP, FE_uniq_BP))

#Helper: total count for one GO term in a manually annotated hierarchy table
#hierarchy_df: table with the columns level_2A, level_2B and level_2C; its 2nd column holds the values that are summed
#term:         GO term to look for
#Returns the sum of column 2 over all rows where any of the three level_2 columns equals the term
#(rows where the comparison is NA are ignored)
count_level2_term <- function(hierarchy_df, term) {
  term_hit <- hierarchy_df$level_2A == term |
    hierarchy_df$level_2B == term |
    hierarchy_df$level_2C == term
  sum(hierarchy_df[which(term_hit), 2])
}

#Calculate biological process GO counts for amniotic fluid samples
#The counts are calculated for the terms of AM_FE_uniq_BP, in the order of AM_FE_uniq_BP, so that every count
#ends up next to its own term in the table below (a fixed, hand-written term order can differ from the order
#in which the terms appear in the data, which would pair counts with the wrong term names)
AM_GOs_BP <- vapply(
  AM_FE_uniq_BP,
  function(term) count_level2_term(AM_BP_hierarchy_df, term),
  numeric(1),
  USE.NAMES = FALSE
)

#Calculate biological process GO counts for fecal samples (same terms, same order)
FE_GOs_BP <- vapply(
  AM_FE_uniq_BP,
  function(term) count_level2_term(FE_BP_hierarchy_df, term),
  numeric(1),
  USE.NAMES = FALSE
)

#Create data table for GO counts: one row per GO term with its amniotic fluid and fecal counts
AM_FE_BP_hierarchy_counts_table <- data.frame(AM_FE_uniq_BP,AM_GOs_BP, FE_GOs_BP)

#Add a pseudocount of 0.5 to the GO counts and convert them to log10
#The GO terms (column 1) are moved to the row names for the numeric transformation and restored as a column afterwards
AM_FE_BP_hierarchy_counts_table <- local({
  count_table <- AM_FE_BP_hierarchy_counts_table
  rownames(count_table) <- count_table[, 1]
  count_table[, 1] <- NULL
  log_table <- as.data.frame(log10(as.matrix(count_table + 0.5)))
  log_table$AM_FE_uniq_BP <- rownames(log_table)
  log_table
})

#Split long GO name to 2 rows
AM_FE_BP_hierarchy_counts_table$AM_FE_uniq_BP <- replace(
  AM_FE_BP_hierarchy_counts_table$AM_FE_uniq_BP,
  AM_FE_BP_hierarchy_counts_table$AM_FE_uniq_BP == "biological process involved in interspecies interaction between organisms",
  "biological process involved in \n interspecies interaction between organisms"
)

#Transform data to long format for ggplot: one row per GO term and count column
AM_FE_BP_hierarchy_counts_table_mofidied <- pivot_longer(
  AM_FE_BP_hierarchy_counts_table,
  !AM_FE_uniq_BP,
  values_to = "counts",
  names_to = "group"
)

#Process GO molecular function columns and make data table with counts
#seq_along() is used for the indices so that an empty sample list gives empty results
#instead of the out-of-bounds error caused by 1:length(x) evaluating to c(1, 0)
#Amniotic fluid
#Keep only the GO molecular function column of every sample table
AM_MF_columns <- lapply(seq_along(AM_samples_bacteria), function(i) {
  subset(AM_samples_bacteria[[i]], select = "Gene ontology (molecular function)")
})
#Give the column a syntactic name
AM_MF_columns_renamed <- lapply(seq_along(AM_MF_columns), function(i) {
  rename(AM_MF_columns[[i]], GO_MF = "Gene ontology (molecular function)")
})
#Split "; "-separated GO terms so that each term gets its own row
AM_MF_columns_fixed <- lapply(seq_along(AM_MF_columns_renamed), function(i) {
  separate_rows(AM_MF_columns_renamed[[i]], GO_MF, sep = "; ")
})
#Tabulate how often each GO term occurs over all samples
AM_MF_table <- as.data.frame(table(unlist(AM_MF_columns_fixed)))
#Fecal samples
FE_MF_columns <- lapply(seq_along(FE_samples_bacteria), function(i) {
  subset(FE_samples_bacteria[[i]], select = "Gene ontology (molecular function)")
})
FE_MF_columns_renamed <- lapply(seq_along(FE_MF_columns), function(i) {
  rename(FE_MF_columns[[i]], GO_MF = "Gene ontology (molecular function)")
})
FE_MF_columns_fixed <- lapply(seq_along(FE_MF_columns_renamed), function(i) {
  separate_rows(FE_MF_columns_renamed[[i]], GO_MF, sep = "; ")
})
FE_MF_table <- as.data.frame(table(unlist(FE_MF_columns_fixed)))

#Calculate the percentage of entries in the GO molecular function columns that are not missing
#The columns are taken before the GO terms are split to separate rows
#Amniotic fluid
AM_MF_NAs <- length(which(is.na(unlist(AM_MF_columns_renamed))))
AM_MF_length <- length(unlist(AM_MF_columns_renamed))
AM_MF_percent <- (AM_MF_length - AM_MF_NAs) / AM_MF_length * 100
#Fecal samples
FE_MF_NAs <- length(which(is.na(unlist(FE_MF_columns_renamed))))
FE_MF_length <- length(unlist(FE_MF_columns_renamed))
FE_MF_percent <- (FE_MF_length - FE_MF_NAs) / FE_MF_length * 100

#Export GO molecular function count tables for the sample groups as tab-separated text
write.table(
  AM_MF_table,
  file = "your_output_folder\\AM_MF",
  sep = "\t", quote = FALSE,
  row.names = FALSE, col.names = TRUE
)
write.table(
  FE_MF_table,
  file = "your_output_folder\\FE_MF",
  sep = "\t", quote = FALSE,
  row.names = FALSE, col.names = TRUE
)


#The GO annotations of the identified proteins were manually traced to the GO terms next to root "molecular function" using
#https://www.ebi.ac.uk/QuickGO/annotations
#The manual annotation data was written to files "AM_MF_hierarchy_df.txt" and "FE_MF_hierarchy_df.txt"

#Import manually annotated GO data tables (tab-separated, empty cells and "NA" read as missing)
AM_MF_hierarchy_df <- read.csv2(
  "Z:\\bEVs of foetal environment project, Anna Kaisanlahti\\proteomics\\R_analysis_files\\results\\GO\\AM_MF_hierarchy_df.txt",
  header = TRUE, sep = "\t", na.strings = c("", "NA"), check.names = FALSE
)
FE_MF_hierarchy_df <- read.csv2(
  "Z:\\bEVs of foetal environment project, Anna Kaisanlahti\\proteomics\\R_analysis_files\\results\\GO\\FE_MF_hierarchy_df.txt",
  header = TRUE, sep = "\t", na.strings = c("", "NA"), check.names = FALSE
)

#Collect the unique GO terms present in columns 3-4 of the manually annotated GO data
#Amniotic fluid
AM_uniq_MF <- unique(as.vector(as.matrix(AM_MF_hierarchy_df[, 3:4])))
AM_uniq_MF <- AM_uniq_MF[!is.na(AM_uniq_MF)]
#Fecal samples
FE_uniq_MF <- unique(as.vector(as.matrix(FE_MF_hierarchy_df[, 3:4])))
FE_uniq_MF <- FE_uniq_MF[!is.na(FE_uniq_MF)]
#Pool the terms of both sample groups, keeping each term once
AM_FE_uniq_MF <- unique(c(AM_uniq_MF, FE_uniq_MF))

#Calculate molecular function GO counts for amniotic fluid and fecal samples
#The counts are computed once per term of AM_FE_uniq_MF, in that order, so that every count
#stays aligned with its row label in the table below regardless of the order of the terms in the data
#hierarchy_df: manually annotated table with the count in column 2 and annotation columns level_2A, level_2B
#terms: character vector of GO terms to count
#Returns a numeric vector with one summed count per term (0 when no row carries the term)
sum_MF_term_counts <- function(hierarchy_df, terms) {
  level_matrix <- as.matrix(hierarchy_df[, c("level_2A", "level_2B"), drop = FALSE])
  vapply(terms, function(term) {
    #Missing annotations never match; a row is counted once even if the term occurs in both level columns
    term_rows <- rowSums(level_matrix == term, na.rm = TRUE) > 0
    sum(hierarchy_df[term_rows, 2])
  }, numeric(1), USE.NAMES = FALSE)
}
AM_GOs_MF <- sum_MF_term_counts(AM_MF_hierarchy_df, AM_FE_uniq_MF)
FE_GOs_MF <- sum_MF_term_counts(FE_MF_hierarchy_df, AM_FE_uniq_MF)

#Create data table for GO counts
AM_FE_MF_hierarchy_counts_table <- data.frame(AM_FE_uniq_MF, AM_GOs_MF, FE_GOs_MF)

#Add a pseudocount of 0.5 to the GO counts and convert them to log10
#The GO terms (column 1) are moved to the row names for the numeric transformation and restored as a column afterwards
AM_FE_MF_hierarchy_counts_table <- local({
  count_table <- AM_FE_MF_hierarchy_counts_table
  rownames(count_table) <- count_table[, 1]
  count_table[, 1] <- NULL
  log_table <- as.data.frame(log10(as.matrix(count_table + 0.5)))
  log_table$AM_FE_uniq_MF <- rownames(log_table)
  log_table
})

#Transform data to long format for ggplot: one row per GO term and count column
AM_FE_MF_hierarchy_counts_table_mofidied <- pivot_longer(
  AM_FE_MF_hierarchy_counts_table,
  !AM_FE_uniq_MF,
  values_to = "counts",
  names_to = "group"
)

#Draw multipanel figure with cowplot
#Amniotic fluid and fecal EV bacterial protein phylum heatmap and GOs

#Draw the phylum heatmap from the combined AM/FE table and store it in a variable
library(grid)
#Text labels for the two sample groups, added to the plot as custom annotations
AM_lab <- textGrob("AM EVs", gp = gpar(fontsize = 8))
FE_lab <- textGrob("FE EVs", gp = gpar(fontsize = 8))
AM_FE_heatmap <- ggplot(
  combined_heatmap_AM_FE_modified,
  aes(x = sample, y = reorder(PHYLUM, proteins), fill = proteins)
) +
  geom_tile() +
  #The labels are positioned at y = -2; clipping is switched off in coord_cartesian() below
  annotation_custom(AM_lab, xmin = 15, xmax = 15, ymin = -2, ymax = -2) +
  annotation_custom(FE_lab, xmin = 37.5, xmax = 37.5, ymin = -2, ymax = -2) +
  coord_cartesian(clip = "off") +
  scale_fill_distiller(palette = "RdPu") +
  ylab("PHYLUM") +
  guides(fill = guide_colourbar(title = "log10", barwidth = 0.5, barheight = 3)) +
  theme(
    panel.background = element_blank(),
    plot.margin = unit(c(0, 0, 0.2, 0), "cm"),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 8),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(size = 6, angle = 90, hjust = 1.5, face = "bold"),
    axis.text.y = element_text(size = 8, face = "bold")
  )

#Draw heatmap of the log10 GO biological process counts for AM+FE and store it in a variable
#GO terms on the y axis are ordered by their counts
AM_FE_GOs_BP <- ggplot(
  AM_FE_BP_hierarchy_counts_table_mofidied,
  aes(x = group, y = reorder(AM_FE_uniq_BP, counts), fill = counts)
) +
  geom_tile() +
  scale_x_discrete(labels = c("AM EVs", "FE EVs")) +
  scale_fill_distiller(palette = "RdPu") +
  guides(fill = guide_colourbar(title = "log10", barwidth = 0.5, barheight = 3)) +
  theme(
    panel.background = element_blank(),
    plot.margin = unit(c(0.5, 4, 0, 0), "cm"),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(angle = 90, size = 8, face = "bold"),
    axis.text.y = element_text(size = 8, face = "bold")
  )

#Draw heatmap of the log10 GO molecular function counts for AM+FE and store it in a variable
#GO terms on the y axis are ordered by their counts
AM_FE_GOs_MF <- ggplot(
  AM_FE_MF_hierarchy_counts_table_mofidied,
  aes(x = group, y = reorder(AM_FE_uniq_MF, counts), fill = counts)
) +
  geom_tile() +
  scale_x_discrete(labels = c("AM EVs", "FE EVs")) +
  scale_fill_distiller(palette = "RdPu") +
  guides(fill = guide_colourbar(title = "log10", barwidth = 0.5, barheight = 3)) +
  theme(
    panel.background = element_blank(),
    plot.margin = unit(c(0.5, 6, 0, 0), "cm"),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(angle = 90, size = 8, face = "bold"),
    axis.text.y = element_text(size = 8, face = "bold")
  )

#Plot the figures in 2 rows: phylum heatmap (A) on top, GO heatmaps (B, C) below
top_row <- plot_grid(AM_FE_heatmap, labels = c("A"), label_size = 14)
bottom_row <- plot_grid(AM_FE_GOs_BP, AM_FE_GOs_MF, labels = c("B", "C"), label_size = 14)
tiff(file="your_output_folder\\AM_FE_characterization_multipanel.tiff",
     res = 300, width = 10, height = 4, units = 'in')
#Print explicitly: a plot object is only drawn automatically when evaluated at the interactive top level,
#so without print() the tiff stays empty when the script is run through source()
print(plot_grid(top_row, bottom_row, ncol = 1))
dev.off()

#Section 4.0, characterize the bacterial proteins identified in amniotic fluid and fecal samples, overlap, GOs and taxonomy

#Export protein IDs that overlap in amniotic fluid and fecal sample groups
#Bacteria
bacterial_protein_IDs_overlapping_AM_FE <- intersect(as.character(AM_bacterial_proteins_all), as.character(FE_bacterial_proteins_all))
#Write the vector computed above (the export previously referred to a variable that is never defined)
write.table(bacterial_protein_IDs_overlapping_AM_FE, file = "your_output_folder\\overlap_AM_FE_bacteria", row.names = FALSE, col.names = TRUE, quote=FALSE, sep = "\t")

#Human
human_protein_IDs_overlapping_AM_FE <- intersect(as.character(AM_human_proteins_all), as.character(FE_human_proteins_all))
#Write the vector computed above (the export previously referred to a variable that is never defined)
write.table(human_protein_IDs_overlapping_AM_FE, file = "your_output_folder\\overlap_AM_FE_human", row.names = FALSE, col.names = TRUE, quote=FALSE, sep = "\t")

#Fetch overlapping bacterial protein data from UniProt using the exported overlap protein IDs with ID mapping tool from:
#https://www.uniprot.org/id-mapping

#Import the overlap data table exported from the UniProt ID retrieval website
#The file is tab-separated; empty cells are read as missing values and text columns stay character
overlap_bacterial_proteins_AM_FE <- read.csv2(
  "your_output_folder\\overlap_AM_FE_bacterial_data.txt",
  sep = "\t",
  na.strings = c(""),
  stringsAsFactors = FALSE
)

#Process GO, biological process columns and make data table with counts
overlap_bacterial_proteins_AM_FE_BP_column <- subset(overlap_bacterial_proteins_AM_FE, select = Gene.Ontology..biological.process.)
overlap_bacterial_proteins_AM_FE_BP_column <- rename(overlap_bacterial_proteins_AM_FE_BP_column, GO_BP = "Gene.Ontology..biological.process.")
#Split "; "-separated GO terms so that each term gets its own row
overlap_bacterial_proteins_AM_FE_BP_fixed <- separate_rows(overlap_bacterial_proteins_AM_FE_BP_column, GO_BP, sep = "; ")
overlap_bacterial_proteins_GO_BP_table <- as.data.frame(table(overlap_bacterial_proteins_AM_FE_BP_fixed))

#Calculate the percentage of entries in the GO biological process column that are not missing
overlap_BP_NAs <- sum(is.na(overlap_bacterial_proteins_AM_FE_BP_column))
overlap_BP_length <- length(unlist(overlap_bacterial_proteins_AM_FE_BP_column))
#Use the biological process totals; the molecular function totals belong to a later step and are not defined yet here
overlap_BP_percent <- (overlap_BP_length - overlap_BP_NAs) / overlap_BP_length * 100

#The GO annotations of the identified proteins were manually traced to the GO terms next to root "biological process" using
#https://www.ebi.ac.uk/QuickGO/annotations
#The manual annotation data was written to file "overlap_BP_hierarchy_df.txt"

#Import manually annotated GO data table (tab-separated, empty cells and "NA" read as missing)
overlap_BP_hierarchy_df <- read.csv2(
  "your_output_file\\overlap_BP_hierarchy_df.txt",
  header = TRUE, sep = "\t", na.strings = c("", "NA"), check.names = FALSE
)

#Collect the unique GO terms present in columns 3-6 of the manually annotated GO data
overlap_uniq_BP <- unique(as.vector(as.matrix(overlap_BP_hierarchy_df[, 3:6])))
overlap_uniq_BP <- overlap_uniq_BP[!is.na(overlap_uniq_BP)]

#Calculate biological process GO counts
#For each term below, sum column 2 over the rows in which any of level_2A-level_2D equals the term
#The counts are returned in the order in which the terms are listed here
overlap_GOs_BP <- local({
  BP_terms <- c(
    "response to stimulus",
    "cellular process",
    "biological process involved in interspecies interaction between organisms",
    "metabolic process",
    "localization",
    "developmental process",
    "biological regulation",
    "signaling"
  )
  unlist(lapply(BP_terms, function(term) {
    term_rows <- which(
      overlap_BP_hierarchy_df$level_2A == term |
        overlap_BP_hierarchy_df$level_2B == term |
        overlap_BP_hierarchy_df$level_2C == term |
        overlap_BP_hierarchy_df$level_2D == term
    )
    sum(overlap_BP_hierarchy_df[term_rows, 2])
  }))
})

#Create data table for GO counts, pairing the terms and counts by position
overlap_BP_hierarchy_counts_table <- data.frame(overlap_uniq_BP, overlap_GOs_BP)

#Add GO terms for 3 proteins missing from the current UniProt release
#Note the difference in database versions when analyzing proteomics data and fetching protein data from UniProt website
#These manually entered values replace the counts computed above, row by row
overlap_BP_hierarchy_counts_table$overlap_GOs_BP <- c(1, 51, 1, 53, 8, 3, 1, 1)

#Modify GO term name 'biological process involved in interspecies interaction between organisms' to be in 2 rows
overlap_BP_hierarchy_counts_table$overlap_uniq_BP <- replace(
  overlap_BP_hierarchy_counts_table$overlap_uniq_BP,
  overlap_BP_hierarchy_counts_table$overlap_uniq_BP == "biological process involved in interspecies interaction between organisms",
  "biological process involved in \n interspecies interaction between organisms"
)

#Overlap GOs, molecular function
#Extract the GO molecular function column, rename it to GO_MF and make data table with counts
overlap_bacterial_proteins_AM_FE_MF_column <- rename(
  subset(overlap_bacterial_proteins_AM_FE, select = Gene.Ontology..molecular.function.),
  GO_MF = "Gene.Ontology..molecular.function."
)
#Split "; "-separated GO terms so that each term gets its own row
overlap_bacterial_proteins_AM_FE_MF_fixed <- separate_rows(
  overlap_bacterial_proteins_AM_FE_MF_column,
  GO_MF,
  sep = "; "
)
overlap_bacterial_proteins_GO_MF_table <- as.data.frame(table(overlap_bacterial_proteins_AM_FE_MF_fixed))

#Calculate the percentage of entries in the GO molecular function column that are not missing
overlap_MF_NAs <- length(which(is.na(overlap_bacterial_proteins_AM_FE_MF_column)))
overlap_MF_length <- length(unlist(overlap_bacterial_proteins_AM_FE_MF_column))
overlap_MF_percent <- (overlap_MF_length - overlap_MF_NAs) / overlap_MF_length * 100

#The GO annotations of the identified proteins were manually traced to the GO terms next to root "molecular function" using
#https://www.ebi.ac.uk/QuickGO/annotations
#The manual annotation data was written to file "overlap_MF_hierarchy_df.txt"

#Import manually annotated GO data table (tab-separated, empty cells and "NA" read as missing)
overlap_MF_hierarchy_df <- read.csv2(
  "your_output_file\\overlap_MF_hierarchy_df.txt",
  header = TRUE, sep = "\t", na.strings = c("", "NA"), check.names = FALSE
)

#Collect the unique GO terms present in columns 3-4 of the manually annotated GO data
overlap_uniq_MF <- unique(as.vector(as.matrix(overlap_MF_hierarchy_df[, 3:4])))
overlap_uniq_MF <- overlap_uniq_MF[!is.na(overlap_uniq_MF)]

#Calculate molecular function GO counts
#For each term below, sum column 2 over the rows in which level_2A or level_2B equals the term
#The counts are returned in the order in which the terms are listed here
overlap_GOs_MF <- local({
  MF_terms <- c(
    "catalytic activity",
    "binding",
    "transporter activity",
    "structural molecule activity",
    "toxin activity",
    "translation regulator activity",
    "ATP-dependent activity"
  )
  unlist(lapply(MF_terms, function(term) {
    term_rows <- which(
      overlap_MF_hierarchy_df$level_2A == term |
        overlap_MF_hierarchy_df$level_2B == term
    )
    sum(overlap_MF_hierarchy_df[term_rows, 2])
  }))
})

#Create data table for GO counts, pairing the terms and counts by position
overlap_MF_hierarchy_counts_table <- data.frame(overlap_uniq_MF, overlap_GOs_MF)

#Add GO terms for 3 proteins missing from the current UniProt release
#Note the difference in database versions when analyzing proteomics data and fetching protein data from UniProt website
#These manually entered values replace the counts computed above, row by row
overlap_MF_hierarchy_counts_table$overlap_GOs_MF <- c(44, 50, 3, 15, 1, 1, 4)

#Overlap protein bacteria taxonomy
#From the overlapping protein identifications, taxonomy of bacterial proteins is checked manually
#from the UniProt ID mapping data table fetched with unique protein IDs in the sample groups
#https://www.uniprot.org/id-mapping

#Phyla level taxonomy data of bacterial proteins, entered by hand
#The two vectors are paired by position: the n-th frequency belongs to the n-th phylum
overlap_phylum <- c(
  "Firmicutes", "Actinobacteria", "Proteobacteria", "Spirochaetes",
  "Acidobacteria", "Cyanobacteria", "Bacteroidetes"
)
overlap_phylum_freq <- c(19, 9, 22, 1, 1, 7, 19)

#Construct data frame from the data above
overlap_taxa_df <- data.frame(
  overlap_phylum = overlap_phylum,
  overlap_phylum_freq = overlap_phylum_freq
)

#Venn diagrams with ggVennDiagram
#Draw Venn diagram of the bacterial proteins in amniotic fluid and fecal samples and store it in a variable
x_bac <- list(
  "AM EVs" = as.character(AM_bacterial_proteins_all),
  "FE EVs" = as.character(FE_bacterial_proteins_all)
)
venn_bac <- ggVennDiagram(x_bac, set_size = 5, label_size = 5, label_alpha = 0) +
  scale_fill_gradient(low = "pink1", high = "pink4") +
  scale_color_manual(values = c("black", "black"))

#Draw Venn diagram of the human proteins in amniotic fluid and fecal samples and store it in a variable
x_human <- list(
  "AM EVs" = as.character(AM_human_proteins_all),
  "FE EVs" = as.character(FE_human_proteins_all)
)
venn_human <- ggVennDiagram(x_human, set_size = 5, label_size = 5, label_alpha = 0) +
  scale_fill_gradient(low = "pink1", high = "pink4") +
  scale_color_manual(values = c("black", "black"))

#Draw overlap taxonomy column plot to a variable
#Horizontal bars, one per phylum, ordered by frequency
overlap_taxa_colblot <- ggplot(
  data = overlap_taxa_df,
  aes(x = reorder(overlap_phylum, overlap_phylum_freq), y = overlap_phylum_freq)
) +
  geom_col(fill = "grey") +
  coord_flip() +
  ylab("number of hits") +
  theme(
    panel.background = element_blank(),
    plot.margin = unit(c(0.5, 0, 0, 1.5), "cm"),
    axis.title.x = element_text(size = 11),
    axis.title.y = element_blank(),
    axis.text.x = element_text(size = 11, face = "bold"),
    axis.text.y = element_text(size = 11, face = "bold")
  )

#Draw overlap GO biological process column plot to a variable
#Horizontal bars, one per GO term, ordered by count
overlap_GO_BP_colplot <- ggplot(
  data = overlap_BP_hierarchy_counts_table,
  aes(x = reorder(overlap_uniq_BP, overlap_GOs_BP), y = overlap_GOs_BP)
) +
  geom_col(fill = "grey") +
  coord_flip() +
  ylab("number of hits") +
  theme(
    panel.background = element_blank(),
    plot.margin = unit(c(0.5, 0, 0, 0), "cm"),
    axis.title.x = element_text(size = 11),
    axis.title.y = element_blank(),
    axis.text.x = element_text(size = 11, face = "bold"),
    axis.text.y = element_text(size = 11, face = "bold")
  )

#Draw overlap GO molecular function column plot to a variable
#Horizontal bars, one per GO term, ordered by count
overlap_GO_MF_colplot <- ggplot(
  data = overlap_MF_hierarchy_counts_table,
  aes(x = reorder(overlap_uniq_MF, overlap_GOs_MF), y = overlap_GOs_MF)
) +
  geom_col(fill = "grey") +
  coord_flip() +
  ylab("number of hits") +
  theme(
    panel.background = element_blank(),
    plot.margin = unit(c(0.5, 0, 0, 0), "cm"),
    axis.title.x = element_text(size = 11),
    axis.title.y = element_blank(),
    axis.text.x = element_text(size = 11, face = "bold"),
    axis.text.y = element_text(size = 11, face = "bold")
  )

#Draw multipanel figure with cowplot with the figures above
#Venn diagrams (A, B) on the top row; taxonomy and GO column plots (C, D, E) on the bottom row
top_row <- plot_grid(venn_bac, venn_human, labels = c("A", "B"), label_size = 20, ncol = 2)
bottom_row <- plot_grid(overlap_taxa_colblot, overlap_GO_BP_colplot, overlap_GO_MF_colplot,
                        labels = c("C", "D", "E"), label_size = 20, ncol = 3, rel_widths = c(0.8,1.3,1.1))
tiff(file="your_output_folder\\overlap_multipanel.tiff",
     res = 300, width = 13, height = 5, units = 'in')
#Print explicitly: a plot object is only drawn automatically when evaluated at the interactive top level,
#so without print() the tiff stays empty when the script is run through source()
print(plot_grid(top_row, bottom_row, ncol = 1))
dev.off()