#code for Fig. 6, stoichiometry calculation - commented
##calculating phosphorylation site stoichiometry information with a 3DMM using R
##this script is part of the publication "Benchmarking LFQ, SILAC and MS2/MS3-based TMT quantification strategies for large-scale phosphoproteomics"
##version of the script: 2017-08-09
##this script was tested with R version 3.4.0 and data.table version 1.10.4

#this script describes how to calculate stoichiometry with the yeast/HeLa example dataset from the publication, which can be downloaded from the PRIDE repository
#the project accession number is PXD007145

#0) Packages, input files and helpers
#####

#this script works with the data.table and ggplot2 packages, which are only installed if they are not available yet
for (pkg in c("data.table", "ggplot2")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
rm(pkg)
library(data.table)
library(ggplot2)

#these paths need to be adjusted to the correct location of the files
#"Supplementary Data 2_Figure6_modificationSpecificPeptides.txt" comes from Supplementary File 2 or from the file "Figure6_occupancy_benchmark.rar" on PRIDE
peptide_file <- "C:/Supplementary Data 2_Figure6_modificationSpecificPeptides.txt"
#the file containing protein identifiers and their respective species is included in the supplementary data 2 of the manuscript
species_file <- "C:/Supplementary Data 2_Figure6_HumanAndYeastProtIdentifiers.txt"

#in this specific example, we measured the same TMT10-plex sample 3 times each with MS2 and MS3
tmt_runs <- c("MS2_1", "MS2_2", "MS2_3", "MS3_1", "MS3_2", "MS3_3")
#letters A, B, C, D and E refer to channels with 10%, 20%, 50%, 80% and 90% phosphorylation site stoichiometry, respectively
#each stoichiometry exists in duplicates within the TMT10-plex setup; the vector is ordered by reporter channel 0-9
channel_labels <- c("A1", "B1", "C1", "D1", "E1", "A2", "B2", "C2", "D2", "E2")

#turns column names ending in "<from>_[A-E][12]" into the matching "<to>_[A-E][12]" names
#e.g. swap_quantity("MS2_1_phos_A1", "phos", "prot") gives "MS2_1_prot_A1"
swap_quantity <- function(cols, from, to) {
  sub(paste0(from, "(_[A-E][12])$"), paste0(to, "\\1"), cols)
}

#fits the 3DMM for one peptide: phos ~ mapped + prot across all channels
#returns c(slope of the non-phosphorylated term, -log10 p-value of that slope)
#the coefficient is looked up by name because summary.lm() omits aliased terms
#(e.g. the constant protein intensities of this benchmark), which shifts row positions
#if the non-phosphorylated term itself cannot be estimated, c(NA, NA) is returned
fit_3dmm <- function(phos, mapped, prot) {
  coefs <- coefficients(summary(lm(phos ~ mapped + prot)))
  if (!("mapped" %in% rownames(coefs))) {
    return(c(NA_real_, NA_real_))
  }
  #column 1 is the estimate, column 4 the p-value of the coefficient table
  c(coefs["mapped", 1L], -log10(coefs["mapped", 4L]))
}

#adds "<prefix>3D_m", "<prefix>Occ_[A-E][12]" and "<prefix>3D_neglog10p" to dt for one sample
#dt:     data.table holding "<prefix>phos_*", "<prefix>mapped_*" and "<prefix>prot_*" columns
#prefix: everything in the column names before phos/mapped/prot, e.g. "MS2_1_"
#returns dt; the caller should re-assign the result, as := may re-allocate the table
add_3dmm_stoichiometry <- function(dt, prefix) {
  all_cols <- colnames(dt)
  is_phos <- grepl("phos_[A-E][12]$", all_cols)
  phos_cols <- all_cols[is_phos & sub("phos_[A-E][12]$", "", all_cols) == prefix]
  mapped_cols <- swap_quantity(phos_cols, "phos", "mapped")
  prot_cols <- swap_quantity(phos_cols, "phos", "prot")
  occ_cols <- swap_quantity(phos_cols, "phos", "Occ")
  n_channels <- length(phos_cols)

  #the three blocks are selected explicitly in the order phos, mapped, prot,
  #so the model does not depend on where the columns happen to sit in the table
  model_cols <- c(phos_cols, mapped_cols, prot_cols)
  intensities <- as.matrix(dt[, model_cols, with = FALSE])
  idx_phos <- seq_len(n_channels)
  idx_mapped <- n_channels + seq_len(n_channels)
  idx_prot <- 2L * n_channels + seq_len(n_channels)

  #the model is only fitted for peptides quantified in every channel; all others keep NA
  m_values <- rep(NA_real_, nrow(dt))
  p_values <- rep(NA_real_, nrow(dt))
  fit_rows <- which(complete.cases(intensities))
  if (length(fit_rows) > 0L) {
    #one fit per peptide yields both the slope and its p-value
    fits <- vapply(fit_rows, function(i) {
      fit_3dmm(intensities[i, idx_phos], intensities[i, idx_mapped], intensities[i, idx_prot])
    }, numeric(2))
    m_values[fit_rows] <- fits[1L, ]
    p_values[fit_rows] <- fits[2L, ]
  }
  dt[, (paste0(prefix, "3D_m")) := m_values]

  #stoichiometry per channel: with r = phos / (mapped * m), Occ = -r / (1 - r)
  occ_values <- lapply(seq_len(n_channels), function(k) {
    ratio <- dt[[phos_cols[k]]] / (dt[[mapped_cols[k]]] * m_values)
    -ratio / (1 - ratio)
  })
  dt[, (occ_cols) := occ_values]

  dt[, (paste0(prefix, "3D_neglog10p")) := p_values]
  dt
}

#stacks the intra-TMT replicate stoichiometries of several samples into one long table
#columns: Occ_1 and Occ_2 (replicate channels 1 and 2), neglog10p (of the sample's model) and Occupancy (target stoichiometry)
#"illegal" stoichiometry outside the boundary 0<=x<=1 and incomplete rows are removed
collect_replicate_occupancy <- function(dt, run_prefixes) {
  targets <- c(A = "0.1", B = "0.2", C = "0.5", D = "0.8", E = "0.9")
  #the channel letter varies fastest, so rows are ordered by sample first and target stoichiometry second
  combos <- expand.grid(channel = names(targets), prefix = run_prefixes, stringsAsFactors = FALSE)
  pieces <- lapply(seq_len(nrow(combos)), function(i) {
    prefix <- combos$prefix[i]
    channel <- combos$channel[i]
    cols <- c(paste0(prefix, "Occ_", channel, 1:2), paste0(prefix, "3D_neglog10p"))
    part <- dt[, cols, with = FALSE]
    setnames(part, c("Occ_1", "Occ_2", "neglog10p"))
    part[, Occupancy := targets[[channel]]]
    part
  })
  occ <- rbindlist(pieces)
  occ[complete.cases(occ) & Occ_1 >= 0 & Occ_1 <= 1 & Occ_2 >= 0 & Occ_2 <= 1]
}

#scatter plot of replicate 1 against replicate 2, coloured by target stoichiometry
#min_neglog10p can be changed to any value, to adjust the 3DMM negative log10 p-value cutoff for plotting between the replicates
plot_replicate_occupancy <- function(occ, method, min_neglog10p = 7) {
  axis_breaks <- c(0, 0.2, 0.4, 0.6, 0.8, 1)
  ggplot(occ[neglog10p > min_neglog10p], aes(x = Occ_1, y = Occ_2, group = Occupancy)) +
    geom_point(aes(color = Occupancy)) +
    theme(legend.position = "bottom") +
    scale_x_continuous(name = paste(method, "Intra-TMT replicate 1"), breaks = axis_breaks, limits = c(0, 1)) +
    scale_y_continuous(name = paste(method, "Intra-TMT replicate 2"), breaks = axis_breaks, limits = c(0, 1))
}

#1) Data preparation
#####

#we load the modification-specific peptide table
#integer64 is spelled out in full; the original "integer" only worked through partial argument matching
TMT_Occ_mod <- fread(peptide_file, sep = "\t", na.strings = "NaN", blank.lines.skip = FALSE,
                     header = TRUE, verbose = TRUE, integer64 = "double")

#changing 0 for NA values in TMT reporter intensity columns
for (intensity_col in grep("Reporter intensity", names(TMT_Occ_mod), value = TRUE)) {
  TMT_Occ_mod[0 == get(intensity_col), (intensity_col) := NA]
}
rm(intensity_col)

#changing names of the TMT reporter intensity columns
#"Reporter intensity corrected <0-9> <run>" becomes "<run>_phos_<A1-E2>"
for (run in tmt_runs) {
  setnames(TMT_Occ_mod,
           paste0("Reporter intensity corrected ", 0:9, " ", run),
           paste0(run, "_phos_", channel_labels))
}
rm(run)

#remove reverse hits of the target/decoy database search
#%in% never returns NA, so rows are kept even if the Reverse column holds missing values
TMT_Occ_mod <- TMT_Occ_mod[!(Reverse %in% "+")]

#to differentiate between yeast and HeLa peptides, we need to map the species origin onto each peptide
#for this purpose, we now load the file containing protein identifiers and their respective species
YHident <- fread(species_file, sep = "\t", na.strings = "NaN", blank.lines.skip = FALSE,
                 header = TRUE, verbose = TRUE, integer64 = "double")

#we match the yeast/HeLa data table onto our peptide identifications
setkey(YHident, `Protein`)
setkey(TMT_Occ_mod, Proteins)
TMT_Occ_mod <- TMT_Occ_mod[YHident, nomatch = 0L]

#names of the intensity column families used below
phos_cols <- grep("phos_[A-E][12]$", colnames(TMT_Occ_mod), value = TRUE)
prot_cols <- swap_quantity(phos_cols, "phos", "prot")
non_cols <- swap_quantity(phos_cols, "phos", "non")
mapped_cols <- swap_quantity(phos_cols, "phos", "mapped")

#for the purpose of our stoichiometry benchmark approach in the manuscript, we were not able to measure protein intensities
#these are thus here set equal to 1, later yielding ratios of 1, which is as expected in most phosphoproteomics experiments
#in an actual non-benchmark data set, protein intensities should be read from the "proteinGroups.txt" file instead
TMT_Occ_mod[, (prot_cols) := 1]

#tags including all detected PTMs with/without phosphorylation are created
#these will be used to match phosphopeptides with their respective non-phosphorylated peptide versions
TMT_Occ_mod[, Prottag := paste0(Sequence, "_A", `Acetyl (Protein N-term)`, "_O", `Oxidation (M)`)]
TMT_Occ_mod[, Phostag := paste0(Prottag, "_P", `Phospho (STY)`)]

#in this experiment, both phosphorylated and non-phosphorylated peptides have been measured in the same sample
#non-phosphorylated peptides are now copied into new columns
#in a data set with phospho- and non-phospho-peptides in different experimental groups, this step is not necessary
TMT_Occ_mod[`Phospho (STY)` == 0, (non_cols) := .SD, .SDcols = phos_cols]

#the tags are used to aggregate non-phosphorylated peptides with their respective phosphorylated counterparts
#sorting by Phostag puts the "_P0" row first within each Prottag group, so the first row per group is the non-phosphorylated peptide (if present)
setkey(TMT_Occ_mod, Phostag)
TMT_Occ_mod[, (mapped_cols) := .SD[1L], .SDcols = non_cols, by = Prottag]

#all rows containing non-phosphorylated peptides are now deleted
TMT_Occ_mod <- TMT_Occ_mod[`Phospho (STY)` > 0]
TMT_Occ_mod <- TMT_Occ_mod[Species == "yeast"]
rm(phos_cols, prot_cols, non_cols, mapped_cols)

#####

#2) 3DMM phosphorylation site stoichiometry calculation
#####

#we demonstrate the calculation of phosphorylation site stoichiometry with the data prepared in step 1
#but it can of course be done with any data, as long as each (TMT-)sample contains three columns:
#1. one for the phosphorylated peptide intensities (here ending in "...phos_[A-E][12]" with A1-E2 representing the 10 TMT channels)
#2. one for the respective non-phosphorylated peptide intensities (here ending in "...mapped_[A-E][12]" with A1-E2 representing the 10 TMT channels)
#3. one for the respective protein intensities (here ending in "...prot_[A-E][12]" with A1-E2 representing the 10 TMT channels)
#the three columns belonging together should all have the exact same name before phos/mapped/prot
#the endings in this example for the 10 different channels are "[A-E][12]", but this can be adapted in the helpers above
#the number of channels is derived from the matching columns, so it does not need to be adjusted separately
#the code creates new columns "...Occ_[A-E][12]" for the stoichiometries per channel, and "...3D_neglog10p" for the negative log10 p-value per model
for (colt in unique(sub("phos_[A-E][12]$", "", grep("phos_[A-E][12]$", colnames(TMT_Occ_mod), value = TRUE)))) {
  TMT_Occ_mod <- add_3dmm_stoichiometry(TMT_Occ_mod, colt)
}
rm(colt)

#for data visualization, we load all stoichiometry into a new data table, sorted by target stoichiometry
#we then delete "illegal" stoichiometry outside the boundary 0<=x<=1
TMT_Occ_mod_col2 <- collect_replicate_occupancy(TMT_Occ_mod, c("MS2_1_", "MS2_2_", "MS2_3_"))
TMT_Occ_mod_col3 <- collect_replicate_occupancy(TMT_Occ_mod, c("MS3_1_", "MS3_2_", "MS3_3_"))

#we can now plot our stoichiometry according to Fig. 5d
#the plots are printed explicitly so that they also appear when the script is run via source()
print(plot_replicate_occupancy(TMT_Occ_mod_col2, "MS2", min_neglog10p = 7))
print(plot_replicate_occupancy(TMT_Occ_mod_col3, "MS3", min_neglog10p = 7))

#####