#code for Fig. 6, stoichiometry calculation - commented
##calculating phosphorylation site stoichiometry information with a 3DMM using R
##this script is part of the publication "Benchmarking LFQ, SILAC and MS2/MS3-based TMT quantification strategies for large-scale phosphoproteomics"
##version of the script: 2017-08-09
##this script was tested with R version 3.4.0 and data.table version 1.10.4

#this script describes how to calculate stoichiometry with the yeast/HeLa example dataset from the publication, which can be downloaded from the PRIDE repository
#the project accession number is PXD007145



#1) Data preparation
#####
#this script works with the data.table package
#the package is installed only if it is not available yet, so re-running the script does not download it again
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}
library(data.table)

#we next load the "Supplementary Data 2_Figure6_modificationSpecificPeptides.txt" from Supplementary File 2 or from the file "Figure6_occupancy_benchmark.rar" on PRIDE
#this path needs to be adjusted to the correct location of the file
#"NaN" entries are read as NA; the integer64 argument is spelled out in full instead of relying on partial matching of "integer"
TMT_Occ_mod <- fread(
  "C:/Supplementary Data 2_Figure6_modificationSpecificPeptides.txt",
  sep = "\t",
  na.strings = "NaN",
  blank.lines.skip = FALSE,
  header = TRUE,
  verbose = TRUE,
  integer64 = "double"
)

#reporter intensities of exactly 0 are replaced by NA in every "Reporter intensity" column
for (col in grep("Reporter intensity", names(TMT_Occ_mod), value = TRUE)) {
  #which() drops NA comparisons, so only rows holding a real 0 are overwritten
  set(TMT_Occ_mod, i = which(TMT_Occ_mod[[col]] == 0), j = col, value = NA)
}
rm(col)

#changing names of the TMT reporter intensity columns
#in this specific example, we measured the same TMT10-plex sample 3 times each with MS2 and MS3
#letters A, B, C, D and E refer to channels with 10%, 20%, 50%, 80% and 90% phosphorylation site stoichiometry, respectively
#each stoichiometry exists in duplicates within the TMT10-plex setup
#reporter channels 0-4 become A1-E1 and channels 5-9 become A2-E2, e.g.
#"Reporter intensity corrected 0 MS2_1" -> "MS2_1_phos_A1" and "Reporter intensity corrected 9 MS3_3" -> "MS3_3_phos_E2"
#the old/new name pairs are generated instead of typed out; local() keeps the helper vectors out of the workspace,
#while setnames() still renames the columns of TMT_Occ_mod by reference
local({
  runs <- c("MS2_1", "MS2_2", "MS2_3", "MS3_1", "MS3_2", "MS3_3")
  #channel labels in reporter order 0..9: A1 B1 C1 D1 E1 A2 B2 C2 D2 E2
  channels <- paste0(rep(LETTERS[1:5], times = 2), rep(1:2, each = 5))
  old_names <- paste0("Reporter intensity corrected ",
                      rep(0:9, times = length(runs)), " ",
                      rep(runs, each = length(channels)))
  new_names <- paste0(rep(runs, each = length(channels)), "_phos_",
                      rep(channels, times = length(runs)))
  #setnames() stops with an error if any of the expected reporter columns is missing
  setnames(TMT_Occ_mod, old_names, new_names)
})

#remove reverse hits of the target/decoy database search
TMT_Occ_mod <- TMT_Occ_mod[!Reverse == "+"]

#to differentiate between yeast and HeLa peptides, we need to map the species origin onto each peptide
#for this purpose, we now need to load a file containing protein identifiers and their respective species
#this file is included in the supplementary data 2 of the manuscript
#this path needs to be adjusted to the correct location of the file
#the integer64 argument is spelled out in full instead of relying on partial matching of "integer"
YHident <- fread(
  "C:/Supplementary Data 2_Figure6_HumanAndYeastProtIdentifiers.txt",
  sep = "\t",
  na.strings = "NaN",
  blank.lines.skip = FALSE,
  header = TRUE,
  verbose = TRUE,
  integer64 = "double"
)

#we match the yeast/HeLa data table onto our peptide identifications
#both tables are keyed on their protein identifier column; nomatch = 0L keeps only rows present in both tables
setkey(YHident, `Protein`)
setkey(TMT_Occ_mod, Proteins)
TMT_Occ_mod <- TMT_Occ_mod[YHident, nomatch = 0L]

#for the purpose of our stoichiometry benchmark approach in the manuscript, we were not able to measure protein intensities
#these are thus here set equal to 1, later yielding ratios of 1, which is as expected in most phosphoproteomics experiments
#in an actual non-benchmark data set, protein intensities should be read from the "proteinGroups.txt" file instead
#one "...prot_<channel>" column is created for every existing "...phos_<channel>" column
phos_cols <- grep("(phos_[A-E][1,2]$)", colnames(TMT_Occ_mod), value = TRUE, perl = TRUE)
prot_cols <- gsub("phos(_[A-E][1,2]$)", "prot\\1", phos_cols, perl = TRUE)
TMT_Occ_mod[, (prot_cols) := 1]
rm(phos_cols, prot_cols)

#tags including all detected PTMs with/without phosphorylation are created
#these will be used to match phosphopeptides with their respective non-phosphorylated peptide versions
#Prottag = sequence + acetylation + oxidation counts; Phostag = Prottag + phosphorylation count
TMT_Occ_mod[, c("Prottag", "Phostag") := {
  prot_tag <- paste0(Sequence, "_A", `Acetyl (Protein N-term)`, "_O", `Oxidation (M)`)
  list(prot_tag, paste0(prot_tag, "_P", `Phospho (STY)`))
}]

#in this experiment, both phosphorylated and non-phosphorylated peptides have been measured in the same sample
#the intensities of non-phosphorylated peptides (rows with a phosphorylation count of 0) are now copied into new "...non_<channel>" columns
#in a data set with phospho- and non-phospho-peptides in different experimental groups, this step is not necessary
phos_cols <- grep("(phos_[A-E][1,2]$)", colnames(TMT_Occ_mod), value = TRUE, perl = TRUE)
non_cols <- gsub("phos(_[A-E][1,2]$)", "non\\1", phos_cols, perl = TRUE)
TMT_Occ_mod[`Phospho (STY)` == 0, (non_cols) := .SD, .SDcols = phos_cols]
rm(phos_cols, non_cols)

#the tags are used to aggregate non-phosphorylated peptides with their respective phosphorylated counterparts
#the table is keyed (sorted) by Phostag first; within each Prottag group, the "...non_<channel>" values of the
#first row are then written to "...mapped_<channel>" for every row of that group
setkey(TMT_Occ_mod, Phostag)
non_cols <- grep("(non_[A-E][1,2]$)", names(TMT_Occ_mod), value = TRUE)
mapped_cols <- gsub("non(_[A-E][1,2]$)", "mapped\\1", non_cols, perl = TRUE)
TMT_Occ_mod[, (mapped_cols) := lapply(.SD, function(v) v[1L]), by = Prottag, .SDcols = non_cols]
rm(non_cols, mapped_cols)

#all rows containing non-phosphorylated peptides are now deleted
#only phosphorylated peptides whose Species entry is "yeast" are kept
TMT_Occ_mod <- TMT_Occ_mod[`Phospho (STY)` > 0 & Species == "yeast"]
#####





#2) 3DMM phosphorylation site stoichiometry calculation
#####
#we demonstrate the calculation of phosphorylation site stoichiometry with the data prepared in step 1
#but it can of course be done with any data, as long as each (TMT-)sample contains three sets of columns:
#1. one for the phosphorylated peptide intensities (here ending in "...phos_[A-E][1,2]" with A1-E2 representing the 10 TMT channels)
#2. one for the respective non-phosphorylated peptide intensities (here ending in "...mapped_[A-E][1,2]" with A1-E2 representing the 10 TMT channels)
#3. one for the respective protein intensities (here ending in "...prot_[A-E][1,2]" with A1-E2 representing the 10 TMT channels)
#the three columns belonging together should all have the exact same name before phos/mapped/prot
#the endings in this example for the 10 different channels are "[A-E][1,2]"; other endings only require changing channel_rx below
#the number of channels is taken from the matching columns, so nothing else has to be adjusted if more or less than 10 are used
#the code creates new columns "...3D_m" for the model slope, "...Occ_[A-E][1,2]" for the stoichiometries per channel,
#and "...3D_neglog10p" for the negative log10 p-value per model

#regular expression for the channel suffix of the intensity columns
channel_rx <- "_[A-E][1,2]"

#fits the linear model phos ~ mapped + prot for one peptide
#returns c(slope of the mapped term, -log10 p-value of the mapped term)
#the mapped term is looked up by name: terms that lm cannot estimate (e.g. a constant protein column) are
#left out of the summary table, so a fixed row number would not always point at the same term
#if the mapped term itself cannot be estimated, NA is returned for both values
fit_3dmm <- function(phos, mapped, prot) {
  model <- coefficients(summary(lm(phos ~ mapped + prot)))
  if (!("mapped" %in% rownames(model))) {
    return(c(NA_real_, NA_real_))
  }
  c(model["mapped", "Estimate"], -log10(model["mapped", "Pr(>|t|)"]))
}

#adds the "...3D_m", "...Occ_<channel>" and "...3D_neglog10p" columns for one sample prefix (e.g. "MS2_1_")
#dt: data.table holding the phos/mapped/prot columns; prefix: column name part before phos/mapped/prot
#the table is modified by reference and also returned, so the caller can re-assign it
add_3dmm_columns <- function(dt, prefix, channel_rx) {
  phos_rx <- paste0("phos(", channel_rx, ")")
  phos_cols <- grep(paste0(prefix, "phos", channel_rx), colnames(dt), value = TRUE, perl = TRUE)
  #companion columns are derived from the phos names, which pairs the channels by name rather than by position
  mapped_cols <- gsub(phos_rx, "mapped\\1", phos_cols, perl = TRUE)
  prot_cols <- gsub(phos_rx, "prot\\1", phos_cols, perl = TRUE)
  occ_cols <- gsub(phos_rx, "Occ\\1", phos_cols, perl = TRUE)

  missing_cols <- setdiff(c(mapped_cols, prot_cols), colnames(dt))
  if (length(missing_cols) > 0) {
    stop("missing intensity columns: ", paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  n_ch <- length(phos_cols)
  phos_idx <- seq_len(n_ch)
  mapped_idx <- n_ch + seq_len(n_ch)
  prot_idx <- 2L * n_ch + seq_len(n_ch)
  intens <- as.matrix(dt[, c(phos_cols, mapped_cols, prot_cols), with = FALSE])

  #the model is fitted once per peptide, and only for peptides without missing intensities
  usable <- complete.cases(intens)
  fits <- vapply(which(usable), function(i) {
    fit_3dmm(intens[i, phos_idx], intens[i, mapped_idx], intens[i, prot_idx])
  }, numeric(2))

  m_vals <- rep(NA_real_, nrow(dt))
  p_vals <- rep(NA_real_, nrow(dt))
  m_vals[usable] <- fits[1, ]
  p_vals[usable] <- fits[2, ]

  m_col <- paste0(prefix, "3D_m")
  p_col <- paste0(prefix, "3D_neglog10p")
  dt[, (m_col) := m_vals]

  #stoichiometry per channel: with a = phos / (mapped * m), Occ = -a / (1 - a)
  #m_vals has one entry per row and is recycled down each channel column of the matrix
  ratio <- intens[, phos_idx, drop = FALSE] / (intens[, mapped_idx, drop = FALSE] * m_vals)
  dt[, (occ_cols) := as.data.table(-ratio / (1 - ratio))]

  dt[, (p_col) := p_vals]
  invisible(dt)
}

for (colt in unique(gsub(paste0("(.*)phos(", channel_rx, ")"), "\\1",
                         grep(paste0("phos", channel_rx), colnames(TMT_Occ_mod), value = TRUE, perl = TRUE),
                         perl = TRUE))) {
  TMT_Occ_mod <- add_3dmm_columns(TMT_Occ_mod, colt, channel_rx)
}
rm(colt)

#for data visualization, we load all stoichiometry into a new data table, sorted by target stoichiometry
#we then delete "illegal" stoichiometry outside the boundary 0<=x<=1

#stacks the two intra-TMT replicates (Occ_1 = "..._Occ_<letter>1", Occ_2 = "..._Occ_<letter>2") of every given run
#dt: table holding the "..._Occ_..." and "..._3D_neglog10p" columns; runs: run prefixes such as "MS2_1"
#returns one table with columns Occ_1, Occ_2, neglog10p and Occupancy (target stoichiometry as text),
#restricted to complete rows whose two stoichiometries both lie within 0<=x<=1
stack_replicate_occupancy <- function(dt, runs) {
  #target stoichiometry per channel letter
  target_occ <- c(A = "0.1", B = "0.2", C = "0.5", D = "0.8", E = "0.9")
  pieces <- list()
  for (run in runs) {
    for (letter in names(target_occ)) {
      wanted <- c(paste0(run, "_Occ_", letter, 1:2), paste0(run, "_3D_neglog10p"))
      #selecting by name stops with an error if one of the expected columns is missing
      piece <- dt[, wanted, with = FALSE]
      setnames(piece, c("Occ_1", "Occ_2", "neglog10p"))
      piece[, Occupancy := target_occ[[letter]]]
      pieces[[length(pieces) + 1L]] <- piece
    }
  }
  stacked <- do.call(rbind, pieces)
  stacked[complete.cases(stacked) & Occ_1 >= 0 & Occ_1 <= 1 & Occ_2 >= 0 & Occ_2 <= 1]
}

TMT_Occ_mod_col2 <- stack_replicate_occupancy(TMT_Occ_mod, c("MS2_1", "MS2_2", "MS2_3"))
TMT_Occ_mod_col3 <- stack_replicate_occupancy(TMT_Occ_mod, c("MS3_1", "MS3_2", "MS3_3"))
rm(stack_replicate_occupancy)

#we can now plot our stoichiometry according to Fig. 5d
#for this, we need to have ggplot2 installed; it is installed only if it is not available yet
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

#3DMM negative log10 p-value cutoff used for plotting between the replicates
#only models with neglog10p above this value are shown; change the value to adjust the cutoff
neglog10p_cutoff <- 7

#MS2: intra-TMT replicate 1 against replicate 2, coloured by target stoichiometry
ggplot(TMT_Occ_mod_col2[neglog10p > neglog10p_cutoff], aes(x = Occ_1, y = Occ_2, group = Occupancy)) +
  geom_point(aes(color = Occupancy)) +
  theme(legend.position = "bottom") +
  scale_x_continuous(name = "MS2 Intra-TMT replicate 1", breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1), limits = c(0, 1)) +
  scale_y_continuous(name = "MS2 Intra-TMT replicate 2", breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1), limits = c(0, 1))

#MS3: intra-TMT replicate 1 against replicate 2, coloured by target stoichiometry
ggplot(TMT_Occ_mod_col3[neglog10p > neglog10p_cutoff], aes(x = Occ_1, y = Occ_2, group = Occupancy)) +
  geom_point(aes(color = Occupancy)) +
  theme(legend.position = "bottom") +
  scale_x_continuous(name = "MS3 Intra-TMT replicate 1", breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1), limits = c(0, 1)) +
  scale_y_continuous(name = "MS3 Intra-TMT replicate 2", breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1), limits = c(0, 1))
#####