# install.packages('doSNOW')
# install.packages('ggplot2')
# install.packages('directlabels')
# install.packages('gridExtra')
# install.packages('hexbin')
# install.packages('Hmisc')
# install.packages('gplots')
# install.packages('GMD')
# install.packages('FactoMineR')
# install.packages('reshape')
# install.packages('svDialogs')
# install.packages('stringr')

# NOTE: load order matters for masking (e.g. plyr is attached after Hmisc).
library(ggplot2)
library(directlabels)
library(gridExtra)
library(hexbin)
library(Hmisc)
library(gplots)
library(RColorBrewer)
library(GMD)
library(FactoMineR)
library(parallel)
library(foreach)
library(doParallel)
library(plyr)
library(reshape)
library(doSNOW)
library(svDialogs)
library(stringr)

#####################################################################################
####                         User-defined Functions                              ####
#####################################################################################

## Filters out psms with "Rejected" in the PSM Ambiguity column.
## Rows whose ambiguity is NA are dropped; plain logical indexing would turn
## them into all-NA rows instead.
ambfilter <- function(x) {
  x[which(!(x$PSM.Ambiguity == "Rejected")), , drop = FALSE]
}

## Averages the psm quantification from each TMT label.
##   inDF: data frame of PSMs
##   tmts: character vector naming the TMT columns to average
## Returns inDF with an extra AvgSN column.
addAvgSN <- function(inDF, tmts) {
  cbind(inDF, AvgSN = rowMeans(inDF[, tmts, drop = FALSE]))
}

## Filters out psms with an average Signal:Noise < 10.
## Rows with an NA AvgSN are dropped rather than kept as all-NA rows.
Snfilter <- function(x) {
  x[which(!(x$AvgSN < 10)), , drop = FALSE]
}

## Filters out psms with an isolation interference > 25.
## Rows with an NA interference are dropped rather than kept as all-NA rows.
specfilter <- function(x) {
  x[which(!(x$Isolation.Interference.... > 25)), , drop = FALSE]
}

## Extract the columns we need: "Protein" plus the columns named in tmts.
extract <- function(x, tmts) {
  x[, c("Protein", tmts), drop = FALSE]
}

## Sum the values for each unique protein ID.
## Expects all ten TMT columns (X126 ... X131) to be present in x; the summed
## columns are returned with a ".1" suffix.
## plyr is referenced explicitly because Hmisc also exports summarize().
sumvals <- function(x) {
  plyr::ddply(
    x, "Protein", plyr::summarize,
    X126.1 = sum(X126),
    X127N.1 = sum(X127N), X127C.1 = sum(X127C),
    X128N.1 = sum(X128N), X128C.1 = sum(X128C),
    X129N.1 = sum(X129N), X129C.1 = sum(X129C),
    X130N.1 = sum(X130N), X130C.1 = sum(X130C),
    X131.1 = sum(X131)
  )
}

## Change column names to append the file they came from.
##   x: data frame with a "Protein" column and any of the summed TMT columns
##   y: identifier appended to each TMT column name, separated by "_"
## "Protein" becomes "ProteinID"; "X127N.1" becomes "X127n_<y>", and so on.
## Columns that are absent from x are skipped.
namexchange <- function(x, y) {
  new_prefix <- c(
    X126.1 = "X126",
    X127N.1 = "X127n", X127C.1 = "X127c",
    X128N.1 = "X128n", X128C.1 = "X128c",
    X129N.1 = "X129n", X129C.1 = "X129c",
    X130N.1 = "X130n", X130C.1 = "X130c",
    X131.1 = "X131"
  )
  colnames(x)[names(x) == "Protein"] <- "ProteinID"
  hit <- match(names(x), names(new_prefix))
  found <- !is.na(hit)
  if (any(found)) {
    colnames(x)[found] <- paste(unname(new_prefix[hit[found]]), y, sep = "_")
  }
  x
}

## Same contract as namexchange(): rename only the TMT columns that exist.
## The earlier exists("X126.1") guards looked for variables in the environment
## rather than columns of x, so no TMT column was ever renamed.
namexchange2 <- function(x, y) {
  namexchange(x, y)
}

## Extract the text a user typed into a dlgInput() dialog.
## If the reply contains at least two ":" characters, the text after the second
## one is returned (the format the original parsing expected); otherwise the
## reply is returned unchanged. Stops if the dialog was cancelled.
parse_dialog_text <- function(res) {
  if (length(res) == 0L) {
    stop("Dialog was cancelled; no input received.", call. = FALSE)
  }
  colon_pos <- gregexpr(":", res, fixed = TRUE)[[1]]
  if (length(colon_pos) >= 2L) {
    res <- substr(res, colon_pos[2] + 1L, nchar(res))
  }
  res
}

#####################################################################################
####                          Data Processing Code                               ####
#####################################################################################

#A) Set your working directory to the location of your input file
setwd("/Users/johnlapek/documents")

ldf <- list()           # creates a list for the data files
nonrejected <- list()   # creates a list for the non-rejected data
na.as.one <- list()     # creates a list when replacing the NA values of the quantifications with 1's for proper averaging
signal <- list()        # creates a list for data summing sn for all channels
Sn <- list()            # creates a list for data removing below SN threshold
spec <- list()          # creates a list for data removing below specificity threshold
spec2 <- list()
NAgone <- list()        # creates a list for data removing rows with NAs
extractA <- list()      # creates a list for dataframes with extracted columns
extractB <- list()      # creates a list for dataframes with extracted columns
values <- list()        # creates a list for final summed values
final <- list()         # creates a list for final values with named columns
filePD <- list()        # creates a list for the file from Proteome Discoverer
listcsv <- list()       # creates a list for separated files from the PD output
# List of potential TMT labels
tmtlablist <- list("X126", "X127N", "X127C", "X128N", "X128C",
                   "X129N", "X129C", "X130N", "X130C", "X131")
labelsperfile <- list() # list("X126.1", "X127n.1", "X127c.1", "X128n.1", "X128c.1", "X129n.1", "X129c.1", "X130n.1", "X130c.1", "X131.1")
temp <- list()

#B) Read in .csv Proteome Discoverer file
filePD[[1]] <- read.csv("Peptides_Gencode.csv")
#addinfo <- c(as.character(filePD[[1]]$Protein), filePD[[1]]$Coverage, as.character(filePD[[1]]$Description))
#addinfo <- matrix(addinfo ,ncol=3,byrow=FALSE)
#colnames(addinfo) <- c("ProteinID", "%_Coverage", "Description")
#addinfo <- unique(addinfo)

#C) Get user input on number of files put into PD
n <- as.numeric(parse_dialog_text(
  dlgInput(message = "How many MS runs did you perform? ", default = "")$res
))
if (is.na(n) || n < 1) {
  stop("The number of MS runs must be a positive number.", call. = FALSE)
}
n <- as.integer(n)

# Read in user defined file IDs
fileIDs <- character(n)
for (i in seq_len(n)) {
  fileIDs[i] <- parse_dialog_text(
    dlgInput(message = "Enter a unique string from each MS run: ", default = "")$res
  )
}

# Allow user to select TMT labels used for each file
for (i in seq_len(n)) {
  tmtlab <- dlgList(tmtlablist, multiple = TRUE)$res
  if (length(tmtlab) == 0L) {
    stop("No TMT labels selected for MS run ", i, ".", call. = FALSE)
  }
  labelsperfile[[i]] <- tmtlab
}

dir.create("CSVs/", showWarnings = FALSE)
setwd("CSVs/")

# Parse PD output into .csvs based on user-specified file number and IDs.
# Row names are deliberately written (write.csv default): they become an extra
# leading column on re-read, which the column offset used below relies on.
spectrum_file <- filePD[[1]][, "Spectrum.File"]
for (i in seq_len(n)) {
  run_rows <- grepl(fileIDs[i], spectrum_file)
  if (!any(run_rows)) {
    warning("No rows in Spectrum.File match '", fileIDs[i], "'.", call. = FALSE)
  }
  write.csv(filePD[[1]][run_rows, ], file = paste0(fileIDs[i], ".csv"))
}

#D) Read in parsed data
# Build the file list from fileIDs so that file k always pairs with
# labelsperfile[[k]]; listing the directory would sort alphabetically and pick
# up stale .csv files left by earlier runs.
listcsv <- paste0(fileIDs, ".csv")
first_quant_col <- 39  # first column in which NA values are replaced by 1
temp <- NULL
temp2 <- NULL
#peptides <- NULL
for (k in seq_along(listcsv)) {
  temp <- labelsperfile[[k]]
  temp2 <- paste0(temp, ".1")
  ldf[[k]] <- read.csv(listcsv[k])
  nonrejected[[k]] <- ambfilter(ldf[[k]])
  if (ncol(nonrejected[[k]]) < first_quant_col) {
    stop(listcsv[k], " has fewer than ", first_quant_col, " columns.",
         call. = FALSE)
  }
  quant_cols <- first_quant_col:ncol(nonrejected[[k]])
  nonrejected[[k]][, quant_cols][is.na(nonrejected[[k]][, quant_cols])] <- 1
  signal[[k]] <- addAvgSN(nonrejected[[k]], temp)
  Sn[[k]] <- Snfilter(signal[[k]])
  spec[[k]] <- specfilter(Sn[[k]])
  #peptides[[k]] = data.frame(table(unlist(spec[[k]][2])))
  extractA[[k]] <- extract(spec[[k]], unlist(tmtlablist))
  #colnames(peptides[[k]]) <- c(as.character("ProteinID"), paste("Peptides_",as.character(fileIDs[k]),sep=""))
  values[[k]] <- sumvals(extractA[[k]])
  extractB[[k]] <- extract(values[[k]], temp2)
  print(k)
  # Change column names to append file they came from
  final[[k]] <- namexchange(extractB[[k]], listcsv[k])
}

# Merge all data frames in list by the column ProteinID
finalvalues <- data.frame(Reduce(function(x, y) merge(x, y, all = TRUE), final))
#pepNos <- data.frame(Reduce(function(x,y) merge(x,y,all=TRUE), peptides))
#finalvalues <- finalvalues[2:nrow(finalvalues),]
#rm(NAgone, Sn, extractA, extractB, final, ldf, signal, spec, values)
write.table(finalvalues, 'nonnormalizeddataall.txt', sep = "\t",
            row.names = FALSE, col.names = TRUE)

# Column range of each file's channels in finalvalues (column 1 is ProteinID).
n_labels <- vapply(labelsperfile, length, integer(1))
col_end <- cumsum(n_labels) + 1L
col_start <- col_end - n_labels + 1L

# Populate average matrix: one column per file holding each protein's mean
# across that file's channels. drop = FALSE keeps single-label files working.
avgMat <- matrix(NA_real_, nrow = nrow(finalvalues), ncol = length(listcsv))
for (i in seq_along(listcsv)) {
  avgMat[, i] <- rowMeans(
    finalvalues[, col_start[i]:col_end[i], drop = FALSE]
  )
}
allMedian <- median(avgMat, na.rm = TRUE)

# Scale each file's channels by that file's per-protein average relative to
# the overall median of the averages.
avgnorm <- do.call(cbind, lapply(seq_along(listcsv), function(i) {
  file_cols <- finalvalues[, col_start[i]:col_end[i], drop = FALSE]
  file_cols / (avgMat[, i] / allMedian)
}))

# Scale every column so its median matches the median of the whole table.
datamedian <- median(unlist(avgnorm), na.rm = TRUE)
colmeds <- lapply(avgnorm, median, na.rm = TRUE)
finalnorm <- avgnorm
for (j in seq_along(avgnorm)) {
  finalnorm[[j]] <- avgnorm[[j]] / (as.numeric(colmeds[[j]]) / datamedian)
}
names(finalnorm) <- names(avgnorm)
finalnorm <- cbind(finalvalues["ProteinID"], finalnorm)
#finalnorm <- merge(finalnorm, pepNos, by = "ProteinID")
#finalnorm <- merge(finalnorm, addinfo, by = "ProteinID")
write.table(finalnorm, 'NormalizedDataAll.txt', sep = "\t",
            col.names = TRUE, row.names = FALSE)

# Keep only proteins quantified in every channel of every file
commonreps <- na.omit(finalnorm)
write.table(commonreps, 'NormalizedCommonReps.txt', sep = "\t",
            col.names = TRUE, row.names = FALSE)