# Load libraries ----
library(reshape2)
library(ggplot2)
library(dplyr)
library(seqinr)
library(corrplot)
library(Rtsne)
library(plotly)
library(pheatmap)
library(tidyr)
library(qvalue)
library(ptest)
library(RColorBrewer)
library(factoextra)
library(pracma)
# Project helpers used below (strsplit.extract, my.write.table, ...) are
# presumably defined in this sourced file -- they are not defined in this script.
source("/Users/tly/Documents/Current R Analyses/R library/TLfun.R")
options(stringsAsFactors = FALSE)

############# new dataset, all libraries, with repeat injections #############
protg.full_cc <- read.csv("../../../Full dataset/full_reprocessed_fdr/data.filtered.prots.lrp.csv")
protg.full_cc_mq <- read.delim("../../../Full dataset/full_all_libraries/proteinGroups.txt")

# Copy the Reverse / Potential.contaminant flags over by protein group id,
# then drop flagged groups and anything whose FASTA header mentions "contam".
mq_row <- match(protg.full_cc$Protein.group.IDs, protg.full_cc_mq$id)
protg.full_cc$Reverse <- protg.full_cc_mq$Reverse[mq_row]
protg.full_cc$Potential.contaminant <- protg.full_cc_mq$Potential.contaminant[mq_row]
rm(mq_row)
protg.full_cc <- subset(protg.full_cc, Reverse != "+" & Potential.contaminant != "+")
protg.full_cc <- protg.full_cc[!grepl("contam", protg.full_cc$Fasta.headers), ]

# identifier mapping and clean-up
# Base_Uniprot = first majority protein id, reduced to the accession between
# the pipes, with any isoform suffix (text after "-") removed.
protg.full_cc$Base_Uniprot <- strsplit.extract(protg.full_cc$Majority.protein.IDs, ";", 1)
protg.full_cc$Base_Uniprot <- gsub("..\\|(.+?)\\|.*", "\\1", protg.full_cc$Base_Uniprot)
protg.full_cc$Base_Uniprot <- strsplit.extract(protg.full_cc$Base_Uniprot, "-", 1)

# Fill empty gene names from the UniProt mapping table (space-separated gene
# names are converted to ";"-separated ones first).
gene.map.table <- read.delim("/Users/tly/Documents/Current R Analyses/R library/Map tables/uniprot-human-selectedcols.txt")
gene.map.table$genes <- gsub(" ", ";", gene.map.table$Gene.names)
missing_gene <- protg.full_cc$Gene.names == ""
protg.full_cc$Gene.names[missing_gene] <-
  gene.map.table$genes[match(protg.full_cc$Base_Uniprot[missing_gene], gene.map.table$Entry)]
rm(gene.map.table, missing_gene)

# mapping useful motifs
library(seqinr)

# create ref proteome
complete_proteome <- read.fasta(file = "../../../Human Ref Proteome _ALL_2017-10-23.fasta", seqtype = "AA")
# The accession is the second "|"-separated field of each FASTA entry name.
# seq_along() (not 1:length()) so an empty proteome does not iterate over c(1, 0).
uniprot_ids <- character(length(complete_proteome))
for (i in seq_along(complete_proteome)) {
  uniprot_ids[i] <- strsplit(getName(complete_proteome[i]), "|", fixed = TRUE)[[1]][2]
}
rm(i)

# One sequence string per protein group, in the row order of protg.full_cc.
protg.fasta <- complete_proteome[match(protg.full_cc$Base_Uniprot, uniprot_ids)]
protg.sequences <- character(length(protg.fasta))
for (i in seq_along(protg.fasta)) {
  protg.sequences[i] <- as.character(getSequence(protg.fasta[i], as.string = TRUE)[[1]])
}

# Slims
ken_slim_hits <- read.delim("../../../KEN_slim_search.txt")
Dbox_slim_hits <- read.delim("../../../dbox_slim_search.txt")
abba_slim_hits <- read.delim("../../../abba_slim_search.txt")

# Motif analysis
# Dbox / KEN are plain regex hits on the sequence; the *slim columns flag
# accessions listed in the corresponding search-result tables.
protg.full_cc$Dbox <- grepl("R..L", protg.sequences)
protg.full_cc$KEN <- grepl("KEN", protg.sequences)
protg.full_cc$KENslim <- protg.full_cc$Base_Uniprot %in% ken_slim_hits$ProteinAcc
protg.full_cc$Dboxslim <- protg.full_cc$Base_Uniprot %in% Dbox_slim_hits$ProteinAcc
protg.full_cc$abba_slim_hits <- protg.full_cc$Base_Uniprot %in% abba_slim_hits$ProteinAcc

# useful columns
protg.full_cc.id.cols <- c("Protein.group.IDs", "Majority.protein.IDs", "Fasta.headers", "Gene.names", "Base_Uniprot")
protg.full_cc.int.cols <- grep("sum.pep.expt", colnames(protg.full_cc), value = TRUE)
# NOTE(review): the character class lists 7 twice and omits 0; left untouched
# because "P10" still matches on its first digit -- confirm no "P0" columns exist.
protg.full_cc.int.facs.cols <- grep("P[1234567789]{1,2}", protg.full_cc.int.cols, value = TRUE)

# renaming columns and normalisation
# Everything below hard-codes 138 intensity columns: 1:69 are treated as the
# first injection and 70:138 as the repeat injection (suffix ".i2").
# Fail early instead of silently producing NA column names.
stopifnot(length(protg.full_cc.int.facs.cols) == 138)

protg.full_cc.int.facs.mycols <- paste0(
  "int.",
  gsub(".+?_(P[1234567890]{1,2}[abcd].{0,3}).sum.pep.expt", "\\1", protg.full_cc.int.facs.cols)
)
protg.full_cc.int.facs.mycols <- c(
  protg.full_cc.int.facs.mycols[1:69],
  paste0(protg.full_cc.int.facs.mycols[70:138], ".i2")
)
protg.full_cc[, protg.full_cc.int.facs.mycols] <- protg.full_cc[, protg.full_cc.int.facs.cols]

protg.full_cc.ppm.facs.mycols <- paste0(
  "ppm.",
  gsub(".+?_(P[1234567890]{1,2}[abcd][x]{0,1}).sum.pep.expt", "\\1", protg.full_cc.int.facs.cols)
)
protg.full_cc.ppm.facs.mycols <- c(
  protg.full_cc.ppm.facs.mycols[1:69],
  paste0(protg.full_cc.ppm.facs.mycols[70:138], ".i2")
)
# Per-sample normalisation: each column is rescaled to sum to 1e6 (ppm).
protg.full_cc[, protg.full_cc.ppm.facs.mycols] <- apply(
  protg.full_cc[, protg.full_cc.int.facs.cols], 2,
  function(x) x / sum(x, na.rm = TRUE) * 1E6
)

# "com." columns: mean of the two injections, paired by column position
# (column k with column k + 69) -- vectorised instead of a row-wise apply().
com.ppm.facs.mycols <- paste0("com.", protg.full_cc.ppm.facs.mycols[1:69])
ppm.mat <- as.matrix(protg.full_cc[, protg.full_cc.ppm.facs.mycols])
protg.full_cc[, com.ppm.facs.mycols] <-
  (ppm.mat[, 1:69, drop = FALSE] + ppm.mat[, 70:138, drop = FALSE]) / 2
rm(ppm.mat)

my.write.table(protg.full_cc[, c(protg.full_cc.id.cols, protg.full_cc.ppm.facs.mycols)], "Suppl_Table_1.txt")

###### DATA WRANGLING ###############
# Long format: one row per protein group x sample. `pop` is the number after
# "P" in the column name, `rep` is whatever follows that number.
protg.full_cc.long <- melt(
  protg.full_cc[, c(protg.full_cc.id.cols, protg.full_cc.ppm.facs.mycols)],
  id.vars = protg.full_cc.id.cols
)
protg.full_cc.long$pop <- as.numeric(gsub(".+?P([1234567890]{1,2}).*", "\\1", protg.full_cc.long$variable))
protg.full_cc.long$rep <- as.factor(gsub(".+?P([1234567890]{1,2})(.*)", "\\2", protg.full_cc.long$variable))

protg.full_cc.long.sum <- protg.full_cc.long %>%
  group_by(Protein.group.IDs, Base_Uniprot, Gene.names, pop) %>%
  summarise(totalint = sum(value))

# Quick visual check on CCNB1.
protg.full_cc.long.sum %>%
  subset(grepl("CCNB1", protg.full_cc.long.sum$Gene.names)) %>%
  ggplot(aes(x = pop, y = totalint)) +
  geom_point()

protg.full_cc.wide.sum <- spread(protg.full_cc.long.sum, pop, totalint)

######## USEFUL PLOTTING FUNCTIONS ###################

# Private helper: mean and standard error of `value` for the genes whose
# Gene.names match the regex `gene.name`, excluding population 17.
# `...` are the bare grouping columns passed on to group_by().
# The SE divides by the number of non-missing values; the previous n() also
# counted NAs that sd(na.rm = TRUE) had already dropped.
summarise_gene_profile <- function(gene.name, ...) {
  protg.full_cc.long %>%
    filter(grepl(gene.name, Gene.names)) %>%
    filter(pop != 17) %>%
    group_by(...) %>%
    summarise(
      mean_LFQ = mean(value, na.rm = TRUE),
      se_LFQ = sd(value, na.rm = TRUE) / sqrt(sum(!is.na(value)))
    )
}

# Private helper: divide mean and SE by a per-gene reference value, where
# `reference` is a summary function such as mean or max.
rescale_gene_profile <- function(profile, reference) {
  profile %>%
    group_by(Gene.names) %>%
    mutate(
      scaled_LFQ = mean_LFQ / reference(mean_LFQ, na.rm = TRUE),
      scaled_se = se_LFQ / reference(mean_LFQ, na.rm = TRUE)
    )
}

# Private helper: axis, theme and label components shared by all line graphs,
# returned as a list that can be added to a ggplot with `+`.
linegraph_finish <- function(text_size = 8, y_label = "Mean LFQ intensity") {
  list(
    scale_x_continuous(breaks = seq(0, 17, 1)),
    expand_limits(y = 0),
    theme_bw(),
    theme(
      text = element_text(size = text_size, family = "Helvetica"),
      axis.title = element_text(face = "bold")
    ),
    xlab("Population"),
    ylab(y_label)
  )
}

# Mean intensity per population with SE error bars, one line per gene.
# gene.name: regular expression matched against Gene.names.
plot_linegraph_sd <- function(gene.name) {
  summarise_gene_profile(gene.name, Gene.names, pop) %>%
    ggplot(aes(x = pop, y = mean_LFQ)) +
    geom_line(size = 1, aes(colour = Gene.names)) +
    geom_errorbar(
      aes(ymin = mean_LFQ - se_LFQ, ymax = mean_LFQ + se_LFQ),
      width = .2, position = position_dodge(.9), alpha = 0.8, size = 0.5
    ) +
    linegraph_finish()
}

# As plot_linegraph_sd(), but each gene is divided by its mean across
# populations.
plot_linegraph_scaled <- function(gene.name) {
  summarise_gene_profile(gene.name, Gene.names, pop) %>%
    rescale_gene_profile(mean) %>%
    ggplot(aes(x = pop, y = scaled_LFQ)) +
    geom_line(size = 1, aes(colour = Gene.names)) +
    geom_errorbar(
      aes(ymin = scaled_LFQ - scaled_se, ymax = scaled_LFQ + scaled_se),
      width = .2, position = position_dodge(.9), alpha = 0.8, size = 0.5
    ) +
    linegraph_finish()
}

# As plot_linegraph_sd(), but each gene is divided by its maximum across
# populations.
plot_linegraph_scaled_to_max <- function(gene.name) {
  summarise_gene_profile(gene.name, Gene.names, pop) %>%
    rescale_gene_profile(max) %>%
    ggplot(aes(x = pop, y = scaled_LFQ)) +
    geom_line(size = 1, aes(colour = Gene.names)) +
    geom_errorbar(
      aes(ymin = scaled_LFQ - scaled_se, ymax = scaled_LFQ + scaled_se),
      width = .2, position = position_dodge(.9), alpha = 0.8, size = 0.5
    ) +
    linegraph_finish()
}

# Variant of plot_linegraph_scaled_to_max() with fainter error bars, larger
# text and a "Normalised intensity" y label.
plot_linegraph_scaled_to_max_2 <- function(gene.name) {
  summarise_gene_profile(gene.name, Gene.names, pop) %>%
    rescale_gene_profile(max) %>%
    ggplot(aes(x = pop, y = scaled_LFQ)) +
    geom_line(size = 1, aes(colour = Gene.names)) +
    geom_errorbar(
      aes(ymin = scaled_LFQ - scaled_se, ymax = scaled_LFQ + scaled_se),
      width = .2, position = position_dodge(.9), alpha = 0.1, size = 0.5
    ) +
    linegraph_finish(text_size = 12, y_label = "Normalised intensity")
}

# Unscaled profile drawn separately for every level of `rep` (one facet each).
plot_linegraph_facet_rep <- function(gene.name) {
  summarise_gene_profile(gene.name, Gene.names, pop, rep) %>%
    ggplot(aes(x = pop, y = mean_LFQ)) +
    geom_line(size = 1, aes(colour = Gene.names)) +
    geom_errorbar(
      aes(ymin = mean_LFQ - se_LFQ, ymax = mean_LFQ + se_LFQ),
      width = .2, position = position_dodge(.9), alpha = 0.8, size = 0.25
    ) +
    facet_grid(. ~ rep) +
    linegraph_finish()
}

######### PLOTTING LINEGRAPHS OF EXAMPLE PROTEINS -- INCL CYCLINS ##############
# Each ggsave() writes the plot produced by the call directly above it.
plot_linegraph_sd("CCNB1")
ggsave("CycB_LFQ_primmus_lineplot.pdf", width = 4, height = 1)
plot_linegraph_sd("CCNA2")
ggsave("CycA_LFQ_primmus_lineplot.pdf", width = 4, height = 1)
plot_linegraph_sd("(CCNA2)|(CCNB1)|(^RRM2$)")
ggsave("CycA_cycB_RRM2_LFQ_primmus_lineplot.pdf", width = 4, height = 1)
plot_linegraph_scaled_to_max("(^DTL$)|(DLGAP5)|(^RRM2$)")
ggsave("DTL_DLGAP5_RRM2_LFQ_primmus_lineplot.pdf", width = 8, height = 2)
plot_linegraph_scaled_to_max_2("(CCNA2)|(CCNB1)|(SGOL2)|(AURKA$)")
ggsave("CCNA2_CCNB1_SGOL2_AURKA_LFQ_primmus_lineplot.pdf", width = 6, height = 4)
plot_linegraph_scaled_to_max("KIF20B|CCNB1")

######## PLOT TOTAL HISTONE INTENSITIES ##############
histone_ids <- read.delim("~/Documents/Edinburgh WTCCB/Science/p21 AB/p21_R/ProteomeRuler/Homo_Sapiens_Histone_UniprotIDs_19Mar6")
# Rows whose accession matches any histone entry (regex alternation of ids).
histones <- protg.full_cc[grep(paste(histone_ids$Entry, collapse = "|"), protg.full_cc$Base_Uniprot), ]
histones.total <- colSums(histones[, protg.full_cc.int.facs.mycols], na.rm = TRUE)
int.total <- colSums(protg.full_cc[, protg.full_cc.int.facs.mycols], na.rm = TRUE)

# Private helper: scatter of one value per intensity column against the
# population number, faceted by replicate letter. Population and replicate
# are parsed out of the "int.P<pop><rep>..." column labels.
plot_by_pop_and_rep <- function(values) {
  data.frame(labels = protg.full_cc.int.facs.mycols, values = values) %>%
    mutate(
      pop = as.numeric(gsub(
        "int.P([123456789][0123456789]{0,1})[abcd][x]{0,1}\\.{0,1}[i]{0,1}[2]{0,1}",
        "\\1", labels
      )),
      rep = gsub(
        "int.P([123456789][0123456789]{0,1})([abcd])[x]{0,1}\\.{0,1}[i]{0,1}[2]{0,1}",
        "\\2", labels
      )
    ) %>%
    ggplot(aes(x = pop, y = values)) +
    geom_point(aes(colour = pop)) +
    facet_grid(. ~ rep)
}

# Absolute histone signal, then histone signal as a fraction of total signal.
plot_by_pop_and_rep(histones.total)
plot_by_pop_and_rep(histones.total / int.total)

####### PERIODICITY TESTS #############
protg.full_cc.long.sum <- protg.full_cc.long %>%
  group_by(Protein.group.IDs, Base_Uniprot, Gene.names, pop) %>%
  summarise(totalint = sum(value))
protg.full_cc.sum.wide <- spread(protg.full_cc.long.sum, pop, totalint, sep = ".")
sumpop.cols <- grep("pop", colnames(protg.full_cc.sum.wide), value = TRUE)

# Column order for the pseudo-timecourse: P1..P16 for replicate a, then b, c,
# d (first injection), followed by the same for the repeat injection (.i2).
# outer() is column-major, so the population index varies fastest.
ordered.ppm.facs.mycols <- as.vector(outer(
  paste0("P", 1:16),
  c("a", "b", "c", "d", "a.i2", "b.i2", "c.i2", "d.i2"),
  paste0
))
ordered.ppm.facs.mycols <- paste0("ppm.", ordered.ppm.facs.mycols)
# Sanity check: prints any expected column that is missing from the table.
ordered.ppm.facs.mycols[!ordered.ppm.facs.mycols %in% colnames(protg.full_cc)]

temp_inj1 <- protg.full_cc[, ordered.ppm.facs.mycols[1:64]]
colnames(temp_inj1) <- 1:64
temp_inj2 <- protg.full_cc[, ordered.ppm.facs.mycols[65:128]]
colnames(temp_inj2) <- 65:128
pp.bothinj.ordered.df <- protg.full_cc[, ordered.ppm.facs.mycols]
colnames(pp.bothinj.ordered.df) <- 1:128
# Missing values are treated as zero intensity for the periodicity test.
temp_inj1[is.na(temp_inj1)] <- 0
temp_inj2[is.na(temp_inj2)] <- 0

# Periodicity test using the 'Fisher' method of ptest::ptestg.
#
# df: data frame containing only numeric values, one row per protein and one
#     column per timepoint, ordered from low to high.
# Returns a data frame with one row per row of `df` and the columns
# ptestg.out.pval and ptestg.out.freq; rows with no value above zero get NA
# in both.
#
# The outputs are sized from `df` itself (previously from the global
# protg.full_cc, which padded shorter inputs with zero "p-values"), and
# ptestg() is run once per row instead of twice.
my.ptestg <- function(df) {
  n_rows <- nrow(df)
  ptestg.out.pval <- rep(NA_real_, n_rows)
  ptestg.out.freq <- rep(NA_real_, n_rows)
  for (i in seq_len(n_rows)) {
    series <- unlist(df[i, ])
    if (any(series > 0)) {
      fit <- ptestg(series, method = "Fisher", multiple = FALSE)
      ptestg.out.pval[i] <- fit$pvalue
      ptestg.out.freq[i] <- fit$freq
    }
  }
  data.frame(ptestg.out.pval = ptestg.out.pval, ptestg.out.freq = ptestg.out.freq)
}

ptestg.out.inj1 <- my.ptestg(temp_inj1)
ptestg.out.inj2 <- my.ptestg(temp_inj2)

# Cap p-values at 1 before the q-value estimation.
ptestg.out.inj1$ptestg.out.pval[ptestg.out.inj1$ptestg.out.pval > 1] <- 1
ptestg.out.inj2$ptestg.out.pval[ptestg.out.inj2$ptestg.out.pval > 1] <- 1
ptestg.out.inj1$ptestg.qval.inj1 <- qvalue(ptestg.out.inj1$ptestg.out.pval)$qvalues
ptestg.out.inj2$ptestg.qval.inj2 <- qvalue(ptestg.out.inj2$ptestg.out.pval)$qvalues

protg.full_cc$ptestg.qval.inj1 <- ptestg.out.inj1$ptestg.qval.inj1
protg.full_cc$ptestg.qval.inj2 <- ptestg.out.inj2$ptestg.qval.inj2
protg.full_cc$ptestg.freq.inj1 <- ptestg.out.inj1$ptestg.out.freq
protg.full_cc$ptestg.freq.inj2 <- ptestg.out.inj2$ptestg.out.freq

# A protein counts as periodic when, in BOTH injections, q < 0.10 and the
# detected frequency is 0.0625 or 0.125 (1/16 or 1/8 cycles per sample; both
# are exactly representable, so == is safe here).
periodic.proteins.bothinj.indices <- which(
  ptestg.out.inj1$ptestg.qval.inj1 < 0.10 &
    ptestg.out.inj2$ptestg.qval.inj2 < 0.10 &
    (ptestg.out.inj1$ptestg.out.freq == 0.0625 | ptestg.out.inj1$ptestg.out.freq == 0.125) &
    (ptestg.out.inj2$ptestg.out.freq == 0.0625 | ptestg.out.inj2$ptestg.out.freq == 0.125)
)

ptestg.report.cols <- c(
  protg.full_cc.id.cols,
  "ptestg.qval.inj1", "ptestg.qval.inj2", "ptestg.freq.inj1", "ptestg.freq.inj2"
)
my.write.table(protg.full_cc[, ptestg.report.cols], "ptestg.out.txt")
my.write.table(
  protg.full_cc[periodic.proteins.bothinj.indices, ptestg.report.cols],
  "ptestg.out.periodic.proteins.bothinj.indices.txt"
)

##### CLUSTERING WITH AVERAGE PROFILE #########
# Mean profile per protein group and population. The SE divides by the number
# of non-missing values (n() would also count NAs dropped by sd(na.rm = TRUE)).
protg.full_cc.long.avg <- protg.full_cc.long %>%
  group_by(Protein.group.IDs, Base_Uniprot, Gene.names, pop) %>%
  summarise(
    mean_LFQ = mean(value, na.rm = TRUE),
    se_LFQ = sd(value, na.rm = TRUE) / sqrt(sum(!is.na(value)))
  )
protg.full_cc.wide.avg <- spread(
  protg.full_cc.long.avg[, c("Protein.group.IDs", "Base_Uniprot", "Gene.names", "pop", "mean_LFQ")],
  key = pop, value = mean_LFQ
)
# Keep only the periodic proteins; populations 1-16 form the heatmap matrix.
protg.full_cc.wide.avg.pp <- protg.full_cc.wide.avg[
  protg.full_cc.wide.avg$Protein.group.IDs %in%
    protg.full_cc[periodic.proteins.bothinj.indices, "Protein.group.IDs"],
]
protg.full_cc.wide.avg.pp.heatmap.mat <- as.matrix(protg.full_cc.wide.avg.pp[, as.character(c(1:16))])
# Missing mean intensities are treated as zero for clustering.
protg.full_cc.wide.avg.pp.heatmap.mat[is.na(protg.full_cc.wide.avg.pp.heatmap.mat)] <- 0

# Within-cluster sum of squares on row-scaled profiles, to pick a cluster count.
wssplot(t(apply(protg.full_cc.wide.avg.pp.heatmap.mat, 1, scale)), nc = 20, iter.max = 50)

# First pass: cluster rows only to obtain the dendrogram for cutree().
protg.full_cc.wide.avg.pp.heatmap <- pheatmap(
  protg.full_cc.wide.avg.pp.heatmap.mat,
  clustering_method = "ward.D2", scale = "row",
  cluster_cols = FALSE, cluster_rows = TRUE
)
n_clust <- 5
row_annots2 <- data.frame(
  cluster = as.factor(cutree(protg.full_cc.wide.avg.pp.heatmap$tree_row, k = n_clust))
)

# change cluster numbering...
# 1 to 1 #### I think this is now 2
# 3 in original pheatmap output changed to 2 ### I think this is now 1
# 5 to 3
# 4 to 4
# 2 to 5
# Mapping actually applied (cutree label -> final label):
# 1 -> 2, 2 -> 5, 3 -> 1, 4 -> 4, 5 -> 3. Every lookup uses the ORIGINAL
# labels, so the renames cannot chain into each other.
cluster.relabel <- c("1" = "2", "2" = "5", "3" = "1", "4" = "4", "5" = "3")
row_annots.final <- row_annots2
row_annots.final$cluster <- factor(
  unname(cluster.relabel[as.character(row_annots2$cluster)]),
  levels = levels(row_annots2$cluster)
)
rownames(row_annots.final) <- protg.full_cc.wide.avg.pp$Gene.names

ann_colors <- list(cluster = brewer.pal(name = "Set1", n = n_clust))
names(ann_colors$cluster) <- levels(
  as.factor(cutree(protg.full_cc.wide.avg.pp.heatmap$tree_row, k = n_clust))
)

# Second pass: same clustering, now with gene-name row labels and the
# relabelled cluster annotation.
rownames(protg.full_cc.wide.avg.pp.heatmap.mat) <- protg.full_cc.wide.avg.pp$Gene.names
protg.full_cc.wide.avg.pp.heatmap <- pheatmap(
  protg.full_cc.wide.avg.pp.heatmap.mat,
  clustering_method = "ward.D2", scale = "row",
  cluster_cols = FALSE, cluster_rows = TRUE,
  cutree_rows = n_clust, annotation_row = row_annots.final,
  annotation_colors = ann_colors, legend = TRUE, border_color = NA
)
save_pheatmap_pdf(protg.full_cc.wide.avg.pp.heatmap, "periodic.proteins.bothinj.heatmap.pdf")

protg.full_cc.wide.avg.pp$final.cluster <- row_annots.final$cluster
# final.order = position of each row in the dendrogram ordering.
protg.full_cc.wide.avg.pp$final.order[protg.full_cc.wide.avg.pp.heatmap$tree_row$order] <-
  seq_along(protg.full_cc.wide.avg.pp.heatmap$tree_row$order)
pp_indices <- match(protg.full_cc.wide.avg.pp$Protein.group.IDs, protg.full_cc$Protein.group.IDs)
protg.full_cc.wide.avg.pp$ptestg.qval.inj1 <- protg.full_cc$ptestg.qval.inj1[pp_indices]
protg.full_cc.wide.avg.pp$ptestg.qval.inj2 <- protg.full_cc$ptestg.qval.inj2[pp_indices]

# Mean (and sd) of the per-protein z-scored profile, per cluster and
# population; shared by both line-graph figures below.
cluster.profile.summary <- protg.full_cc.wide.avg.pp[, c("Protein.group.IDs", as.character(1:16), "final.cluster")] %>%
  gather(key = pop, value = value, -Protein.group.IDs, -final.cluster) %>%
  group_by(Protein.group.IDs, final.cluster) %>%
  mutate(scaledValue = scale(value)) %>%
  group_by(final.cluster, pop) %>%
  summarise(
    meanScaledValue = mean(scaledValue, na.rm = TRUE),
    sdScaledValue = sd(scaledValue, na.rm = TRUE)
  )

# One facet per cluster, ribbon = mean +/- 1.96 sd.
ggplot(cluster.profile.summary, aes(x = as.numeric(pop), y = meanScaledValue, group = final.cluster)) +
  geom_line() +
  geom_ribbon(
    aes(
      x = as.numeric(pop),
      ymin = meanScaledValue - (1.96 * sdScaledValue),
      ymax = meanScaledValue + (1.96 * sdScaledValue)
    ),
    alpha = 0.2
  ) +
  facet_grid(. ~ final.cluster) +
  theme_bw() +
  theme(
    text = element_text(size = 8, family = "Helvetica"),
    axis.title = element_text(face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  xlab("Population") +
  ylab("Scaled intensity")
ggsave("heatmap.linegraphs.pdf", height = 1.5, width = 3.5, useDingbats = FALSE)

# All clusters overlaid in one panel, coloured by cluster.
ggplot(cluster.profile.summary, aes(x = as.numeric(pop), y = meanScaledValue, group = final.cluster)) +
  geom_line(aes(colour = final.cluster), size = 2) +
  # geom_ribbon(aes(x = as.numeric(pop), ymin = meanScaledValue - (1.96*sdScaledValue), ymax = meanScaledValue + (1.96*sdScaledValue), colour = final.cluster, fill = final.cluster), alpha = 0.2) +
  theme_classic() +
  theme(
    text = element_text(size = 8, family = "Helvetica"),
    axis.title = element_text(face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  xlab("Population") +
  ylab("Scaled intensity")
ggsave("heatmap.linegraphs overlay.pdf", height = 3, width = 3.5, useDingbats = FALSE)

##### which proteins in clusters 3, 4 and 5 are known apc/c substrates? #####
apcc_substrate_list <- read.delim("../../../APCC_substrates_Davey.txt", header = FALSE)
apcc_substrate_list <- unique(apcc_substrate_list$V1)
apcc_substrate_list <- toupper(apcc_substrate_list)
# NOTE(review): the substrate names are matched as substrings of
# Majority.protein.IDs (not Gene.names) -- confirm this is the intended column.
protg.full_cc$apcc_gene <- grepl(paste0(apcc_substrate_list, collapse = "|"), protg.full_cc$Majority.protein.IDs)
table(protg.full_cc$apcc_gene)
protg.full_cc.wide.avg.pp$apcc_gene <- protg.full_cc$apcc_gene[pp_indices]
table(protg.full_cc.wide.avg.pp$apcc_gene)
table(protg.full_cc.wide.avg.pp$apcc_gene, protg.full_cc.wide.avg.pp$final.cluster)

###### which proteins have Dbox and KEN box? #######
motif.flag.cols <- c("KEN", "Dbox", "abba_slim_hits", "KENslim", "Dboxslim")
# Copy the motif flags onto the periodic-protein table (pp_indices maps its
# rows to rows of protg.full_cc by Protein.group.IDs).
for (flag.col in c("KEN", "Dbox", "KENslim", "Dboxslim", "abba_slim_hits")) {
  protg.full_cc.wide.avg.pp[[flag.col]] <- protg.full_cc[[flag.col]][pp_indices]
}
rm(flag.col)
my.write.table(protg.full_cc.wide.avg.pp, "protg.full_cc.wide.avg.pp.txt")

# Private helper: for the selected rows of `df`, return the number of TRUE
# and FALSE values of each motif flag, in the order KEN, Dbox,
# abba_slim_hits, KENslim, Dboxslim (positive count first).
count_motif_flags <- function(df, rows) {
  unlist(lapply(motif.flag.cols, function(flag.col) {
    flags <- df[[flag.col]][rows]
    c(sum(flags == TRUE), sum(flags == FALSE))
  }))
}

motifs_by_cluster_logical <- lapply(1:n_clust, function(x) {
  c(x, count_motif_flags(protg.full_cc.wide.avg.pp, protg.full_cc.wide.avg.pp$final.cluster == x))
})
motifs_by_cluster_logical.df <- do.call(rbind.data.frame, motifs_by_cluster_logical)
colnames(motifs_by_cluster_logical.df) <- c(
  "cluster", "KEN.pos", "KEN.neg", "Dbox.pos", "Dbox.neg", "abba.pos", "abba.neg",
  "KENslim.pos", "KENslim.neg", "Dboxslim.pos", "Dboxslim.neg"
)
my.write.table(motifs_by_cluster_logical.df, "motifs_by_cluster_logical.df.txt")

# Background counts for the non-periodic proteins. `pp_gene` was never
# created in this script, so `protg.full_cc$pp_gene` was NULL and every count
# below silently came out as 0. Define it here as "protein group is in the
# periodic-protein table" (guarded, in case the column already exists).
if (!"pp_gene" %in% colnames(protg.full_cc)) {
  protg.full_cc$pp_gene <- protg.full_cc$Protein.group.IDs %in%
    protg.full_cc.wide.avg.pp$Protein.group.IDs
}
export.to.clipboard(count_motif_flags(protg.full_cc, protg.full_cc$pp_gene == FALSE))

# Periodic proteins carrying all three SLiM-annotated motifs.
protg.full_cc.wide.avg.pp$Gene.names[which(
  protg.full_cc.wide.avg.pp$KENslim &
    protg.full_cc.wide.avg.pp$Dboxslim &
    protg.full_cc.wide.avg.pp$abba_slim_hits
)]
# Further motif-combination queries on the periodic proteins (printed).
protg.full_cc.wide.avg.pp$Gene.names[which(
  protg.full_cc.wide.avg.pp$KENslim & protg.full_cc.wide.avg.pp$Dboxslim
)]
protg.full_cc.wide.avg.pp$Gene.names[which(
  protg.full_cc.wide.avg.pp$abba_slim_hits &
    protg.full_cc.wide.avg.pp$Dboxslim &
    protg.full_cc.wide.avg.pp$final.cluster == 4
)]
protg.full_cc.wide.avg.pp$Gene.names[which(
  protg.full_cc.wide.avg.pp$Dboxslim & protg.full_cc.wide.avg.pp$final.cluster == 3
)]
protg.full_cc.wide.avg.pp$Gene.names[which(
  protg.full_cc.wide.avg.pp$abba_slim_hits & protg.full_cc.wide.avg.pp$KENslim
)]

####### linegraphs of example proteins across the pseudoperiodic timecourse ###########

# Private helper: line plot of the 128 ordered samples for every protein whose
# Gene.names match the regex `gene`. `mapping` is the aes() for the plot; the
# callers add labels (and optional extras) on top.
oneplot_base <- function(gene, mapping) {
  df <- pp.bothinj.ordered.df
  df$Gene.names <- protg.full_cc$Gene.names
  df[grep(gene, protg.full_cc$Gene.names), ] %>%
    gather("pop", "value", -Gene.names) %>%
    ggplot(mapping) +
    geom_line() +
    theme_bw() +
    theme(
      text = element_text(size = 18, family = "Helvetica"),
      axis.title = element_text(face = "bold")
    ) +
    scale_x_continuous(breaks = seq(0, 128, 8))
}

# Intensity across all 128 samples, one black line per matching gene.
plot_linegraph_oneplot <- function(gene) {
  oneplot_base(gene, aes(x = as.numeric(pop), y = value, group = Gene.names)) +
    xlab("Sample") +
    ylab("Intensity")
}

# As plot_linegraph_oneplot(), with the y axis forced to include zero.
plot_linegraph_oneplot_floor <- function(gene) {
  oneplot_base(gene, aes(x = as.numeric(pop), y = value, group = Gene.names)) +
    expand_limits(y = 0) +
    xlab("Sample") +
    ylab("Intensity")
}

# As plot_linegraph_oneplot(), with lines coloured by gene name.
plot_linegraph_oneplot_colour <- function(gene) {
  oneplot_base(gene, aes(x = as.numeric(pop), y = value, group = Gene.names, colour = Gene.names)) +
    xlab("Sample") +
    ylab("Intensity")
}

plot_linegraph_oneplot("^ATAD2$")
ggsave("Plots/ATAD2_linegraph_oneplot.pdf", width = 8, height = 5)
plot_linegraph_oneplot("^AURKA$")
ggsave("Plots/AURKA_linegraph_oneplot.pdf", width = 8, height = 5)
plot_linegraph_oneplot("^CCNA2$")
ggsave("Plots/CCNA2_linegraph_oneplot.pdf", width = 8, height = 5)
plot_linegraph_scaled_to_max("CCNA2|AURKA$|CCNB1")
ggsave("Plots/CCNA2_AURKA_CCNB1_linegraph.pdf", width = 5, height = 3)
plot_linegraph_oneplot_floor("HSP90AA1")
ggsave("HSP90AA1_linegraph_oneplot.pdf", width = 8, height = 5)

###### Missing Values in cell cycle dataset #######
# Number of zero intensities per protein across all samples.
table(rowSums(protg.full_cc[, protg.full_cc.int.facs.cols] == 0))
# doesn't make sense because some proteins will be true zeros in specific phases, such as cyclins
# at least 3 replicates in one population
colnames(protg.full_cc.long)

# Non-zero measurements per protein group and population, computed once and
# reused for every threshold below.
nonzero.per.pop <- protg.full_cc.long %>%
  subset(value > 0) %>%
  group_by(Protein.group.IDs, pop) %>%
  tally()

# NOTE(review): `n > 3` selects MORE than three non-zero measurements, while
# the original remark says "at least three" -- confirm which was meant.
nonzero.per.pop %>%
  subset(n > 3) %>%
  distinct(Protein.group.IDs) %>%
  nrow()
# 6,650 with at least three replicates in one population

# Protein groups with more than i non-zero measurements in at least one
# population, for i = 0..7. vapply() guarantees an integer vector.
n_pop_tally <- vapply(0:7, function(i) {
  nonzero.per.pop %>%
    subset(n > i) %>%
    distinct(Protein.group.IDs) %>%
    nrow()
}, integer(1))
export.to.clipboard(n_pop_tally)

###### PCA with average values ##########
set.seed(1)
#pp_ids <- periodic.proteins.bothinj.df$id
#pp_ids_nomarkers <- periodic.proteins.bothinj.df$id[-grep("CCNB1|CCNA2", periodic.proteins.bothinj.df$Gene.names)]

# Populations are the observations (rows), periodic proteins the variables.
pca.mat <- t(protg.full_cc.wide.avg.pp[, as.character(c(1:16))])
pca.mat[is.na(pca.mat)] <- 0
pca.out <- prcomp(pca.mat, center = TRUE, scale. = TRUE)
pca.out.df <- as.data.frame(pca.out$x)
pca.out.df$name <- rownames(pca.out.df)
pca.out.df$name <- factor(pca.out.df$name, c(1:16))
ggplot(pca.out.df, aes(x = PC1, y = PC2, label = name)) +
  geom_point(aes(colour = name), size = 1) +
  geom_text(hjust = 1.5, vjust = 0) +
  theme_bw() +
  theme(
    text = element_text(size = 12, family = "Helvetica", colour = "black"),
    legend.position = "none"
  )
ggsave("pca.with_pop16.pdf", height = 2.5, width = 3, useDingbats = FALSE)
ggplotly()
fviz_pca_ind(pca.out,
  col.ind = "cos2", # Color by the quality of representation
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # Avoid text overlapping
)
fviz_pca_var(pca.out,
  col.var = "contrib", # Color by contributions to the PC
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # Avoid text overlapping
)

# Same analysis including population 17.
pca.mat.17 <- t(protg.full_cc.wide.avg.pp[, as.character(c(1:17))])
pca.mat.17[is.na(pca.mat.17)] <- 0
pca.out.17 <- prcomp(pca.mat.17, center = TRUE, scale. = TRUE)
pca.out.df.17 <- as.data.frame(pca.out.17$x)
pca.out.df.17$name <- rownames(pca.out.df.17)
pca.out.df.17$name <- factor(pca.out.df.17$name, c(1:17))
ggplot(pca.out.df.17, aes(x = PC1, y = PC2, label = name)) +
  geom_point(aes(colour = name), size = 1) +
  geom_text(hjust = 1.5, vjust = 0) +
  theme_bw() +
  theme(
    text = element_text(size = 12, family = "Helvetica", colour = "black"),
    legend.position = "none"
  )
ggsave("pca.with_pop17.pdf", height = 2.5, width = 3, useDingbats = FALSE)
ggplotly()
# These two diagnostics previously re-plotted `pca.out` (the 16-population
# fit) -- a copy-paste slip; they now show the 17-population fit.
fviz_pca_ind(pca.out.17,
  col.ind = "cos2", # Color by the quality of representation
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # Avoid text overlapping
)
fviz_pca_var(pca.out.17,
  col.var = "contrib", # Color by contributions to the PC
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # Avoid text overlapping
)

# without cycA and cycB
# (overwrites pca.mat / pca.out / pca.out.df from the 16-population run)
pca.mat <- t(protg.full_cc.wide.avg.pp[
  !grepl("CCNB1|CCNA2", protg.full_cc.wide.avg.pp$Gene.names),
  as.character(c(1:17))
])
pca.mat[is.na(pca.mat)] <- 0
pca.out <- prcomp(pca.mat, center = TRUE, scale. = TRUE)
pca.out.df <- as.data.frame(pca.out$x)
pca.out.df$name <- rownames(pca.out.df)
pca.out.df$name <- factor(pca.out.df$name, c(1:17))
ggplot(pca.out.df, aes(x = PC1, y = PC2, label = name)) +
  geom_point(aes(colour = name), size = 1) +
  geom_text(hjust = 1.5, vjust = 0) +
  theme_bw() +
  theme(
    text = element_text(size = 12, family = "Helvetica", colour = "black"),
    legend.position = "none"
  )
ggsave("pca.with_pop17_without_CycA_CycB.pdf", height = 2.5, width = 3, useDingbats = FALSE)
ggplotly()

###### PCA with all samples #######
# Samples x periodic proteins matrix (rows are the ppm.* sample columns).
pca.protg_allsamples <- protg.full_cc.long %>%
  select(Protein.group.IDs, variable, value) %>%
  spread(variable, value)
pca.protg_allsamples.pp <- pca.protg_allsamples[
  match(protg.full_cc.wide.avg.pp$Protein.group.IDs, pca.protg_allsamples$Protein.group.IDs),
]
pca.protg_allsamples.pp.mat <- t(as.matrix(pca.protg_allsamples.pp[, -1]))
pca.protg_allsamples.pp.mat[is.na(pca.protg_allsamples.pp.mat)] <- 0
#pca.protg_allsamples.pp.mat <- pca.protg_allsamples.pp.mat[-c(grep("17", rownames(pca.protg_allsamples.pp.mat))),]
my.write.table(pca.protg_allsamples.pp.mat, "pca.protg_allsamples.pp.mat.txt")
pca.out.allsamples <- prcomp(pca.protg_allsamples.pp.mat, center = TRUE, scale. = TRUE)
pca.out.df.allsamples <- as.data.frame(pca.out.allsamples$x)
pca.out.df.allsamples$name <- rownames(pca.out.df.allsamples)
# Sample annotations parsed from the row names: populations 1-8 are labelled
# "interphase", everything else "mitosis"; inj flags the repeat injection.
pca.out.df.allsamples$phase <- "mitosis"
pca.out.df.allsamples$phase[grep("P[12345678][abcd]", pca.out.df.allsamples$name)] <- "interphase"
pca.out.df.allsamples$inj <- grepl(".i2", pca.out.df.allsamples$name)
pca.out.df.allsamples$pop <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", pca.out.df.allsamples$name)
pca.out.df.allsamples$biorep <- gsub("ppm.P[123456789][1234567890]{0,1}([abcd]).{0,1}.{0,1}.{0,1}.{0,1}", "\\1", pca.out.df.allsamples$name)
pca.out.df.allsamples$pop <- factor(pca.out.df.allsamples$pop, levels = c(1:17))
ggplot(pca.out.df.allsamples, aes(x = PC1, y = PC2, label = name)) +
  geom_point(aes(colour = pop, shape = phase), size = 1) +
  theme_bw() +
  theme(
    text = element_text(size = 12, family = "Helvetica", colour = "black"),
    legend.position = "none"
  )
ggsave("pca.allsamples.pdf", height = 2.5, width = 3, useDingbats = FALSE)
ggplotly()

######## classification ############
#install.packages("class")
library(class)

# Leave-one-replicate-out kNN: fit the PCA on three replicates, project the
# held-out replicate, and predict its population from PC1/PC2.
# Population 17 is excluded from both training and test samples.
train.reps <- "b|c|d"
test.reps <- "a"
class.pca.protg_allsamples.pp.mat <- pca.protg_allsamples.pp.mat[grepl(train.reps, row.names(pca.protg_allsamples.pp.mat)), ]
class.pca.protg_allsamples.pp.mat <- class.pca.protg_allsamples.pp.mat[!grepl("17", row.names(class.pca.protg_allsamples.pp.mat)), ]
class.pca.out.allsamples <- prcomp(class.pca.protg_allsamples.pp.mat, center = TRUE, scale. = TRUE)
class.pca.out.df.allsamples <- as.data.frame(class.pca.out.allsamples$x)
class.pca.out.df.allsamples$name <- rownames(class.pca.out.df.allsamples)
class.pca.out.df.allsamples$phase <- "mitosis"
class.pca.out.df.allsamples$phase[grep("P[12345678][abcd]", class.pca.out.df.allsamples$name)] <- "interphase"
class.pca.out.df.allsamples$inj <- grepl(".i2", class.pca.out.df.allsamples$name)
class.pca.out.df.allsamples$pop <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", class.pca.out.df.allsamples$name)
class.pca.out.df.allsamples$biorep <- gsub("ppm.P[123456789][1234567890]{0,1}([abcd]).{0,1}.{0,1}.{0,1}.{0,1}", "\\1", class.pca.out.df.allsamples$name)
ggplot(class.pca.out.df.allsamples, aes(x = PC1, y = PC2, label = name)) +
  geom_point(aes(colour = pop, shape = phase), size = 1) +
  theme_bw() +
  theme(
    text = element_text(size = 12, family = "Helvetica", colour = "black"),
    legend.position = "none"
  )

class.test.mat <- pca.protg_allsamples.pp.mat[grepl(test.reps, row.names(pca.protg_allsamples.pp.mat)), ]
class.test.mat <- class.test.mat[!grepl("17", row.names(class.test.mat)), ]
# NOTE(review): the test samples are standardised with their OWN column
# means/sds rather than the training centre/scale stored in the prcomp fit
# (cf. predict(class.pca.out.allsamples, newdata = ...)). Left unchanged so
# every fold is computed the same way -- confirm whether this is intended.
class.test.mat <- scale(class.test.mat)
test.pca.transformed.mat <- class.test.mat %*% class.pca.out.allsamples$rotation
knn.train <- class.pca.out.df.allsamples[, c("PC1", "PC2")]
knn.test <- test.pca.transformed.mat[, c("PC1", "PC2")]
knn.cl <- factor(as.character(class.pca.out.df.allsamples$pop), levels = 1:16)
knn.test.cl <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", row.names(knn.test))
set.seed(1)
knn.out <- knn(knn.train, knn.test, knn.cl, k = 6)
knn.out

# Predicted vs true population for the held-out replicate.
knn.eval.df <- data.frame(
  pred_pop = as.numeric(as.character(knn.out)),
  true_pop = as.numeric(as.character(knn.test.cl))
)
plot(knn.eval.df$pred_pop, knn.eval.df$true_pop)
knn.eval.df %>%
  lm(formula = true_pop ~ pred_pop) %>%
  summary()

# Circular error: populations are mapped onto a 16-step circle so that
# population 16 and population 1 count as neighbours; the signed angular
# difference is converted back to population units.
circ.df <- knn.eval.df
circ.df <- circ.df / 16 * 2 * pi
circ.err <- atan2(
  sin(circ.df$pred_pop - circ.df$true_pop),
  cos(circ.df$pred_pop - circ.df$true_pop)
) / 2 / pi * 16
mean(circ.err)
sd(circ.err)
nrow(circ.df)

knn.eval.df %>%
  ggplot(aes(x = true_pop, y = pred_pop)) +
  geom_point() +
  theme_bw() +
  xlab("Population") +
  ylab("kNN-Predicted Population") +
  scale_x_continuous(breaks = 1:16) +
  scale_y_continuous(breaks = 1:16) +
  theme(
    text = element_text(size = 12, family = "Helvetica", colour = "black"),
    legend.position = "none",
    panel.grid.minor = element_blank()
  )
ggsave("kNN_output_a_test.pdf", height = 2.5, width = 3, useDingbats = FALSE)

# classification with other permutations
# b test
set.seed(1)
train.reps <- "a|c|d"
test.reps <- "b"
class.pca.protg_allsamples.pp.mat <- pca.protg_allsamples.pp.mat[grepl(train.reps, row.names(pca.protg_allsamples.pp.mat)), ]
class.pca.protg_allsamples.pp.mat <- class.pca.protg_allsamples.pp.mat[!grepl("17", row.names(class.pca.protg_allsamples.pp.mat)), ]
class.pca.out.allsamples <- prcomp(class.pca.protg_allsamples.pp.mat, center = TRUE, scale. = TRUE)
class.pca.out.df.allsamples <- as.data.frame(class.pca.out.allsamples$x)
class.pca.out.df.allsamples$name <- rownames(class.pca.out.df.allsamples)
class.pca.out.df.allsamples$phase <- "mitosis"
class.pca.out.df.allsamples$phase[grep("P[12345678][abcd]", class.pca.out.df.allsamples$name)] <- "interphase"
class.pca.out.df.allsamples$inj <- grepl(".i2", class.pca.out.df.allsamples$name)
class.pca.out.df.allsamples$pop <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", class.pca.out.df.allsamples$name)
class.pca.out.df.allsamples$biorep <- gsub("ppm.P[123456789][1234567890]{0,1}([abcd]).{0,1}.{0,1}.{0,1}.{0,1}", "\\1", class.pca.out.df.allsamples$name)
ggplot(class.pca.out.df.allsamples, aes(x = PC1, y = PC2, label = name)) +
  geom_point(aes(colour = pop, shape = phase), size = 1) +
  theme_bw() +
  theme(
    text = element_text(size = 12, family = "Helvetica", colour = "black"),
    legend.position = "none"
  )

class.test.mat <- pca.protg_allsamples.pp.mat[grepl(test.reps, row.names(pca.protg_allsamples.pp.mat)), ]
class.test.mat <- class.test.mat[!grepl("17", row.names(class.test.mat)), ]
# NOTE(review): same self-scaling of the test set as in the "a" fold above.
class.test.mat <- scale(class.test.mat)
test.pca.transformed.mat <- class.test.mat %*% class.pca.out.allsamples$rotation
knn.train <- class.pca.out.df.allsamples[, c("PC1", "PC2")]
knn.test <- test.pca.transformed.mat[, c("PC1", "PC2")]
knn.cl <- factor(as.character(class.pca.out.df.allsamples$pop), levels = 1:16)
knn.test.cl <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", row.names(knn.test))
set.seed(1)
knn.out <- knn(knn.train, knn.test, knn.cl, k = 6)
knn.out

knn.eval.df <- data.frame(
  pred_pop = as.numeric(as.character(knn.out)),
  true_pop = as.numeric(as.character(knn.test.cl))
)
plot(knn.eval.df$pred_pop, knn.eval.df$true_pop)
knn.eval.df %>%
  lm(formula = true_pop ~ pred_pop) %>%
  summary()
circ.df <- knn.eval.df
# The next assignment is completed by the following source line, which lies
# outside this section; the dangling `<-` is intentional and must stay last.
circ.df <-
circ.df/16*2*pi mean(apply(circ.df, 1, function(x) {atan2(sin(x[1]-x[2]), cos(x[1]-x[2])) }) / 2 / pi * 16) sd(apply(circ.df, 1, function(x) {atan2(sin(x[1]-x[2]), cos(x[1]-x[2])) }) / 2 / pi * 16) nrow(circ.df) data.frame(pred_pop = as.numeric(as.character(knn.out)), true_pop = as.numeric(as.character(knn.test.cl))) %>% ggplot(aes(x = true_pop, y = pred_pop)) + geom_point() + theme_bw() + xlab("Population") + ylab("kNN-Predicted Population") + scale_x_continuous(breaks = 1:16) + scale_y_continuous(breaks = 1:16) + theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'), legend.position = "none", panel.grid.minor = element_blank()) ggsave("kNN_output_b_test.pdf", height = 2.5, width = 3, useDingbats = F) #c test set.seed(1) train.reps <- "a|b|d" test.reps <- "c" class.pca.protg_allsamples.pp.mat <- pca.protg_allsamples.pp.mat[grepl(train.reps, row.names(pca.protg_allsamples.pp.mat)),] class.pca.protg_allsamples.pp.mat <- class.pca.protg_allsamples.pp.mat[!grepl("17", row.names(class.pca.protg_allsamples.pp.mat)),] class.pca.out.allsamples <- prcomp(class.pca.protg_allsamples.pp.mat, center = TRUE, scale. 
= TRUE) class.pca.out.df.allsamples <- as.data.frame(class.pca.out.allsamples$x) class.pca.out.df.allsamples$name <- rownames(class.pca.out.df.allsamples) class.pca.out.df.allsamples$phase <- "mitosis" class.pca.out.df.allsamples$phase[grep("P[12345678][abcd]", class.pca.out.df.allsamples$name)] <- "interphase" class.pca.out.df.allsamples$inj <- grepl(".i2", class.pca.out.df.allsamples$name) class.pca.out.df.allsamples$pop <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", class.pca.out.df.allsamples$name) class.pca.out.df.allsamples$biorep <- gsub("ppm.P[123456789][1234567890]{0,1}([abcd]).{0,1}.{0,1}.{0,1}.{0,1}", "\\1", class.pca.out.df.allsamples$name) ggplot(class.pca.out.df.allsamples, aes(x = PC1, y = PC2, label = name)) + geom_point(aes(colour = pop, shape = phase), size = 1) + theme_bw() + theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'), legend.position = "none") class.test.mat <- pca.protg_allsamples.pp.mat[grepl(test.reps, row.names(pca.protg_allsamples.pp.mat)),] class.test.mat <- class.test.mat[!grepl("17", row.names(class.test.mat)),] class.test.mat <- scale(class.test.mat) test.pca.transformed.mat <- class.test.mat%*%class.pca.out.allsamples$rotation knn.train <- class.pca.out.df.allsamples[,c("PC1", "PC2")] knn.test <- test.pca.transformed.mat[,c("PC1", "PC2")] knn.cl <- factor(as.character(class.pca.out.df.allsamples$pop), levels=1:16) knn.test.cl <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", row.names(knn.test)) set.seed(1) knn.out <- knn(knn.train, knn.test, knn.cl, k = 6) knn.out plot(as.numeric(as.character(knn.out)), as.numeric(as.character(knn.test.cl))) data.frame(pred_pop = as.numeric(as.character(knn.out)), true_pop = as.numeric(as.character(knn.test.cl))) %>% lm(formula = true_pop ~ pred_pop) %>% summary() circ.df <- data.frame(pred_pop = as.numeric(as.character(knn.out)), true_pop = as.numeric(as.character(knn.test.cl))) circ.df <- 
circ.df/16*2*pi mean(apply(circ.df, 1, function(x) {atan2(sin(x[1]-x[2]), cos(x[1]-x[2])) }) / 2 / pi * 16) sd(apply(circ.df, 1, function(x) {atan2(sin(x[1]-x[2]), cos(x[1]-x[2])) }) / 2 / pi * 16) nrow(circ.df) data.frame(pred_pop = as.numeric(as.character(knn.out)), true_pop = as.numeric(as.character(knn.test.cl))) %>% ggplot(aes(x = true_pop, y = pred_pop)) + geom_point() + theme_bw() + xlab("Population") + ylab("kNN-Predicted Population") + scale_x_continuous(breaks = 1:16) + scale_y_continuous(breaks = 1:16) + theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'), legend.position = "none", panel.grid.minor = element_blank()) ggsave("kNN_output_c_test.pdf", height = 2.5, width = 3, useDingbats = F) #d test train.reps <- "a|b|c" test.reps <- "d" class.pca.protg_allsamples.pp.mat <- pca.protg_allsamples.pp.mat[grepl(train.reps, row.names(pca.protg_allsamples.pp.mat)),] class.pca.protg_allsamples.pp.mat <- class.pca.protg_allsamples.pp.mat[!grepl("17", row.names(class.pca.protg_allsamples.pp.mat)),] class.pca.out.allsamples <- prcomp(class.pca.protg_allsamples.pp.mat, center = TRUE, scale. 
= TRUE) class.pca.out.df.allsamples <- as.data.frame(class.pca.out.allsamples$x) class.pca.out.df.allsamples$name <- rownames(class.pca.out.df.allsamples) class.pca.out.df.allsamples$phase <- "mitosis" class.pca.out.df.allsamples$phase[grep("P[12345678][abcd]", class.pca.out.df.allsamples$name)] <- "interphase" class.pca.out.df.allsamples$inj <- grepl(".i2", class.pca.out.df.allsamples$name) class.pca.out.df.allsamples$pop <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", class.pca.out.df.allsamples$name) class.pca.out.df.allsamples$biorep <- gsub("ppm.P[123456789][1234567890]{0,1}([abcd]).{0,1}.{0,1}.{0,1}.{0,1}", "\\1", class.pca.out.df.allsamples$name) ggplot(class.pca.out.df.allsamples, aes(x = PC1, y = PC2, label = name)) + geom_point(aes(colour = pop, shape = phase), size = 1) + theme_bw() + theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'), legend.position = "none") class.test.mat <- pca.protg_allsamples.pp.mat[grepl(test.reps, row.names(pca.protg_allsamples.pp.mat)),] class.test.mat <- class.test.mat[!grepl("17", row.names(class.test.mat)),] class.test.mat <- scale(class.test.mat) test.pca.transformed.mat <- class.test.mat%*%class.pca.out.allsamples$rotation knn.train <- class.pca.out.df.allsamples[,c("PC1", "PC2")] knn.test <- test.pca.transformed.mat[,c("PC1", "PC2")] knn.cl <- factor(as.character(class.pca.out.df.allsamples$pop), levels=1:16) knn.test.cl <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", row.names(knn.test)) set.seed(1) knn.out <- knn(knn.train, knn.test, knn.cl, k = 6) knn.out plot(as.numeric(as.character(knn.out)), as.numeric(as.character(knn.test.cl))) data.frame(pred_pop = as.numeric(as.character(knn.out)), true_pop = as.numeric(as.character(knn.test.cl))) %>% lm(formula = true_pop ~ pred_pop) %>% summary() circ.df <- data.frame(pred_pop = as.numeric(as.character(knn.out)), true_pop = as.numeric(as.character(knn.test.cl))) circ.df <- 
circ.df/16*2*pi mean(apply(circ.df, 1, function(x) {atan2(sin(x[1]-x[2]), cos(x[1]-x[2])) }) / 2 / pi * 16) sd(apply(circ.df, 1, function(x) {atan2(sin(x[1]-x[2]), cos(x[1]-x[2])) }) / 2 / pi * 16) nrow(circ.df) data.frame(pred_pop = as.numeric(as.character(knn.out)), true_pop = as.numeric(as.character(knn.test.cl))) %>% ggplot(aes(x = true_pop, y = pred_pop)) + geom_point() + theme_bw() + xlab("Population") + ylab("kNN-Predicted Population") + scale_x_continuous(breaks = 1:16) + scale_y_continuous(breaks = 1:16) + theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'), legend.position = "none", panel.grid.minor = element_blank()) ggsave("kNN_output_d_test.pdf", height = 2.5, width = 3, useDingbats = F) # classification of pop 17 ####### train.reps <- "a|b|c|d" test.reps <- "17" class.pca.protg_allsamples.pp.mat <- pca.protg_allsamples.pp.mat[grepl(train.reps, row.names(pca.protg_allsamples.pp.mat)),] class.pca.protg_allsamples.pp.mat <- class.pca.protg_allsamples.pp.mat[!grepl("17", row.names(class.pca.protg_allsamples.pp.mat)),] class.pca.out.allsamples <- prcomp(class.pca.protg_allsamples.pp.mat, center = TRUE, scale. 
= TRUE) class.pca.out.df.allsamples <- as.data.frame(class.pca.out.allsamples$x) class.pca.out.df.allsamples$name <- rownames(class.pca.out.df.allsamples) class.pca.out.df.allsamples$phase <- "mitosis" class.pca.out.df.allsamples$phase[grep("P[12345678][abcd]", class.pca.out.df.allsamples$name)] <- "interphase" class.pca.out.df.allsamples$inj <- grepl(".i2", class.pca.out.df.allsamples$name) class.pca.out.df.allsamples$pop <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", class.pca.out.df.allsamples$name) class.pca.out.df.allsamples$biorep <- gsub("ppm.P[123456789][1234567890]{0,1}([abcd]).{0,1}.{0,1}.{0,1}.{0,1}", "\\1", class.pca.out.df.allsamples$name) ggplot(class.pca.out.df.allsamples, aes(x = PC1, y = PC2, label = name)) + geom_point(aes(colour = pop, shape = phase), size = 1) + theme_bw() + theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'), legend.position = "none") class.test.mat <- pca.protg_allsamples.pp.mat[grepl(test.reps, row.names(pca.protg_allsamples.pp.mat)),] class.test.mat <- scale(class.test.mat) class.test.mat[is.na(class.test.mat)] <- 0 test.pca.transformed.mat <- class.test.mat%*%class.pca.out.allsamples$rotation knn.train <- class.pca.out.df.allsamples[,c("PC1", "PC2")] knn.test <- test.pca.transformed.mat[,c("PC1", "PC2")] knn.cl <- factor(as.character(class.pca.out.df.allsamples$pop), levels=1:16) knn.test.cl <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", row.names(knn.test)) set.seed(1) knn.out <- knn(knn.train, knn.test, knn.cl, k = 6) knn.out plot(as.numeric(as.character(knn.out)), as.numeric(as.character(knn.test.cl))) ####### using all 119 features for kNN ############# train.reps <- "b|c|d" test.reps <- "a" class.pca.protg_allsamples.pp.mat <- pca.protg_allsamples.pp.mat[grepl(train.reps, row.names(pca.protg_allsamples.pp.mat)),] class.pca.protg_allsamples.pp.mat <- class.pca.protg_allsamples.pp.mat[!grepl("17", 
row.names(class.pca.protg_allsamples.pp.mat)),] class.pca.protg_allsamples.pp.mat <- scale(class.pca.protg_allsamples.pp.mat) class.test.mat <- pca.protg_allsamples.pp.mat[grepl(test.reps, row.names(pca.protg_allsamples.pp.mat)),] class.test.mat <- class.test.mat[!grepl("17", row.names(class.test.mat)),] class.test.mat <- scale(class.test.mat) knn.train <- class.pca.protg_allsamples.pp.mat knn.test <- class.test.mat knn.cl <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", row.names(knn.train)) knn.test.cl <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", row.names(knn.test)) set.seed(1) knn.out <- knn(knn.train, knn.test, knn.cl, k = 6) knn.out plot(as.numeric(as.character(knn.out)), as.numeric(as.character(knn.test.cl))) data.frame(pred_pop = as.numeric(as.character(knn.out)), true_pop = as.numeric(as.character(knn.test.cl))) %>% lm(formula = true_pop ~ pred_pop) %>% summary() circ.df <- data.frame(pred_pop = as.numeric(as.character(knn.out)), true_pop = as.numeric(as.character(knn.test.cl))) circ.df <- circ.df/16*2*pi mean(apply(circ.df, 1, function(x) {atan2(sin(x[1]-x[2]), cos(x[1]-x[2])) }) / 2 / pi * 16) sd(apply(circ.df, 1, function(x) {atan2(sin(x[1]-x[2]), cos(x[1]-x[2])) }) / 2 / pi * 16) nrow(circ.df) data.frame(pred_pop = as.numeric(as.character(knn.out)), true_pop = as.numeric(as.character(knn.test.cl))) %>% ggplot(aes(x = true_pop, y = pred_pop)) + geom_point() + theme_bw() + xlab("Population") + ylab("kNN-Predicted Population") + scale_x_continuous(breaks = 1:16) + scale_y_continuous(breaks = 1:16) + theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'), legend.position = "none", panel.grid.minor = element_blank()) ggsave("kNN_output_a_test_119features.pdf", height = 2.5, width = 3, useDingbats = F) # b test train.reps <- "a|c|d" test.reps <- "b" class.pca.protg_allsamples.pp.mat <- pca.protg_allsamples.pp.mat[grepl(train.reps, 
row.names(pca.protg_allsamples.pp.mat)),] class.pca.protg_allsamples.pp.mat <- class.pca.protg_allsamples.pp.mat[!grepl("17", row.names(class.pca.protg_allsamples.pp.mat)),] class.pca.protg_allsamples.pp.mat <- scale(class.pca.protg_allsamples.pp.mat) class.test.mat <- pca.protg_allsamples.pp.mat[grepl(test.reps, row.names(pca.protg_allsamples.pp.mat)),] class.test.mat <- class.test.mat[!grepl("17", row.names(class.test.mat)),] class.test.mat <- scale(class.test.mat) knn.train <- class.pca.protg_allsamples.pp.mat knn.test <- class.test.mat knn.cl <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", row.names(knn.train)) knn.test.cl <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", row.names(knn.test)) set.seed(1) knn.out <- knn(knn.train, knn.test, knn.cl, k = 6) knn.out plot(as.numeric(as.character(knn.out)), as.numeric(as.character(knn.test.cl))) data.frame(pred_pop = as.numeric(as.character(knn.out)), true_pop = as.numeric(as.character(knn.test.cl))) %>% lm(formula = true_pop ~ pred_pop) %>% summary() circ.df <- data.frame(pred_pop = as.numeric(as.character(knn.out)), true_pop = as.numeric(as.character(knn.test.cl))) circ.df <- circ.df/16*2*pi mean(apply(circ.df, 1, function(x) {atan2(sin(x[1]-x[2]), cos(x[1]-x[2])) }) / 2 / pi * 16) sd(apply(circ.df, 1, function(x) {atan2(sin(x[1]-x[2]), cos(x[1]-x[2])) }) / 2 / pi * 16) nrow(circ.df) data.frame(pred_pop = as.numeric(as.character(knn.out)), true_pop = as.numeric(as.character(knn.test.cl))) %>% ggplot(aes(x = true_pop, y = pred_pop)) + geom_point() + theme_bw() + xlab("Population") + ylab("kNN-Predicted Population") + scale_x_continuous(breaks = 1:16) + scale_y_continuous(breaks = 1:16) + theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'), legend.position = "none", panel.grid.minor = element_blank()) ggsave("kNN_output_b_test_119features.pdf", height = 2.5, width = 3, useDingbats = F) #c test train.reps <- "a|b|d" 
test.reps <- "c"

# Training matrix: rows whose name matches train.reps, minus any row whose
# name contains "17", then column-scaled.
train.row.hits <- grepl(train.reps, row.names(pca.protg_allsamples.pp.mat))
class.pca.protg_allsamples.pp.mat <- pca.protg_allsamples.pp.mat[train.row.hits, ]
train.p17.hits <- grepl("17", row.names(class.pca.protg_allsamples.pp.mat))
class.pca.protg_allsamples.pp.mat <- class.pca.protg_allsamples.pp.mat[!train.p17.hits, ]
class.pca.protg_allsamples.pp.mat <- scale(class.pca.protg_allsamples.pp.mat)

# Test matrix: the held-out replicate, filtered and scaled the same way
# (scaled on its own columns).
test.row.hits <- grepl(test.reps, row.names(pca.protg_allsamples.pp.mat))
class.test.mat <- pca.protg_allsamples.pp.mat[test.row.hits, ]
test.p17.hits <- grepl("17", row.names(class.test.mat))
class.test.mat <- scale(class.test.mat[!test.p17.hits, ])

# kNN on the full scaled feature matrices; class labels are the population
# numbers parsed from the row names.
knn.train <- class.pca.protg_allsamples.pp.mat
knn.test <- class.test.mat
knn.cl <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}",
               "\\1", row.names(knn.train))
knn.test.cl <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}",
                    "\\1", row.names(knn.test))
set.seed(1)
knn.out <- knn(knn.train, knn.test, knn.cl, k = 6)
knn.out

# Predicted vs true population: base scatter and linear fit.
plot(as.numeric(as.character(knn.out)), as.numeric(as.character(knn.test.cl)))
knn.pred.vs.true <- data.frame(pred_pop = as.numeric(as.character(knn.out)),
                               true_pop = as.numeric(as.character(knn.test.cl)))
knn.pred.vs.true %>%
  lm(formula = true_pop ~ pred_pop) %>%
  summary()

# Signed circular error in population units (16 populations on a circle).
circ.df <- knn.pred.vs.true
circ.df <- circ.df/16*2*pi
circ.err <- with(circ.df,
                 atan2(sin(pred_pop - true_pop), cos(pred_pop - true_pop))) / 2 / pi * 16
mean(circ.err)
sd(circ.err)
nrow(circ.df)

# Must remain the last ggplot built here: the ggsave() call that follows
# this section saves the most recent plot.
knn.pred.vs.true %>%
  ggplot(aes(x = true_pop, y = pred_pop)) +
  geom_point() +
  theme_bw() +
  xlab("Population") +
  ylab("kNN-Predicted Population") +
  scale_x_continuous(breaks = 1:16) +
  scale_y_continuous(breaks = 1:16) +
  theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'),
        legend.position = "none",
        panel.grid.minor = element_blank())
# Saves the plot produced by the "c" fold immediately above (last plot built).
ggsave("kNN_output_c_test_119features.pdf", height = 2.5, width = 3, useDingbats = FALSE)

#d test
train.reps <- "a|b|c"
test.reps <- "d"
class.pca.protg_allsamples.pp.mat <-
  pca.protg_allsamples.pp.mat[grepl(train.reps, row.names(pca.protg_allsamples.pp.mat)), ]
class.pca.protg_allsamples.pp.mat <-
  class.pca.protg_allsamples.pp.mat[!grepl("17", row.names(class.pca.protg_allsamples.pp.mat)), ]
class.pca.protg_allsamples.pp.mat <- scale(class.pca.protg_allsamples.pp.mat)
class.test.mat <-
  pca.protg_allsamples.pp.mat[grepl(test.reps, row.names(pca.protg_allsamples.pp.mat)), ]
class.test.mat <- class.test.mat[!grepl("17", row.names(class.test.mat)), ]
class.test.mat <- scale(class.test.mat)
knn.train <- class.pca.protg_allsamples.pp.mat
knn.test <- class.test.mat
knn.cl <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}",
               "\\1", row.names(knn.train))
knn.test.cl <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}",
                    "\\1", row.names(knn.test))
set.seed(1)
knn.out <- knn(knn.train, knn.test, knn.cl, k = 6)
knn.out
plot(as.numeric(as.character(knn.out)), as.numeric(as.character(knn.test.cl)))
knn.d.fold.df <- data.frame(pred_pop = as.numeric(as.character(knn.out)),
                            true_pop = as.numeric(as.character(knn.test.cl)))
knn.d.fold.df %>% lm(formula = true_pop ~ pred_pop) %>% summary()
# signed circular error in population units (16 populations on a circle)
circ.df <- knn.d.fold.df/16*2*pi
circ.err <- with(circ.df,
                 atan2(sin(pred_pop - true_pop), cos(pred_pop - true_pop))) / 2 / pi * 16
mean(circ.err)
sd(circ.err)
nrow(circ.df)
knn.d.fold.plot <- ggplot(knn.d.fold.df, aes(x = true_pop, y = pred_pop)) +
  geom_point() +
  theme_bw() +
  xlab("Population") +
  ylab("kNN-Predicted Population") +
  scale_x_continuous(breaks = 1:16) +
  scale_y_continuous(breaks = 1:16) +
  theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'),
        legend.position = "none",
        panel.grid.minor = element_blank())
knn.d.fold.plot
ggsave("kNN_output_d_test_119features.pdf", plot = knn.d.fold.plot,
       height = 2.5, width = 3, useDingbats = FALSE)

####### protein linegraphs for heatmap figure ############

# Line graph of mean intensity per population (population 17 excluded) for the
# protein group(s) in `protg.full_cc.long` whose Gene.names match the regular
# expression `gene.name`. Means are scaled to each group's maximum; error bars
# are +/- the standard error scaled by the same factor. Returns a ggplot.
plot_fig_linegraph_scaled_to_max <- function(gene.name) {
  protg.full_cc.long %>%
    filter(grepl(gene.name, Gene.names)) %>%
    filter(pop != 17) %>%
    group_by(Gene.names, pop) %>%
    summarise(mean_LFQ = mean(value, na.rm = TRUE),
              # divide by the number of non-missing values: sd() drops the
              # NAs, so n() would overstate the sample size
              se_LFQ = sd(value, na.rm = TRUE) / sqrt(sum(!is.na(value)))) %>%
    group_by(Gene.names) %>%
    mutate(scaled_LFQ = mean_LFQ / max(mean_LFQ, na.rm = TRUE),
           scaled_se = se_LFQ / max(mean_LFQ, na.rm = TRUE)) %>%
    ggplot(aes(x = pop, y = scaled_LFQ)) +
    # one line per protein group, so a pattern that matches several groups
    # does not produce a single zig-zag through all of them
    geom_line(aes(group = Gene.names), size = 0.5, colour = "darkred") +
    geom_errorbar(aes(ymin = scaled_LFQ - scaled_se, ymax = scaled_LFQ + scaled_se),
                  width = .2, position = position_dodge(.9), alpha = 0.8, size = 0.2) +
    scale_x_continuous(breaks = seq(0, 17, 1)) +
    scale_y_continuous(breaks = seq(0, 1.2, 0.2)) +
    expand_limits(y = 0) +
    theme_bw() +
    theme(text = element_text(size = 8, family = "Helvetica", colour = 'black'),
          axis.title = element_text(face = "bold"),
          axis.text.x = element_blank(),
          axis.text.y = element_text(size = 6, colour = 'black'),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_blank(),
          legend.position = "none") +
    xlab("Population") +
    ylab("Normalised Intensity")
}

# Gene-name patterns that are plotted AND saved, in the original order.
# "AURKB" / "SRSF6" appear twice (unanchored, then anchored with "$"); both
# write the same file name, so the later, anchored version is the one kept.
linegraph.saved.patterns <- c(
  "SRSF2", "SRSF3", "SRSF6", "CCNA2", "CCNB1", "CCNB2", "AURKB", "ATAD2$",
  "FAM111B", "SLBP$", "CHAF1B$", "AURKA$", "AURKB$", "INCENP$", "FAM83D$",
  "CDCA8$", "TPX2$", "BUB1$", "HMMR$", "NUSAP1$", "GTSE1$", "SRSF6$", "HDGF$",
  "^RRM2$", "^KIF18B$"
)
for (gene.pattern in linegraph.saved.patterns) {
  linegraph <- plot_fig_linegraph_scaled_to_max(gene.pattern)
  print(linegraph)
  # file name = pattern without its regex anchors
  ggsave(paste0(gsub("[$^]", "", gene.pattern), "_linegraph_scaled_to_max.pdf"),
         plot = linegraph, height = 1.5, width = 1.25, useDingbats = FALSE)
}

# Patterns that are only displayed, not saved.
linegraph.viewed.patterns <- c(
  "^MIS18BP1$", "^PIF1$", "^MINPP1$", "^NET1$", "^CTNNB1$", "^ZNF362$",
  "^ERF$", "^DDB2$", "^CENPE$", "^CEP78$", "^EXO1$", "^ATAD2$"
)
for (gene.pattern in linegraph.viewed.patterns) {
  print(plot_fig_linegraph_scaled_to_max(gene.pattern))
}

#peptide analysis
# NOTE(review): the read.csv() call looks commented out in the original
# ("# pep <- read.csv(...)"), which would leave `pep` undefined in a fresh
# session. It is therefore only executed when `pep` does not exist yet;
# confirm which behaviour is wanted.
if (!exists("pep", inherits = FALSE)) {
  pep <- read.csv("../../../Full dataset/full_reprocessed_fdr/data.filtered.peptide.long.csv")
}
pep_withlibrary <- pep
# population number / replicate suffix parsed from the Experiment name; rows
# whose Experiment yields no numeric population are dropped
pep$pop <- as.numeric(gsub(".+?P([1234567890]{1,2}).*", "\\1", pep$Experiment))
pep <- subset(pep, !is.na(pep$pop))
pep$rep <- as.factor(gsub(".+?P([1234567890]{1,2})(.*)", "\\2", pep$Experiment))

STY_indices <- grep("[STY]", pep$Sequence)
ProD_indices <- grep("[ST]P", pep$Sequence)
CDKmotif_indices <- grep("[ST]P..[KR]", pep$Sequence)
CDKmotif_logical <- grepl("[ST]P..[KR]", pep$Sequence)
pep$CDKmotif_logical <- CDKmotif_logical

# Smoothed intensity profile per peptide sequence for the rows `row.indices`
# of `peptides`: intensities are summed per sequence and population, scaled
# to each sequence's maximum, and drawn one facet per sequence.
plot_peptide_profiles <- function(peptides, row.indices) {
  peptides %>%
    slice(row.indices) %>%
    group_by(Sequence, pop) %>%
    summarise(sumInt = sum(Intensity, na.rm = TRUE)) %>%
    group_by(Sequence) %>%
    mutate(normInt = sumInt / max(sumInt)) %>%
    ggplot(aes(x = pop, y = normInt)) +
    geom_smooth() +
    facet_grid(. ~ Sequence)
}

SRSF2_indices <- which(pep$Gene.names == "SRSF2")
plot_peptide_profiles(pep, SRSF2_indices)
SRSF3_indices <- which(pep$Gene.names == "SRSF3")
plot_peptide_profiles(pep, SRSF3_indices)
SRSF5_indices <- which(pep$Gene.names == "SRSF5")
plot_peptide_profiles(pep, SRSF5_indices)
SRSF6_indices <- which(pep$Gene.names == "SRSF6")
plot_peptide_profiles(pep, SRSF6_indices)

SR_indices <- which(grepl("SRSF2|SRSF3|SRSF5|SRSF6|SRRM2", pep$Gene.names))
# The gene filter is applied inside the pipeline: the original sliced with
# SR_indices AFTER dropping population 17, so indices computed on the
# unfiltered table pointed at the wrong rows.
this.df <- pep %>%
  filter(pop != 17, grepl("SRSF2|SRSF3|SRSF5|SRSF6|SRRM2", Gene.names)) %>%
  group_by(Sequence, pop) %>%
  summarise(sumInt = sum(Intensity, na.rm = TRUE)) %>%
  # group_by(Sequence) %>% mutate(normInt = sumInt/max(sumInt)) %>%
  mutate(STY_logical = grepl("[STY]", Sequence)) %>%
  # group_by(STY_logical, pop) %>% summarise(summInt = sum(sumInt)) %>%
  # group_by(STY_logical) %>% mutate(normInt = summInt/max(summInt)) %>%
  ggplot(aes(x = pop, y = sumInt)) +
  geom_smooth() +
  facet_grid(. ~ STY_logical)
# NOTE(review): the two statements below followed the plot in the original but
# cannot run against the active pipeline (select() has no data argument,
# `normInt` is only computed in the commented-out variant, and this.df is a
# ggplot object). Kept as comments; presumably they belong to that variant.
# select(STY_logical, pop, normInt) %>% spread(key = pop, value = normInt)
# cor(this.df[1:16,3], this.df[17:32,3])
this.df

# Fold change between the highest and lowest per-population mean intensity
# (population 17 excluded) for protein groups whose Gene.names match the
# regex `gene.name`. Returns one row (Gene.names, fc) per matching group.
scaled_to_max_fc <- function(gene.name) {
  protg.full_cc.long %>%
    filter(grepl(gene.name, Gene.names)) %>%
    filter(pop != 17) %>%
    group_by(Gene.names, pop) %>%
    summarise(mean_LFQ = mean(value, na.rm = TRUE)) %>%
    group_by(Gene.names) %>%
    mutate(scaled_LFQ = mean_LFQ / max(mean_LFQ, na.rm = TRUE)) %>%
    summarise(fc = max(scaled_LFQ) / min(scaled_LFQ))
}
# NOTE(review): gene names are passed as regular expressions here, so a name
# that is a substring of another also matches it.
sapply(protg.full_cc.wide.avg.pp$Gene.names, scaled_to_max_fc)

##### proteins unique to population 17 ########
obs.in.p17.logi <- !is.na(protg.full_cc.wide.avg[["17"]])
# TRUE where no value was observed in any of populations 1-16
not.obs.in.p1top16.logi <-
  rowSums(!is.na(protg.full_cc.wide.avg[, as.character(1:16)])) == 0
obs.in.p17.only.df <- protg.full_cc.wide.avg[obs.in.p17.logi & not.obs.in.p1top16.logi, ]

p17.p1.fc <- protg.full_cc.wide.avg[["17"]] / protg.full_cc.wide.avg[["1"]]
higher_in_p17.vs.p1.df <- protg.full_cc.wide.avg[which(p17.p1.fc > 2 & !is.na(p17.p1.fc)), ]
# each export overwrites the clipboard, so these are meant to be run one at a
# time (hence not looped)
export.to.clipboard(strsplit.extract(higher_in_p17.vs.p1.df$Gene.names, ';', 1))
export.to.clipboard(higher_in_p17.vs.p1.df$Base_Uniprot)

p17.comparison.df <- protg.full_cc.wide.avg
p17.comparison.df$p17.p1.fc <- protg.full_cc.wide.avg[["17"]] / protg.full_cc.wide.avg[["1"]]
p17.comparison.df$p17.p16.fc <- protg.full_cc.wide.avg[["17"]] / protg.full_cc.wide.avg[["16"]]
p17.comparison.df$p17.p8.fc <- protg.full_cc.wide.avg[["17"]] / protg.full_cc.wide.avg[["8"]]
export.to.clipboard(strsplit.extract(p17.comparison.df$Base_Uniprot, '-', 1))
my.write.table(p17.comparison.df, "p17.comparison.df.txt")
head(protg.full_cc.long)

# Row-wise comparison of the population-17 columns against a reference set of
# columns. Returns a list with
#   pval: Welch t-test p-value per row (NA when any value in the row is missing)
#   fc:   mean(P17) / mean(reference) per row, ignoring missing values
# Column groups are taken from the supplied vectors instead of hard-coded
# positions, so the result stays correct if the number of columns changes.
compare_to_p17 <- function(data, ref.cols, p17.cols) {
  ref.mat <- as.matrix(data[, ref.cols, drop = FALSE])
  p17.mat <- as.matrix(data[, p17.cols, drop = FALSE])
  pval <- vapply(seq_len(nrow(data)), function(i) {
    if (anyNA(ref.mat[i, ]) || anyNA(p17.mat[i, ])) {
      NA_real_
    } else {
      t.test(ref.mat[i, ], p17.mat[i, ])$p.value
    }
  }, numeric(1))
  names(pval) <- rownames(data)
  list(pval = pval,
       fc = rowMeans(p17.mat, na.rm = TRUE) / rowMeans(ref.mat, na.rm = TRUE))
}

#p8 vs p17 ########
protg.full_cc$pp_gene <-
  protg.full_cc$Protein.group.IDs %in% periodic.proteins.bothinj.df$Protein.group.IDs
p8.mycols <- grep("8", protg.full_cc.ppm.facs.mycols, value = TRUE)
p17.mycols <- grep("17", protg.full_cc.ppm.facs.mycols, value = TRUE)
p8.vs.p17 <- compare_to_p17(protg.full_cc, p8.mycols, p17.mycols)
p8.vs.p17.ttest.result <- p8.vs.p17$pval
protg.full_cc$p8.vs.p17.ttest.pval <- p8.vs.p17.ttest.result
p8.vs.p17.fc <- p8.vs.p17$fc  # despite the name this is P17 / P8
protg.full_cc$p8.vs.p17.fc <- p8.vs.p17.fc

# NOTE(review): `apcc_gene` is not created anywhere in this section; it is
# assumed to be an existing column of protg.full_cc.
protg.full_cc %>%
  subset(!is.na(p8.vs.p17.ttest.pval)) %>%
  ggplot(aes(x = log2(p8.vs.p17.fc), y = log(p8.vs.p17.ttest.pval, base = 0.05))) +
  geom_point(aes(label = Gene.names, colour = apcc_gene)) +
  xlab("Log2 Fold change (P17 / P8)") +
  ylab("Log0.05 p-value") +
  theme_bw() +
  theme(legend.position = "none") +
  theme(text = element_text(size = 20, family = "Helvetica"),
        axis.title = element_text(face = "bold"))
ggsave("p17 vs p8 volcano plot.pdf", useDingbats = FALSE, width = 8, height = 6)
ggplotly()

protg.full_cc %>%
  subset(!is.na(p8.vs.p17.ttest.pval)) %>%
  ggplot(aes(x = log2(p8.vs.p17.fc), y = log(p8.vs.p17.ttest.pval, base = 0.05))) +
  geom_point(aes(label = Gene.names, colour = pp_gene)) +
  xlab("Log2 Fold change (P17 / P8)") +
  ylab("Log0.05 p-value") +
  theme_bw() +
  theme(legend.position = "none") +
  theme(text = element_text(size = 20, family = "Helvetica"),
        axis.title = element_text(face = "bold"))

#p1 vs p17 ########
p1.mycols <- grep("ppm.P1[abcd]", protg.full_cc.ppm.facs.mycols, value = TRUE)
p1.vs.p17 <- compare_to_p17(protg.full_cc, p1.mycols, p17.mycols)
p1.vs.p17.ttest.result <- p1.vs.p17$pval
protg.full_cc$p1.vs.p17.ttest.pval <- p1.vs.p17.ttest.result
p1.vs.p17.fc <- p1.vs.p17$fc
protg.full_cc$p1.vs.p17.fc <- p1.vs.p17.fc

protg.full_cc %>%
  subset(!is.na(p1.vs.p17.ttest.pval)) %>%
  ggplot(aes(x = log2(p1.vs.p17.fc), y = log(p1.vs.p17.ttest.pval, base = 0.05))) +
  geom_point(aes(label = Gene.names, colour = pp_gene)) +
  xlab("Log2 Fold change (P17 / P1)") +
  ylab("Log0.05 p-value") +
  theme_bw() +
  theme(legend.position = "none") +
  theme(text = element_text(size = 20, family = "Helvetica"),
        axis.title = element_text(face = "bold"))
ggsave("p17 vs p1 volcano plot.pdf", useDingbats = FALSE, width = 8, height = 6)
ggplotly()

#p16 vs p17 ########
p16.mycols <- grep("ppm.P16[abcd]", protg.full_cc.ppm.facs.mycols, value = TRUE)
p16.vs.p17 <- compare_to_p17(protg.full_cc, p16.mycols, p17.mycols)
p16.vs.p17.ttest.result <- p16.vs.p17$pval
protg.full_cc$p16.vs.p17.ttest.pval <- p16.vs.p17.ttest.result
p16.vs.p17.fc <- p16.vs.p17$fc
# fixed copy-paste bug: the P1 fold change used to be stored in this column
protg.full_cc$p16.vs.p17.fc <- p16.vs.p17.fc

protg.full_cc %>%
  subset(!is.na(p16.vs.p17.ttest.pval)) %>%
  ggplot(aes(x = log2(p16.vs.p17.fc), y = log(p16.vs.p17.ttest.pval, base = 0.05))) +
  geom_point(aes(label = Gene.names), alpha = 0.3) +
  xlab("Log2 Fold change (P17 / P16)") +
  ylab("Log0.05 p-value") +
  theme_bw() +
  theme(legend.position = "none") +
  theme(text = element_text(size = 20, family = "Helvetica"),
        axis.title = element_text(face = "bold"))
ggsave("p17 vs p16 volcano plot.pdf", useDingbats = FALSE, width = 3.5, height = 3)
ggplotly()

#HINT1, PPP6R2 and BMI1 -- export data to PRISM
# one export per statement: each call replaces the clipboard contents
export.to.clipboard(protg.full_cc[grep("HINT1", protg.full_cc$Gene.names), c(p1.mycols)])
export.to.clipboard(protg.full_cc[grep("HINT1", protg.full_cc$Gene.names), c(p16.mycols)])
export.to.clipboard(protg.full_cc[grep("HINT1", protg.full_cc$Gene.names), c(p17.mycols)])
export.to.clipboard(protg.full_cc[grep("COMMD3-BMI1;BMI1", protg.full_cc$Gene.names), c(p1.mycols)])
export.to.clipboard(protg.full_cc[grep("COMMD3-BMI1;BMI1", protg.full_cc$Gene.names), c(p16.mycols)])
export.to.clipboard(protg.full_cc[grep("COMMD3-BMI1;BMI1", protg.full_cc$Gene.names), c(p17.mycols)])
# PPP6R2 rows: P1, P16 and P17 columns copied to the clipboard in turn.
export.to.clipboard(protg.full_cc[grep("PPP6R2", protg.full_cc$Gene.names), c(p1.mycols)])
export.to.clipboard(protg.full_cc[grep("PPP6R2", protg.full_cc$Gene.names), c(p16.mycols)])
export.to.clipboard(protg.full_cc[grep("PPP6R2", protg.full_cc$Gene.names), c(p17.mycols)])

#rest vs p17 ########
# The pattern accepts P<digit> optionally followed by 0-6 and then a/b/c/d,
# so "P17" cannot match and every other population can.
rest.mycols <- grep("ppm.P[1234567890][0123456]{0,1}[abcd]", protg.full_cc.ppm.facs.mycols, value = TRUE)
# Row-wise t-test; rows with any NA get NA instead of a p-value.
# NOTE(review): indices assume 134 "rest" columns followed by 4 P17 columns -- confirm.
rest.vs.p17.ttest.result <- apply(protg.full_cc[, c(rest.mycols, p17.mycols)], 1, function(x) {
  if (any(is.na(x))) {
    result <- NA
  } else {
    result <- t.test(x[1:134], x[135:138])$p.value
  }
  return(result)
})
protg.full_cc$rest.vs.p17.ttest.pval <- rest.vs.p17.ttest.result
# Fold change is mean(P17 columns) / mean(all other columns).
rest.vs.p17.fc <- apply(protg.full_cc[, c(rest.mycols, p17.mycols)], 1, function(x) {
  mean(x[135:138], na.rm = TRUE) / mean(x[1:134], na.rm = TRUE)
})
protg.full_cc$rest.vs.p17.fc <- rest.vs.p17.fc
protg.full_cc %>%
  subset(!is.na(rest.vs.p17.ttest.pval)) %>%
  ggplot(aes(x = log2(rest.vs.p17.fc), y = log(rest.vs.p17.ttest.pval, base = 0.05))) +
  geom_point(aes(label = Gene.names, colour = pp_gene)) +
  xlab("Log2 Fold change (P17 / rest)") +
  ylab("Log0.05 p-value") +
  theme_bw() +
  theme(legend.position="none") +
  theme(text = element_text(size = 20, family = "Helvetica"),
        axis.title = element_text(face = "bold"))
ggsave("p17 vs rest volcano plot.pdf", useDingbats = FALSE, width = 8, height = 6)
ggplotly()

############## EXTRA ANALYSES #################

###### trying to map eLife 2017 data onto PCA (did not work at all - coverage for 2017 paper was wayyy too low) ########
# NOTE: the PCA part of this section also uses elife.lfq.cols and
# elife2014_pp.long.sum.wide, which are only assigned in the eLife 2014
# section further down, so that section has to be run first.
elife2017 <- read.delim("~/Documents/GRE Documents/Finished Projects/FACS Mitotic Substages/pNB4/pNB4 TMT ALL pElu/Total/evidence.txt")
# Drop evidence rows whose Modifications mention "Phospho".
elife2017 <- subset(elife2017, !grepl("Phospho", elife2017$Modifications))
elife.tmt.cols <- grep("Reporter.intensity.[0123456789]$", colnames(elife2017), value = TRUE)
# Scale every reporter channel to its column sum, times 1e6.
norm.tmt.elife.df <- apply(elife2017[, elife.tmt.cols], 2, function(x) x / sum(x)) * 1E6
# Write the normalised values back over the reporter columns. The LFQ column
# names of the 2014 table were used here before, which left the reporter
# columns un-normalised.
elife2017[, elife.tmt.cols] <- norm.tmt.elife.df

# For every base accession of the periodic-protein table, the evidence rows
# whose Leading.Razor.Protein matches it (grep, so substring/regex matching).
elife2017_pp_indices <- sapply(strsplit.extract(protg.full_cc.wide.avg.pp$Base_Uniprot, '-', 1),
                               function(base_uniprot) {
                                 return(grep(base_uniprot, elife2017$Leading.Razor.Protein))
                               })
# Keep the reporter columns: the gather() below selects
# Reporter.intensity.0:Reporter.intensity.9.
elife2017_pp <- elife2017[unlist(elife2017_pp_indices), c("Leading.Razor.Protein", elife.tmt.cols)]
elife2017_pp.long <- gather(elife2017_pp, Reporter_channel, Reporter_intensity,
                            Reporter.intensity.0:Reporter.intensity.9)
# One query accession per matched evidence row.
# NOTE(review): this vector is shorter than the long table and appears to rely
# on recycling across the stacked channels -- confirm.
elife2017_pp.long$base_uniprot <- rep(names(elife2017_pp_indices),
                                      times = sapply(elife2017_pp_indices, length))
# Sum reporter intensity per accession and channel, then back to wide format.
elife2017_pp.long.sum <- elife2017_pp.long %>%
  group_by(base_uniprot, Reporter_channel) %>%
  summarise(sumIntensity = sum(Reporter_intensity))
elife2017_pp.long.sum.wide <- spread(elife2017_pp.long.sum, Reporter_channel, sumIntensity)

# Append the 2014 LFQ columns and the 2017 reporter columns (0 where unmatched).
pca.elife.df <- protg.full_cc.wide.avg.pp
pca.elife.df[, elife.lfq.cols] <- 0
pca.elife.df[match(elife2014_pp.long.sum.wide$base_uniprot,
                   strsplit.extract(pca.elife.df$Base_Uniprot, '-', 1)),
             elife.lfq.cols] <- elife2014_pp.long.sum.wide[, elife.lfq.cols]
pca.elife.df[, elife.tmt.cols] <- 0
pca.elife.df[match(elife2017_pp.long.sum.wide$base_uniprot,
                   strsplit.extract(pca.elife.df$Base_Uniprot, '-', 1)),
             elife.tmt.cols] <- elife2017_pp.long.sum.wide[, elife.tmt.cols]
# Keep rows with at least one positive reporter value.
pca.elife.df <- pca.elife.df[apply(pca.elife.df[, elife.tmt.cols], 1, function(x) any(x > 0)), ]
# Samples become rows; each sample is scaled across proteins, NA set to 0.
pca.mat.elife <- t(pca.elife.df[, as.character(c(1:17, elife.lfq.cols, elife.tmt.cols))])
pca.mat.elife <- t(apply(pca.mat.elife, 1, scale))
pca.mat.elife[is.na(pca.mat.elife)] <- 0
pca.out.elife <- prcomp(pca.mat.elife, center = TRUE, scale. = TRUE)
pca.out.df.elife <- as.data.frame(pca.out.elife$x)
pca.out.df.elife$name <- rownames(pca.out.df.elife)
ggplot(pca.out.df.elife, aes(x = PC1, y = PC2, label = name)) +
  geom_point(size = 1) +
  geom_text(hjust = 2, vjust = 0) +
  theme_bw() +
  theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'))
ggplotly()
fviz_pca_var(pca.out.elife,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)

###### trying to map eLife 2014 data onto PCA (looking good!) ########
elife2014 <- read.delim("~/Documents/zzz.GRE Documents/Finished Projects/Label Free Cell Cycle Manuscript/Supp_Table_1 (Proteomics Dataset).txt")
elife.lfq.cols <- grep("LFQ", colnames(elife2014), value = TRUE)

#signatures
filtered.sig <- read.delim("protein signatures/filtered_signature.txt")
min.sig <- read.delim("protein signatures/minimum_signature.txt")
sort.sig <- read.delim("protein signatures/sort_signature.txt")

# For each signature, the rows of elife2014 whose Majority_protein_IDs match
# each base accession (named by the accession that was searched for).
elife2014_pp_indices <- sapply(strsplit.extract(protg.full_cc.wide.avg.pp$Base_Uniprot, '-', 1),
                               function(base_uniprot) {
                                 return(grep(base_uniprot, elife2014$Majority_protein_IDs))
                               })
elife2014_filtered.pp_indices <- sapply(strsplit.extract(filtered.sig$Base_Uniprot, '-', 1),
                                        function(base_uniprot) {
                                          return(grep(base_uniprot, elife2014$Majority_protein_IDs))
                                        })
elife2014_min.pp_indices <- sapply(strsplit.extract(min.sig$Base_Uniprot, '-', 1),
                                   function(base_uniprot) {
                                     return(grep(base_uniprot, elife2014$Majority_protein_IDs))
                                   })
elife2014_sort.pp_indices <- sapply(strsplit.extract(sort.sig$Base_Uniprot, '-', 1),
                                    function(base_uniprot) {
                                      return(grep(base_uniprot, elife2014$Majority_protein_IDs))
                                    })

#full sig #######
elife2014_pp <- elife2014[unlist(elife2014_pp_indices),
                          c("gene_names", "Majority_protein_IDs",
                            grep("LFQ", colnames(elife2014), value = TRUE))]
elife2014_pp.long <- gather(elife2014_pp, Fraction, LFQ_Intensity, LFQ_intensity_F0:LFQ_intensity_F6)
# eLife 2014 full signature, continued: label each long-format row with the
# accession that was searched for, sum per accession and fraction, go back to
# wide format.
# NOTE(review): the rep() vector has one entry per matched wide row and appears
# to rely on recycling across the stacked fractions -- confirm.
elife2014_pp.long$base_uniprot <- rep(names(elife2014_pp_indices), times = sapply(elife2014_pp_indices, length))
elife2014_pp.long.sum <- elife2014_pp.long %>%
  group_by(base_uniprot, Fraction) %>%
  summarise(sumIntensity = sum(LFQ_Intensity))
elife2014_pp.long.sum.wide <- spread(elife2014_pp.long.sum, Fraction, sumIntensity)

# This dataset: divide columns '1'..'16' of every row by that row's mean.
protg.full_cc.wide.avg.pp.norm <- protg.full_cc.wide.avg.pp
temp <- t(apply(protg.full_cc.wide.avg.pp[,as.character(1:16)], 1, function(x) x/mean(x, na.rm=TRUE)))
protg.full_cc.wide.avg.pp.norm[,as.character(1:16)] <- temp
# Alternative kept for reference: divide by the first population instead.
#temp <- t(apply(protg.full_cc.wide.avg.pp[,as.character(1:16)], 1, function(x) x / x[1]))
#protg.full_cc.wide.avg.pp.norm[,as.character(1:16)] <- temp

# eLife 2014: divide every row by its first LFQ column.
elife2014_pp.long.sum.wide.norm <- elife2014_pp.long.sum.wide
elife2014_pp.long.sum.wide.norm[,elife.lfq.cols] <- t(apply(elife2014_pp.long.sum.wide[,elife.lfq.cols], 1, function(x) x/((x[1]))))
#elife2014_pp.long.sum.wide.norm[,elife.lfq.cols] <- t(apply(elife2014_pp.long.sum.wide[,elife.lfq.cols], 1, function(x) x / x[1]))

# Append the LFQ columns (0 where there is no match on base accession), then
# keep only rows where every value is present and greater than 0.
pca.elife.df <- protg.full_cc.wide.avg.pp.norm
pca.elife.df[, elife.lfq.cols] <- 0
pca.elife.df[match(elife2014_pp.long.sum.wide.norm$base_uniprot,
                   strsplit.extract(pca.elife.df$Base_Uniprot, '-', 1)),
             elife.lfq.cols] <- elife2014_pp.long.sum.wide.norm[, elife.lfq.cols]
pca.indices.without.zeroes <- apply(pca.elife.df[,as.character(c(1:16, elife.lfq.cols))], 1, function(x) all(x > 0 & !is.na(x)))
pca.elife.df <- pca.elife.df[pca.indices.without.zeroes,]

# Samples become rows; each sample is scaled across proteins, NA set to 0.
pca.mat.elife <- t(pca.elife.df[,as.character(c(1:16, elife.lfq.cols))])
pca.mat.elife <- t(apply(pca.mat.elife, 1, scale))
pca.mat.elife[is.na(pca.mat.elife)] <- 0
pca.out.elife <- prcomp(pca.mat.elife)
pca.out.df.elife <- as.data.frame(pca.out.elife$x)
pca.out.df.elife$name <- rownames(pca.out.df.elife)
ggplot(pca.out.df.elife, aes(x = PC1, y = PC2, label = name)) +
  geom_point(size = 1) +
  geom_text(nudge_x = 0.2, hjust = 0) +
  theme_bw() +
  theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'))
ggsave("Plots/elife2014 pca.pdf", height = 4, width = 5, useDingbats = FALSE)
ggplotly()
fviz_pca_var(pca.out.elife,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)

###### trying to map eLife 2015 data onto PCA ########
elife2015.orig <- read.delim("~/Documents/zzz.GRE Documents/Finished Projects/Elutriation vs Arrests/Final Versions/Revision/Supp_Table_1.txt")
# Copy the LFQ columns under shortened names (the "LFQ.intensity.arrest."
# prefix is stripped where it matches).
elife2015.lfq.cols.orig <- grep("LFQ", colnames(elife2015.orig), value = T)
elife2015.lfq.cols <- gsub("LFQ.intensity.arrest.(.+)", "\\1", elife2015.lfq.cols.orig)
elife2015.orig[,elife2015.lfq.cols] <- elife2015.orig[,elife2015.lfq.cols.orig]
elife2015 <- elife2015.orig[,c("Gene.names", "Majority.protein.IDs", elife2015.lfq.cols)]
#elife2015$Majority.protein.IDs
# Unlike the 2014 mapping, rows are matched on Gene.names (text before the
# first '-') rather than on accession.
elife2015_pp_indices <- sapply(strsplit.extract(protg.full_cc.wide.avg.pp$Gene.names, '-', 1), function(base_uniprot) {
  return(grep(base_uniprot, elife2015$Gene.names))
})
elife2015_pp <- elife2015[unlist(elife2015_pp_indices),c("Gene.names", "Majority.protein.IDs", elife2015.lfq.cols)]
elife2015_pp.long <- gather(elife2015_pp, key = Sample, value = LFQ_Intensity, -Gene.names, -Majority.protein.IDs)
# Gene.names is overwritten with the name that was searched for.
elife2015_pp.long$Gene.names <- rep(names(elife2015_pp_indices), times = sapply(elife2015_pp_indices, length))
elife2015_pp.long.sum <- elife2015_pp.long %>%
  group_by(Gene.names, Sample) %>%
  summarise(sumIntensity = sum(LFQ_Intensity))
elife2015_pp.long.sum.wide <- spread(elife2015_pp.long.sum, Sample, sumIntensity)

# This dataset: divide columns '1'..'16' of every row by that row's mean.
protg.full_cc.wide.avg.pp.norm <- protg.full_cc.wide.avg.pp
temp <- t(apply(protg.full_cc.wide.avg.pp[,as.character(1:16)], 1, function(x) x/mean(x, na.rm=TRUE)))
protg.full_cc.wide.avg.pp.norm[,as.character(1:16)] <- temp
#temp <- t(apply(protg.full_cc.wide.avg.pp[,as.character(1:16)], 1, function(x) x / x[1]))
#protg.full_cc.wide.avg.pp.norm[,as.character(1:16)] <- temp
elife2015_pp.long.sum.wide.norm <- elife2015_pp.long.sum.wide
# eLife 2015: divide every row by the mean of its first three LFQ columns.
elife2015_pp.long.sum.wide.norm[,elife2015.lfq.cols] <- t(apply(elife2015_pp.long.sum.wide[,elife2015.lfq.cols], 1, function(x) x/mean(x[1:3])))
#elife2014_pp.long.sum.wide.norm[,elife.lfq.cols] <- t(apply(elife2014_pp.long.sum.wide[,elife.lfq.cols], 1, function(x) x / x[1]))

# Append the 2015 columns (0 where there is no Gene.names match), then keep
# only rows where every value is present and greater than 0.
pca.elife.df <- protg.full_cc.wide.avg.pp.norm
pca.elife.df[, elife2015.lfq.cols] <- 0
pca.elife.df[match(elife2015_pp.long.sum.wide.norm$Gene.names,
                   strsplit.extract(pca.elife.df$Gene.names, '-', 1)),
             elife2015.lfq.cols] <- elife2015_pp.long.sum.wide.norm[, elife2015.lfq.cols]
pca.indices.without.zeroes <- apply(pca.elife.df[,as.character(c(1:16, elife2015.lfq.cols))], 1, function(x) all(x > 0 & !is.na(x)))
pca.elife.df <- pca.elife.df[pca.indices.without.zeroes,]

# Samples become rows; each sample is scaled across proteins, NA set to 0.
pca.mat.elife <- t(pca.elife.df[,as.character(c(1:16, elife2015.lfq.cols))])
pca.mat.elife <- t(apply(pca.mat.elife, 1, scale))
pca.mat.elife[is.na(pca.mat.elife)] <- 0
pca.out.elife <- prcomp(pca.mat.elife)
pca.out.df.elife <- as.data.frame(pca.out.elife$x)
pca.out.df.elife$name <- rownames(pca.out.df.elife)
ggplot(pca.out.df.elife, aes(x = PC1, y = PC2, label = name)) +
  geom_point(size = 1) +
  geom_text(nudge_x = 0.2, hjust = 0) +
  theme_bw() +
  theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'))
ggsave("Plots/elife2015 pca.pdf", height = 4, width = 5, useDingbats = FALSE)
fviz_pca_var(pca.out.elife,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)

######## classifying eLife 2014 with full dataset #########
# NOTE(review): these two eLife 2015 lines look like a leftover; the PCA and
# kNN that follow use the 2014 table -- confirm.
elife2015_pp.long.sum.wide.norm <- elife2015_pp.long.sum.wide
elife2015_pp.long.sum.wide.norm[,elife2015.lfq.cols] <- t(apply(elife2015_pp.long.sum.wide[,elife2015.lfq.cols], 1, function(x) x/mean(x[1:3])))
# One column per individual sample: populations below 17 only, variables
# containing "i2" excluded.
pca.protg_allsamples <- protg.full_cc.long %>%
  subset(pop < 17 & !grepl("i2", protg.full_cc.long$variable)) %>%
  select(Protein.group.IDs, Base_Uniprot, variable, value) %>%
  spread(variable, value)
# Restrict the per-sample table to the protein groups in
# protg.full_cc.wide.avg.pp, in that table's order.
pca.protg_allsamples.pp <- pca.protg_allsamples[match(protg.full_cc.wide.avg.pp$Protein.group.IDs, pca.protg_allsamples$Protein.group.IDs),]
# Divide every row by its mean over the sample columns (the two ID columns are
# skipped); NA results become 0.
temp <- t(apply(pca.protg_allsamples.pp[,-c(1,2)], 1, function(x) x/mean(x, na.rm=TRUE)))
temp[is.na(temp)] <- 0
pca.protg_allsamples.pp[,-c(1,2)] <- temp

# Append the normalised eLife 2014 columns (0 where there is no match).
pca.protg_allsamples.pp.elife2014 <- pca.protg_allsamples.pp
pca.protg_allsamples.pp.elife2014[, elife.lfq.cols] <- 0
pca.protg_allsamples.pp.elife2014[match(elife2014_pp.long.sum.wide.norm$base_uniprot,
                                        strsplit.extract(pca.protg_allsamples.pp.elife2014$Base_Uniprot, '-', 1)),
                                  elife.lfq.cols] <- elife2014_pp.long.sum.wide.norm[, elife.lfq.cols]
# Samples as rows; keep only proteins (columns) that are positive and non-NA
# in every sample.
pca.protg_allsamples.pp.elife2014.mat <- t(as.matrix(pca.protg_allsamples.pp.elife2014[,-c(1,2)]))
pca.indices.without.zeroes <- apply(pca.protg_allsamples.pp.elife2014.mat, 2, function(x) all(x > 0 & !is.na(x)))
pca.mat.elife <- pca.protg_allsamples.pp.elife2014.mat[,pca.indices.without.zeroes]
pca.mat.elife <- t(apply(pca.mat.elife, 1, scale))
pca.mat.elife[is.na(pca.mat.elife)] <- 0
pca.out.elife <- prcomp(pca.mat.elife)
pca.out.df.elife <- as.data.frame(pca.out.elife$x)
pca.out.df.elife$name <- rownames(pca.out.df.elife)

# Sample annotation parsed from the row names: rows containing "ppm" belong
# to this experiment, everything else is tagged "elife2014".
pca.out.df.elife$exp <- "this"
pca.out.df.elife$exp[!grepl("ppm", pca.out.df.elife$name)] <- "elife2014"
pca.out.df.elife$inj <- grepl(".i2", pca.out.df.elife$name)
# pop = population number, biorep = replicate letter; names that do not match
# the pattern are left unchanged by gsub().
pca.out.df.elife$pop <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", pca.out.df.elife$name)
pca.out.df.elife$pop[grepl("r", pca.out.df.elife$pop)] <- "unk"
pca.out.df.elife$biorep <- gsub("ppm.P[123456789][1234567890]{0,1}([abcd]).{0,1}.{0,1}.{0,1}.{0,1}", "\\1", pca.out.df.elife$name)
ggplot(pca.out.df.elife, aes(x = PC1, y = PC2, label = name, colour = exp)) +
  geom_point(size = 1) +
  geom_text(nudge_x = 0.2, hjust = 0) +
  theme_bw() +
  theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'))

# k-nearest-neighbour classification on PC1/PC2: rows of this experiment are
# the training set (class = population 1..16), all other rows are classified.
# NOTE(review): knn() is not provided by any package attached in the visible
# header; presumably class::knn -- confirm it is loaded elsewhere.
knn.train <- pca.out.df.elife[grepl("ppm", rownames(pca.out.df.elife)),c("PC1", "PC2")]
knn.test <- pca.out.df.elife[!grepl("ppm", rownames(pca.out.df.elife)),c("PC1", "PC2")]
knn.cl <- factor(as.character( pca.out.df.elife[grepl("ppm", rownames(pca.out.df.elife)),c("pop")]), levels=1:16)
set.seed(1)
knn.out <- knn(knn.train, knn.test, knn.cl, k = 6)
knn.out

######## classifying eLife 2014 and 2017 with full dataset #########
# NOTE(review): despite the header, the code in this section only adds the
# eLife 2015 columns (elife2015.lfq.cols) to the per-sample table.
elife2015_pp.long.sum.wide.norm <- elife2015_pp.long.sum.wide
elife2015_pp.long.sum.wide.norm[,elife2015.lfq.cols] <- t(apply(elife2015_pp.long.sum.wide[,elife2015.lfq.cols], 1, function(x) x/mean(x[1:3])))
# Same per-sample table as above, but keyed by Gene.names instead of Base_Uniprot.
pca.protg_allsamples <- protg.full_cc.long %>%
  subset(pop < 17 & !grepl("i2", protg.full_cc.long$variable)) %>%
  select(Protein.group.IDs, Gene.names, variable, value) %>%
  spread(variable, value)
pca.protg_allsamples.pp <- pca.protg_allsamples[match(protg.full_cc.wide.avg.pp$Protein.group.IDs, pca.protg_allsamples$Protein.group.IDs),]
# The table is written out before the row-mean normalisation below.
my.write.table(pca.protg_allsamples.pp, "pca.protg_allsamples.pp.txt")
temp <- t(apply(pca.protg_allsamples.pp[,-c(1,2)], 1, function(x) x/mean(x, na.rm=TRUE)))
temp[is.na(temp)] <- 0
pca.protg_allsamples.pp[,-c(1,2)] <- temp

# Append the normalised eLife 2015 columns (0 where there is no Gene.names match).
pca.protg_allsamples.pp.elife2015 <- pca.protg_allsamples.pp
pca.protg_allsamples.pp.elife2015[, elife2015.lfq.cols] <- 0
pca.protg_allsamples.pp.elife2015[match(elife2015_pp.long.sum.wide.norm$Gene.names,
                                        strsplit.extract(pca.protg_allsamples.pp.elife2015$Gene.names, '-', 1)),
                                  elife2015.lfq.cols] <- elife2015_pp.long.sum.wide.norm[, elife2015.lfq.cols]
# Samples as rows; keep only proteins that are positive and non-NA everywhere.
pca.protg_allsamples.pp.elife2015.mat <- t(as.matrix(pca.protg_allsamples.pp.elife2015[,-c(1,2)]))
pca.indices.without.zeroes <- apply(pca.protg_allsamples.pp.elife2015.mat, 2, function(x) all(x > 0 & !is.na(x)))
pca.mat.elife <- pca.protg_allsamples.pp.elife2015.mat[,pca.indices.without.zeroes]
pca.mat.elife <- t(apply(pca.mat.elife, 1, scale))
pca.mat.elife[is.na(pca.mat.elife)] <- 0
pca.out.elife <- prcomp(pca.mat.elife)
pca.out.df.elife <- as.data.frame(pca.out.elife$x)
pca.out.df.elife$name <- rownames(pca.out.df.elife)
# Sample annotation parsed from the row names: rows containing "ppm" belong
# to this experiment, everything else is tagged "elife2015".
pca.out.df.elife$exp <- "this"
pca.out.df.elife$exp[!grepl("ppm", pca.out.df.elife$name)] <- "elife2015"
pca.out.df.elife$inj <- grepl(".i2", pca.out.df.elife$name)
# pop = population number, biorep = replicate letter; names that do not match
# the pattern are left unchanged by gsub().
pca.out.df.elife$pop <- gsub("ppm.P([123456789][1234567890]{0,1})[abcd].{0,1}.{0,1}.{0,1}.{0,1}", "\\1", pca.out.df.elife$name)
pca.out.df.elife$pop[grepl("r", pca.out.df.elife$pop)] <- "unk"
pca.out.df.elife$biorep <- gsub("ppm.P[123456789][1234567890]{0,1}([abcd]).{0,1}.{0,1}.{0,1}.{0,1}", "\\1", pca.out.df.elife$name)
ggplot(pca.out.df.elife, aes(x = PC1, y = PC2, label = name, colour = exp)) +
  geom_point(size = 1) +
  geom_text(nudge_x = 0.2, hjust = 0) +
  theme_bw() +
  theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'))

# k-nearest-neighbour classification on PC1/PC2: rows of this experiment are
# the training set (class = population 1..16), all other rows are classified.
# NOTE(review): knn() is not provided by any package attached in the visible
# header; presumably class::knn -- confirm it is loaded elsewhere.
knn.train <- pca.out.df.elife[grepl("ppm", rownames(pca.out.df.elife)),c("PC1", "PC2")]
knn.test <- pca.out.df.elife[!grepl("ppm", rownames(pca.out.df.elife)),c("PC1", "PC2")]
knn.cl <- factor(as.character( pca.out.df.elife[grepl("ppm", rownames(pca.out.df.elife)),c("pop")]), levels=1:16)
set.seed(1)
knn.out <- knn(knn.train, knn.test, knn.cl, k = 6)
knn.out

##### UNUSED SCRIPTS ########
##### OTHER SIGNATURES -- UNUSED #######
# The three sections below repeat the eLife 2014 PCA with a different
# signature table each time. They overwrite the same global objects
# (elife2014_pp*, protg.full_cc.wide.avg.pp.norm, pca.*), and both datasets
# are normalised to the row mean here.

#filtered sig (about the same as full sig) ############
elife2014_pp <- elife2014[unlist(elife2014_filtered.pp_indices),c("gene_names", "Majority_protein_IDs", grep("LFQ", colnames(elife2014), value = T))]
elife2014_pp.long <- gather(elife2014_pp, Fraction, LFQ_Intensity, LFQ_intensity_F0:LFQ_intensity_F6)
elife2014_pp.long$base_uniprot <- rep(names(elife2014_filtered.pp_indices), times = sapply(elife2014_filtered.pp_indices, length))
elife2014_pp.long.sum <- elife2014_pp.long %>%
  group_by(base_uniprot, Fraction) %>%
  summarise(sumIntensity = sum(LFQ_Intensity))
elife2014_pp.long.sum.wide <- spread(elife2014_pp.long.sum, Fraction, sumIntensity)
# Rows of this dataset restricted to (and ordered by) the signature's protein groups.
protg.full_cc.wide.avg.pp.norm <- protg.full_cc.wide.avg.pp[match(filtered.sig$Protein.group.IDs,protg.full_cc.wide.avg.pp$Protein.group.IDs),]
temp <- t(apply(protg.full_cc.wide.avg.pp[match(filtered.sig$Protein.group.IDs,protg.full_cc.wide.avg.pp$Protein.group.IDs),as.character(1:16)], 1, function(x) x / mean(x, na.rm=TRUE)))
protg.full_cc.wide.avg.pp.norm[,as.character(1:16)] <- temp
elife2014_pp.long.sum.wide.norm <- elife2014_pp.long.sum.wide
elife2014_pp.long.sum.wide.norm[,elife.lfq.cols] <- t(apply(elife2014_pp.long.sum.wide[,elife.lfq.cols], 1, function(x) x / mean(x, na.rm=TRUE)))
pca.elife.df <- protg.full_cc.wide.avg.pp.norm
pca.elife.df[, elife.lfq.cols] <- 0
pca.elife.df[match(elife2014_pp.long.sum.wide.norm$base_uniprot,
                   strsplit.extract(pca.elife.df$Base_Uniprot, '-', 1)),
             elife.lfq.cols] <- elife2014_pp.long.sum.wide.norm[, elife.lfq.cols]
# Only this variant drops rows containing zeros or NA before the PCA.
pca.indices.without.zeroes <- apply(pca.elife.df[,as.character(c(1:16, elife.lfq.cols))], 1, function(x) all(x > 0 & !is.na(x)))
pca.elife.df <- pca.elife.df[pca.indices.without.zeroes,]
pca.mat.elife <- t(pca.elife.df[,as.character(c(1:16, elife.lfq.cols))])
# Per-sample scaling is switched off for the signature variants.
#pca.mat.elife <- t(apply(pca.mat.elife, 1, scale))
pca.mat.elife[is.na(pca.mat.elife)] <- 0
pca.out.elife <- prcomp(pca.mat.elife)
pca.out.df.elife <- as.data.frame(pca.out.elife$x)
pca.out.df.elife$name <- rownames(pca.out.df.elife)
ggplot(pca.out.df.elife, aes(x = PC1, y = PC2, label = name)) +
  geom_point(size = 1) +
  geom_text(hjust = 1.2) +
  theme_bw() +
  theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'))
fviz_pca_var(pca.out.elife,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)

#min sig (not as good as full sig) ############
elife2014_pp <- elife2014[unlist(elife2014_min.pp_indices),c("gene_names", "Majority_protein_IDs", grep("LFQ", colnames(elife2014), value = T))]
elife2014_pp.long <- gather(elife2014_pp, Fraction, LFQ_Intensity, LFQ_intensity_F0:LFQ_intensity_F6)
elife2014_pp.long$base_uniprot <- rep(names(elife2014_min.pp_indices), times = sapply(elife2014_min.pp_indices, length))
elife2014_pp.long.sum <- elife2014_pp.long %>%
  group_by(base_uniprot, Fraction) %>%
  summarise(sumIntensity = sum(LFQ_Intensity))
elife2014_pp.long.sum.wide <- spread(elife2014_pp.long.sum, Fraction, sumIntensity)
protg.full_cc.wide.avg.pp.norm <- protg.full_cc.wide.avg.pp[match(min.sig$Protein.group.IDs,protg.full_cc.wide.avg.pp$Protein.group.IDs),]
temp <- t(apply(protg.full_cc.wide.avg.pp[match(min.sig$Protein.group.IDs,protg.full_cc.wide.avg.pp$Protein.group.IDs),as.character(1:16)], 1, function(x) x / mean(x, na.rm=TRUE)))
protg.full_cc.wide.avg.pp.norm[,as.character(1:16)] <- temp
elife2014_pp.long.sum.wide.norm <- elife2014_pp.long.sum.wide
elife2014_pp.long.sum.wide.norm[,elife.lfq.cols] <- t(apply(elife2014_pp.long.sum.wide[,elife.lfq.cols], 1, function(x) x / mean(x, na.rm=TRUE)))
pca.elife.df <- protg.full_cc.wide.avg.pp.norm
pca.elife.df[, elife.lfq.cols] <- 0
pca.elife.df[match(elife2014_pp.long.sum.wide.norm$base_uniprot,
                   strsplit.extract(pca.elife.df$Base_Uniprot, '-', 1)),
             elife.lfq.cols] <- elife2014_pp.long.sum.wide.norm[, elife.lfq.cols]
# No zero/NA row filter here: unmatched rows keep their 0 placeholders.
pca.mat.elife <- t(pca.elife.df[,as.character(c(1:16, elife.lfq.cols))])
#pca.mat.elife <- t(apply(pca.mat.elife, 1, scale))
pca.mat.elife[is.na(pca.mat.elife)] <- 0
pca.out.elife <- prcomp(pca.mat.elife)
pca.out.df.elife <- as.data.frame(pca.out.elife$x)
pca.out.df.elife$name <- rownames(pca.out.df.elife)
ggplot(pca.out.df.elife, aes(x = PC1, y = PC2, label = name)) +
  geom_point(size = 1) +
  geom_text(hjust = 1.2) +
  theme_bw() +
  theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'))
ggplotly()
fviz_pca_var(pca.out.elife,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)

#sort sig (poor separation) ############
elife2014_pp <- elife2014[unlist(elife2014_sort.pp_indices),c("gene_names", "Majority_protein_IDs", grep("LFQ", colnames(elife2014), value = T))]
elife2014_pp.long <- gather(elife2014_pp, Fraction, LFQ_Intensity, LFQ_intensity_F0:LFQ_intensity_F6)
elife2014_pp.long$base_uniprot <- rep(names(elife2014_sort.pp_indices), times = sapply(elife2014_sort.pp_indices, length))
elife2014_pp.long.sum <- elife2014_pp.long %>%
  group_by(base_uniprot, Fraction) %>%
  summarise(sumIntensity = sum(LFQ_Intensity))
elife2014_pp.long.sum.wide <- spread(elife2014_pp.long.sum, Fraction, sumIntensity)
protg.full_cc.wide.avg.pp.norm <- protg.full_cc.wide.avg.pp[match(sort.sig$Protein.group.IDs,protg.full_cc.wide.avg.pp$Protein.group.IDs),]
temp <- t(apply(protg.full_cc.wide.avg.pp[match(sort.sig$Protein.group.IDs,protg.full_cc.wide.avg.pp$Protein.group.IDs),as.character(1:16)], 1, function(x) x / mean(x, na.rm=TRUE)))
protg.full_cc.wide.avg.pp.norm[,as.character(1:16)] <- temp
elife2014_pp.long.sum.wide.norm <- elife2014_pp.long.sum.wide
elife2014_pp.long.sum.wide.norm[,elife.lfq.cols] <- t(apply(elife2014_pp.long.sum.wide[,elife.lfq.cols], 1, function(x) x / mean(x, na.rm=TRUE)))
pca.elife.df <- protg.full_cc.wide.avg.pp.norm
pca.elife.df[, elife.lfq.cols] <- 0
pca.elife.df[match(elife2014_pp.long.sum.wide.norm$base_uniprot,
                   strsplit.extract(pca.elife.df$Base_Uniprot, '-', 1)),
             elife.lfq.cols] <- elife2014_pp.long.sum.wide.norm[, elife.lfq.cols]
pca.mat.elife <- t(pca.elife.df[,as.character(c(1:16, elife.lfq.cols))])
#pca.mat.elife <- t(apply(pca.mat.elife, 1, scale))
pca.mat.elife[is.na(pca.mat.elife)] <- 0
pca.out.elife <- prcomp(pca.mat.elife)
pca.out.df.elife <- as.data.frame(pca.out.elife$x)
pca.out.df.elife$name <- rownames(pca.out.df.elife)
ggplot(pca.out.df.elife, aes(x = PC1, y = PC2, label = name)) +
  geom_point(size = 1) +
  geom_text(hjust = 1.2) +
  theme_bw() +
  theme(text = element_text(size = 12, family = "Helvetica", colour = 'black'))
ggplotly()
fviz_pca_var(pca.out.elife,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)