###########################################################################################################
## calculate raw Ct technical variance and biological variance per gene for each timepoint
## calculate normalized Ct (to ref gene, aka dCt) technical variance and biological variance for each timepoint
## calculate ddCt and propagate error
## plot lfc for samples
# hld 11/28/18
###########################################################################################################
library(dplyr)
library(tidyr)
library(reshape2)
library(broom)
# library() instead of require(): fail loudly if the package is missing.
library(tidyverse)

###########################################################################################################
## Functions
#################

# Filter out bad technical replicates and summarise the survivors.
#
# Per (GeneName, BioRep, Timepoint) group:
#   1) if the CV of all tech reps is <= 1, keep every tech rep;
#   2) if the CV is > 1, drop the single tech rep furthest from the group mean.
# Groups whose CV is NA (e.g. a single tech rep, so sd() is NA) are removed by
# the filter call.
#
# Args:
#   df.raw: data frame with at least the columns GeneName, BioRep, Timepoint
#           and Cp (one row per technical replicate).
# Returns:
#   An ungrouped tibble with one row per (GeneName, BioRep, Timepoint) and the
#   columns Count, Cp_techAv, Cp_techSD, Cp_techSE and CV. Timepoint is
#   returned as a factor with levels P0, P7, P14, P21, P28, P35.
techrep_filt_stats <- function(df.raw) {
  # calculations performed: mean, standard deviation, coefficient of variation,
  # and abs distance from mean (per tech rep)
  techVar <- df.raw %>%
    dplyr::filter(!is.na(Cp)) %>% # get rid of NA Ct values
    dplyr::group_by(GeneName, BioRep, Timepoint) %>%
    dplyr::mutate(
      techAve = mean(Cp),
      techSD = sd(Cp),
      CV = (techSD / techAve) * 100,
      tr_diff = abs(techAve - Cp)
    ) %>%
    dplyr::ungroup()

  techAvg <- techVar %>%
    dplyr::group_by(GeneName, BioRep, Timepoint) %>%
    # which.max() returns a position within the group, so compare it with the
    # within-group row number rather than with the TechRep label (labels need
    # not be 1..n once NA rows have been dropped).
    dplyr::filter(
      CV <= 1 | (CV > 1 & dplyr::row_number() != which.max(tr_diff))
    ) %>%
    # recalculate summary statistics on filtered df
    dplyr::summarize(
      Count = dplyr::n(),
      Cp_techAv = mean(Cp),
      Cp_techSD = sd(Cp),
      # se() is not defined in this file; FSA (which the script attaches
      # later on) exports it, so call it by namespace.
      Cp_techSE = FSA::se(Cp)
    ) %>%
    dplyr::mutate(CV = (Cp_techSD / Cp_techAv) * 100) %>%
    dplyr::ungroup()

  # relevel factors in timepoint
  techAvg$Timepoint <- factor(
    techAvg$Timepoint,
    levels = c("P0", "P7", "P14", "P21", "P28", "P35")
  )

  # return filtered dataframe with summary stats
  techAvg
}

# Summary stats on the biological level.
#
# Adds Cp_bioSD and Cp_bioSE, the standard deviation and standard error of
# Cp_techAv across the rows of each (GeneName, Timepoint) group. The number of
# rows is unchanged and the result is returned ungrouped.
#
# Args:
#   techAv: output of techrep_filt_stats().
biorep_stats <- function(techAv) {
  techAv %>%
    dplyr::group_by(GeneName, Timepoint) %>%
    dplyr::mutate(
      Cp_bioSD = sd(Cp_techAv),
      Cp_bioSE = FSA::se(Cp_techAv)
    ) %>%
    dplyr::ungroup()
}

# Join GOI and reference gene dataframes and calculate delta Ct.
#
# Args:
#   biorepVar: output of biorep_stats() for one plate (all genes).
#   ref.df:    the rows of biorepVar that belong to the reference gene.
#   hkg:       name of the reference (housekeeping) gene; its rows are removed
#              from the result.
# Returns:
#   An ungrouped tibble with the columns GOI, BioRep, Timepoint, RefGene, dCp,
#   SD.tech.propGenes, SD.Ct.prop, SE.tech.propGenes and SE.Ct.prop.
calc_dCt <- function(biorepVar, ref.df, hkg) {
  # re-join ref gene values with tech rep Ct values and remove ref gene from GOI
  # propagate Ct level uncertainty using standard error propagation techniques
  dCt <- dplyr::ungroup(biorepVar) %>%
    # Match on Timepoint as well as BioRep so a sample is only normalised to
    # the reference gene measured at the same timepoint. If BioRep labels are
    # already unique per timepoint this gives the same rows as joining on
    # BioRep alone.
    dplyr::inner_join(
      dplyr::ungroup(ref.df),
      by = c("BioRep", "Timepoint")
    ) %>%
    dplyr::rename(
      GOI = GeneName.x,
      n.tech.goi = Count.x,
      Cp_techAv.goi = Cp_techAv.x,
      Cp_techSD.goi = Cp_techSD.x,
      Cp_techSE.goi = Cp_techSE.x,
      Cp_bioSD.goi = Cp_bioSD.x,
      Cp_bioSE.goi = Cp_bioSE.x,
      RefGene = GeneName.y,
      n.tech.ref = Count.y,
      Cp_techAv.ref = Cp_techAv.y,
      Cp_techSD.ref = Cp_techSD.y,
      Cp_techSE.ref = Cp_techSE.y,
      Cp_bioSD.ref = Cp_bioSD.y,
      Cp_bioSE.ref = Cp_bioSE.y
    ) %>%
    # calculations: delta Ct, propagated technical stdev & sem from Ct values
    # for both goi and ref gene (root sum of squares)
    dplyr::mutate(
      dCp = (Cp_techAv.goi - Cp_techAv.ref),
      SD.tech.propGenes = sqrt(Cp_techSD.ref^2 + Cp_techSD.goi^2),
      SE.tech.propGenes = sqrt(Cp_techSE.ref^2 + Cp_techSE.goi^2),
      SD.Ct.prop = sqrt(Cp_bioSD.ref^2 + Cp_bioSD.goi^2),
      SE.Ct.prop = sqrt(Cp_bioSE.ref^2 + Cp_bioSE.goi^2)
    ) %>%
    dplyr::select(
      GOI:Timepoint, RefGene, dCp,
      SD.tech.propGenes, SD.Ct.prop, SE.tech.propGenes, SE.Ct.prop
    ) %>%
    # use the hkg argument rather than a global variable
    dplyr::filter(GOI != hkg)

  dCt
}

# Calculate summary statistics for delta Ct.
#
# Args:
#   dCt: output of calc_dCt().
# Returns:
#   An ungrouped tibble with one row per (GOI, Timepoint) holding the mean,
#   stdev, sem and CV of dCp, the mean of the propagated bio-level SD/SE, and
#   the root-sum-of-squares of the per-sample technical SD/SE.
dCt_stats <- function(dCt) {
  dCt.sumstat <- dCt %>%
    dplyr::group_by(GOI, Timepoint) %>%
    # calculations: mean dCt, stdev and sem of dCt values, CV of dCt,
    # propagated stdev and sem to combine tech and bio level uncertainty
    dplyr::summarize(
      av_dCp = mean(dCp),
      sd_dCp = sd(dCp),
      se_dCp = FSA::se(dCp),
      CV = abs(sd_dCp / av_dCp) * 100,
      # using mean() bc same value for each sample
      SD.Ct.prop = mean(SD.Ct.prop),
      SE.Ct.prop = mean(SE.Ct.prop),
      SD.comb.tech = sqrt(sum(SD.tech.propGenes^2)),
      SE.comb.tech = sqrt(sum(SE.tech.propGenes^2))
    ) %>%
    dplyr::ungroup()

  dCt.sumstat
}

# Calculate delta delta Ct.
#
# Args:
#   dCt.sumstat:      output of dCt_stats() (all timepoints).
#   dCt.sumstat.ctrl: the rows of dCt.sumstat for the control sample only.
# Returns:
#   An ungrouped tibble with GOI, Timepoint, ddCp and the error columns
#   ddCp.calc.sd/se, ddCp.prop.sd/se and ddCp.comb.tech.sd/se. The error
#   columns are copied from the test sample's dCt statistics; the control
#   sample's error is not combined into them.
calc_ddCt <- function(dCt.sumstat, dCt.sumstat.ctrl) {
  ddCt <- dplyr::ungroup(dCt.sumstat) %>%
    # after the join, ".x" columns are the test sample, ".y" the control
    dplyr::inner_join(dplyr::ungroup(dCt.sumstat.ctrl), by = "GOI") %>%
    dplyr::transmute(
      GOI,
      Timepoint = Timepoint.x,
      ddCp = (av_dCp.x - av_dCp.y),
      ddCp.calc.sd = sd_dCp.x,
      ddCp.calc.se = se_dCp.x,
      ddCp.prop.sd = SD.Ct.prop.x,
      ddCp.prop.se = SE.Ct.prop.x,
      ddCp.comb.tech.sd = SD.comb.tech.x,
      ddCp.comb.tech.se = SE.comb.tech.x
    )

  ddCt
}

# Calculate ddCt fold change for visualization.
# Uses stderr, error calculated on delta Ct values.
#
# Adds upper.lim / lower.lim (ddCp -/+ ddCp.calc.se) and the fold changes
# relExp.FC, relExp.FC.low and relExp.FC.high computed as 2^(-x). Because of
# the sign flip, the smaller ddCp bound (upper.lim) gives relExp.FC.high.
# GOI is returned as a factor.
foldChange_ddCt_calcSE <- function(ddCt) {
  ddCt.FC.calc.se <- ddCt %>%
    dplyr::mutate(
      upper.lim = (ddCp - ddCp.calc.se),
      lower.lim = (ddCp + ddCp.calc.se),
      relExp.FC = 2^(-ddCp),
      relExp.FC.low = 2^(-lower.lim),
      relExp.FC.high = 2^(-upper.lim)
    )
  ddCt.FC.calc.se$GOI <- factor(ddCt.FC.calc.se$GOI)

  ddCt.FC.calc.se
}
# Calculate ddCt fold change for visualization.
# Uses stderr, error estimate is propagated from Ct values.
#
# Adds upper.lim / lower.lim (ddCp -/+ ddCp.prop.se) and the fold changes
# relExp.FC, relExp.FC.low and relExp.FC.high computed as 2^(-x). Because of
# the sign flip, the smaller ddCp bound (upper.lim) gives relExp.FC.high.
# GOI is returned as a factor.
foldChange_ddCt_propSE <- function(ddCt) {
  ddCt.FC.prop.se <- ddCt %>%
    dplyr::mutate(
      upper.lim = (ddCp - ddCp.prop.se),
      lower.lim = (ddCp + ddCp.prop.se),
      relExp.FC = 2^(-ddCp),
      relExp.FC.low = 2^(-lower.lim),
      relExp.FC.high = 2^(-upper.lim)
    )
  ddCt.FC.prop.se$GOI <- factor(ddCt.FC.prop.se$GOI)

  ddCt.FC.prop.se
}

###########################################################################################################
# set paths, wd, and load/clean data
path <- "/Users/hdingwall/Documents/Research/gradSchool/GallowayLab/qPCR_data/LC480/proliferationPaper"
plots_dir <- "/Users/hdingwall/Documents/Research/gradSchool/GallowayLab/qPCR_data/LC480/proliferationPaper/plots/"
stats_dir <- "/Users/hdingwall/Documents/Research/gradSchool/GallowayLab/qPCR_data/LC480/proliferationPaper/stats/"

# NOTE(review): setwd() is kept so that relative inputs/outputs below resolve
# exactly as before; consider building paths with file.path() instead.
setwd(path)

# The replicate-statistics functions call se(), which this file does not
# define; FSA exports it, so attach FSA before those functions are first used
# (it was previously attached only in the statistics section further down).
library(FSA)

df1 <- read.table("2017-08-14_timeSeries-plate1_cp_exported04-04-18.txt",
  header = TRUE, na.strings = "NA"
)
df2 <- read.table("2017-08-15_timeSeries-plate2_cp_exported04-04-18.txt",
  header = TRUE, na.strings = "NA"
)
df3 <- read.table("Ken_P7_Scx-redo_7-19-18_full_cp_data.txt",
  header = TRUE, na.strings = "NA"
)

# filter out bad samples known a priori
# had to re-do P7 Scx due to high variance; removed all Tnmd due to spurious
# results (likely mispriming)
df1 <- df1 %>%
  dplyr::filter(GeneName != "Tnmd", !is.na(Cp)) %>%
  # filter out bad P7 Scx values
  dplyr::filter(GeneName != "Scx" | Timepoint != "P7")

# Acan and Cx43 are not relevant to this project, remove
df2 <- df2 %>%
  dplyr::filter(GeneName != "Acan", GeneName != "Cx43", !is.na(Cp))
df2$GeneName <- dplyr::recode_factor(df2$GeneName, "Col3" = "Col3a1")

# make list of plate dataframes for easy iteration
plates <- list(plate1 = df1, plate2 = df2, plate3 = df3)

# compute stats on technical and biological replicates for all plates
techAvs <- lapply(plates, techrep_filt_stats)
biorepVars <- lapply(techAvs, biorep_stats)

# select reference gene values from all plates
ref.gene <- "Gapdh"
bioVars.ref <- lapply(
  biorepVars,
  function(x) dplyr::filter(x, GeneName == ref.gene)
)

# calculate dCt for all plates individually
# this ensures normalizing within a plate only
dCt <- Map(calc_dCt, biorepVars, bioVars.ref, MoreArgs = list(hkg = ref.gene))

# merge all dCt data into one df for easy stats
dCt.merge <- dplyr::bind_rows(dCt, .id = "plate")

# save as table for importing into stats rmd file
write.csv(dCt.merge, file = "prolif_paper_rt-qpcr_dCtMerge.csv")

# calculate summary statistics for dCt
dCt.sumstat <- lapply(dCt, dCt_stats)

# summary stats merged df
dCt.sumstat.merge <- dplyr::bind_rows(dCt.sumstat, .id = "plate")

#########################################################################
### compute ddCt for each bio rep using merged dataframe, normalize to P0 ###
# select reference sample
ref.sample <- "P0"
dCt.sumstat.ctrl <- dCt.sumstat.merge %>%
  dplyr::filter(Timepoint == ref.sample)

# calculate ddCt
ddCt <- calc_ddCt(dCt.sumstat.merge, dCt.sumstat.ctrl)

# calculate ddCt fold change for visualization
## with error calculated from dCt
ddCt.FC.calc.se <- foldChange_ddCt_calcSE(ddCt)

# ##with propagated error from Ct values
# ddCt.FC.prop.se <- foldChange_ddCt_propSE(ddCt)

################################################################
##### check distribution/variance
##### compute statistics -> export to ugly tables
## nice tables are made in "2018-12-06_prolifPaper-stats-tables.Rmd"
library(pwr)
library(lawstat)
library(fitdistrplus)

## test equality of variances
dCt.merge.levene <- levene.test(dCt.merge$dCp, dCt.merge$Timepoint)

# Iterate over each gene once; looping over dCt.merge$GOI directly would repeat
# the same work (and rewrite the same files) for every row of the table.
goi.names <- unique(as.character(dCt.merge$GOI))

# loop over genes and check distribution of data
for (gene in goi.names) {
  dCt.gene <- dplyr::filter(dCt.merge, GOI == gene)
  dCt.gene.lev <- levene.test(dCt.gene$dCp, dCt.gene$Timepoint)
  dCt.gene.desc <- descdist(dCt.gene$dCp,
    discrete = FALSE, boot = 100, method = "sample"
  )
  gene.fit.norm1 <- fitdist(dCt.gene$dCp, "norm")
  gene.plt.dist <- plotdist(dCt.gene$dCp)

  # output to file
  current_date <- date()
  check_dist_file <- paste0(stats_dir, "_", gene, "_dCt_check_dist.txt")
  cat(
    paste0(
      gene,
      "qPCR Check Variance and Distribution of Data\n\n##############################\n###\n",
      current_date
    ),
    file = check_dist_file
  )
  # Levene's test checks homogeneity (equality) of variances across groups.
  cat("\n\n\n\n##############################\n###\n## Levene Test for Homogeneity of Variance \n###\n##############################\n\n\n\n",
    file = check_dist_file, append = TRUE
  )
  capture.output(dCt.gene.lev, file = check_dist_file, append = TRUE)
  # cat("\n\n\n\n##############################\n###\n## Plot Distribution of Data\n###\n##############################\n\n\n\n",
  #     file = check_dist_file, append = TRUE)
  # capture.output(gene.plt.dist, file = check_dist_file, append = TRUE)
  cat("\n\n\n\n##############################\n###\n## Fit Normal Distribution\n###\n##############################\n\n\n\n",
    file = check_dist_file, append = TRUE
  )
  capture.output(gene.fit.norm1, file = check_dist_file, append = TRUE)
}

# loop over each GOI, calculate stats & export to file
for (gene in goi.names) {
  print(gene)
  dCt.gene <- dplyr::filter(dCt.merge, GOI == gene)
  dCt.aov.gene <- aov(dCp ~ Timepoint, data = dCt.gene)
  dCt.aov.gene.res <- summary(dCt.aov.gene)
  dCt.gene.tukey <- TukeyHSD(dCt.aov.gene, conf.level = 0.95)

  # output statistics to file
  out_file <- paste0(stats_dir, "2018-12-5_", gene, "_dCt_stats.txt")
  current_date <- date()
  cat(
    paste0(
      gene,
      "qPCR Statistical Analysis Output\n\n##############################\n###\n",
      current_date
    ),
    file = out_file
  )

  # export ANOVA output
  cat("## One-way ANOVA results\n###\n##############################\n\n\n",
    file = out_file, append = TRUE
  )
  capture.output(dCt.aov.gene.res, file = out_file, append = TRUE)

  # export post hoc results
  cat("\n\n\n\n##############################\n###\n## Post hoc tests\n###\n##############################\n\n\n\n",
    file = out_file, append = TRUE
  )
  cat("\n\n\t2. Tukey HSD test\n\n", file = out_file, append = TRUE)
  capture.output(dCt.gene.tukey, file = out_file, append = TRUE)
}

##############################################
##### make plots
##############################################
library(ggpubr)
library(logspline)
library(ggrepel)
library(scales)
library(cowplot)

# plot with calculated SE
ddCt.FC.calc.plt <- ddCt.FC.calc.se %>%
  dplyr::ungroup() %>%
  dplyr::select(GOI, Timepoint, relExp.FC, relExp.FC.low, relExp.FC.high)

matrix.genes <- c("Col1a2", "Col3a1", "Fmod")

ddCt.FC.calc.plt.matrix <- ddCt.FC.calc.plt %>%
  dplyr::filter(GOI %in% matrix.genes) %>%
  droplevels() %>%
  dplyr::mutate(gene.class = "matrix")
ddCt.FC.calc.plt.matrix$GOI <- factor(
  ddCt.FC.calc.plt.matrix$GOI,
  levels = c("Col1a2", "Col3a1", "Fmod")
)

ddCt.FC.calc.plt.cell <- ddCt.FC.calc.plt %>%
  dplyr::filter(!GOI %in% matrix.genes) %>%
  droplevels() %>%
  dplyr::mutate(
    gene.class = dplyr::if_else(GOI == "Ki67", "proliferation", "tendon")
  )
ddCt.FC.calc.plt.cell$GOI <- factor(
  ddCt.FC.calc.plt.cell$GOI,
  levels = c("Ki67", "Scx", "Mkx")
)

plt.colors <- list(
  prolif.blue = "#20409A",
  tend.green = "#39B54A",
  mat.gray = "#606060"
)
plt.yticks <- c(2^-6, 2^-4, 2^-2, 2^0, 2^2, 2^4, 2^6)
plt.ytick.labs <- c(
  expression(2^-6), expression(2^-4), expression(2^-2), 1,
  expression(2^2), expression(2^4), expression(2^6)
)

# plot using facet wrap
ddCt.all.plt.facet <- dplyr::bind_rows(
  ddCt.FC.calc.plt.cell,
  ddCt.FC.calc.plt.matrix
)
ddCt.all.plt.facet$gene.class <- factor(
  ddCt.all.plt.facet$gene.class,
  levels = c("proliferation", "tendon", "matrix")
)
ddCt.all.plt.facet$GOI <- factor(
  ddCt.all.plt.facet$GOI,
  levels = c("Ki67", "Scx", "Mkx", "Col1a2", "Col3a1", "Fmod")
)

plt.all.facet <- ggplot(
  ddCt.all.plt.facet,
  aes(x = Timepoint, y = relExp.FC, fill = gene.class)
) +
  ggplot2::geom_bar(stat = "identity", position = "dodge", alpha = 0.5) +
  scale_fill_manual(
    values = c(
      plt.colors$prolif.blue, plt.colors$tend.green, plt.colors$mat.gray
    )
  ) +
  facet_wrap(. ~ GOI, ncol = 3) +
  ggplot2::geom_errorbar(
    aes(ymin = relExp.FC.low, ymax = relExp.FC.high),
    width = 0.1, position = position_dodge(0.9), color = "grey5", size = 0.7
  ) +
  ggplot2::scale_y_log10(
    limits = c(0.003, 50), breaks = plt.yticks, labels = plt.ytick.labs
  ) +
  ggplot2::xlab(label = "") +
  ggplot2::ylab(expression("Relative Expression " (2^{~-Delta~Delta ~"C"[T]}))) +
  theme(
    panel.background = element_rect(fill = "transparent"), # bg of the panel
    plot.background = element_rect(fill = "transparent"), # bg of the plot
    strip.background = element_rect(fill = NA),
    strip.text = element_text(colour = "black", face = "italic", size = 16),
    axis.title.y = element_text(size = 18, margin = margin(t = 0, r = 10)),
    axis.text.x = element_text(
      size = 16, angle = 45, margin = margin(t = 10, r = 0, b = 0, l = 0)
    ),
    axis.text.y = element_text(
      size = 14, margin = margin(t = 1, r = 4, b = 1, l = 0)
    ),
    legend.text = element_text(size = 16),
    axis.line = element_line(size = 4, linetype = "solid"),
    axis.ticks = element_line(size = 1, linetype = "solid")
  ) +
  labs(fill = "")

# save plots as .pdf
ggsave(
  filename = "2018-12-3_facetAll_ddCt-FCwStdErr_v2.pdf",
  plot = plt.all.facet,
  path = plots_dir,
  scale = 1,
  bg = "transparent",
  width = 11,
  height = 8.5
)