###########################################################################################################
##calculate raw Ct technical variance and biological variance per gene for each timepoint
##calculate normalized Ct (to ref gene, aka dCt) technical variance and biological variance for each timepoint
##calculate ddCt and propagate error
##plot lfc for samples

#hld 11/28/18
###########################################################################################################

library(dplyr)
library(tidyr)
library(reshape2)
library(broom)
require(tidyverse)

###########################################################################################################
## Functions
#################

# Filter out bad technical replicates and summarise the remaining ones.
#
# Within every GeneName x BioRep x Timepoint group the technical replicates are
# kept as they are when the group's coefficient of variation (CV, in percent)
# is below 1. Otherwise the single replicate lying farthest from the group mean
# is dropped and the rest are kept. Groups whose CV is NA (only one usable
# replicate, so sd() is NA) are removed entirely by the filter.
#
# Args:
#   df.raw: data frame with at least the columns GeneName, BioRep, Timepoint
#           and Cp (Cp may contain NA; it is averaged, so it has to be numeric).
#
# Returns:
#   An ungrouped data frame with one row per GeneName x BioRep x Timepoint and
#   the columns Count, Cp_techAv, Cp_techSD, Cp_techSE and CV. Timepoint is
#   returned as a factor with levels P0, P7, P14, P21, P28, P35; any other
#   value becomes NA.
techrep_filt_stats <- function(df.raw){
  #per-replicate statistics inside each gene / bio rep / timepoint group:
  #mean, standard deviation, CV (%) and absolute distance from the group mean
  techVar <- df.raw %>%
    dplyr::filter(!is.na(Cp)) %>% #missing Ct values cannot be averaged
    dplyr::group_by(GeneName, BioRep, Timepoint) %>%
    dplyr::mutate(techAve = mean(Cp), techSD = sd(Cp), CV = (techSD/techAve)*100,
                  tr_diff = abs(techAve - Cp)) %>%
    dplyr::ungroup()

  techAvg <- techVar %>%
    dplyr::group_by(GeneName, BioRep, Timepoint) %>%
    #which.max() returns a row position inside the group, so it is compared with
    #row_number() and not with the TechRep label; CV >= 1 (not > 1) so that a CV
    #of exactly 1 does not discard the whole group
    dplyr::filter(CV < 1 | (CV >= 1 & row_number() != which.max(tr_diff))) %>%
    #summary statistics on the filtered replicates; this file does not define
    #se(), so the standard error is computed directly as sd / sqrt(n)
    dplyr::summarize(Count = n(), Cp_techAv = mean(Cp), Cp_techSD = sd(Cp),
                     Cp_techSE = Cp_techSD / sqrt(Count)) %>%
    dplyr::mutate(CV = (Cp_techSD/Cp_techAv)*100) %>%
    dplyr::ungroup()

  #relevel factors in timepoint
  techAvg$Timepoint <- factor(techAvg$Timepoint, levels = c("P0", "P7", "P14", "P21", "P28", "P35"))

  #return filtered dataframe with summary stats
  return(techAvg)
}

# Summary statistics on the biological-replicate level.
#
# Adds, for every GeneName x Timepoint group, the standard deviation
# (Cp_bioSD) and standard error (Cp_bioSE) of the per-bio-rep technical
# averages in Cp_techAv. All input rows and columns are kept.
#
# Args:
#   techAv: data frame with the columns GeneName, Timepoint and Cp_techAv.
#
# Returns:
#   The input with Cp_bioSD and Cp_bioSE appended. The result is still grouped
#   by GeneName and Timepoint.
biorep_stats <- function(techAv){
  biorepVar <- techAv %>%
    dplyr::group_by(GeneName, Timepoint) %>%
    #this file does not define se(), so the standard error is computed directly
    #as sd / sqrt(number of rows in the group)
    dplyr::mutate(Cp_bioSD = sd(Cp_techAv), Cp_bioSE = Cp_bioSD / sqrt(n()))

  return(biorepVar)
}

# Join gene-of-interest (GOI) and reference-gene rows and calculate delta Ct.
#
# Args:
#   biorepVar: per-bio-rep summary rows for all genes (columns GeneName, BioRep,
#              Timepoint, Count, Cp_techAv, Cp_techSD, Cp_techSE, Cp_bioSD,
#              Cp_bioSE).
#   ref.df:    rows with the same columns for the reference gene only.
#   hkg:       name of the reference (housekeeping) gene; rows whose GOI equals
#              this name are removed from the result.
#
# Returns:
#   An ungrouped data frame with GOI, BioRep, Timepoint, RefGene, dCp and the
#   propagated uncertainties SD.tech.propGenes, SD.Ct.prop, SE.tech.propGenes
#   and SE.Ct.prop.
calc_dCt <- function(biorepVar, ref.df, hkg){
  #re-join ref gene values with tech rep Ct values and remove ref gene from GOI
  #propagate Ct level uncertainty using standard error propagation techniques
  #NOTE(review): the join key is BioRep alone, which presumably relies on BioRep
  #identifiers being unique across timepoints within a plate - confirm
  dCt <- biorepVar %>%
    dplyr::group_by(GeneName, BioRep, Timepoint) %>%
    dplyr::inner_join(ref.df, by = "BioRep") %>%
    #columns present in both inputs carry .x (GOI side) and .y (reference side)
    dplyr::rename(GOI = GeneName.x, Timepoint = Timepoint.x, n.tech.goi = Count.x, Cp_techAv.goi = Cp_techAv.x,
                  Cp_techSD.goi = Cp_techSD.x, Cp_techSE.goi = Cp_techSE.x,
                  Cp_bioSD.goi = Cp_bioSD.x, Cp_bioSE.goi = Cp_bioSE.x,
                  RefGene = GeneName.y, n.tech.ref = Count.y, Cp_techAv.ref = Cp_techAv.y,
                  Cp_techSD.ref = Cp_techSD.y, Cp_techSE.ref = Cp_techSE.y,
                  Cp_bioSD.ref = Cp_bioSD.y, Cp_bioSE.ref = Cp_bioSE.y) %>%
    #calculations: delta Ct, propagated technical stdev & sem from Ct values for both goi and ref gene
    #(each propagated value is the root of the sum of squares of the two inputs)
    dplyr::mutate(dCp = (Cp_techAv.goi - Cp_techAv.ref),
                  SD.tech.propGenes = sqrt(Cp_techSD.ref^2 + Cp_techSD.goi^2),
                  SE.tech.propGenes = sqrt(Cp_techSE.ref^2 + Cp_techSE.goi^2),
                  SD.Ct.prop = sqrt(Cp_bioSD.ref^2 + Cp_bioSD.goi^2),
                  SE.Ct.prop = sqrt(Cp_bioSE.ref^2 + Cp_bioSE.goi^2)) %>%
    dplyr::select(GOI:Timepoint, RefGene, dCp, SD.tech.propGenes, SD.Ct.prop, SE.tech.propGenes, SE.Ct.prop) %>%
    #drop the reference gene normalised to itself; use the hkg argument so the
    #function does not depend on a global variable
    dplyr::filter(GOI != hkg) %>%
    dplyr::ungroup()

  return(dCt)
}

# Calculate summary statistics for delta Ct.
#
# Args:
#   dCt: data frame with the columns GOI, Timepoint, dCp, SD.Ct.prop,
#        SE.Ct.prop, SD.tech.propGenes and SE.tech.propGenes.
#
# Returns:
#   An ungrouped data frame with one row per GOI x Timepoint holding
#   av_dCp, sd_dCp, se_dCp, CV (|sd / mean| in percent), the group means of
#   SD.Ct.prop and SE.Ct.prop, and SD.comb.tech / SE.comb.tech (root of the
#   summed squares of the per-sample technical SD / SE).
dCt_stats <- function(dCt) {
  dCt.sumstat <- dCt %>%
    dplyr::group_by(GOI, Timepoint) %>%
    #calculations: mean dCt, stdev and sem of dCt values, CV of dCt, propagated stdev and sem to combine tech
    ##and bio level uncertainty
    #this file does not define se(), so the standard error is computed directly
    #as sd / sqrt(number of rows in the group)
    dplyr::summarize(av_dCp = mean(dCp), sd_dCp = sd(dCp), se_dCp = sd_dCp / sqrt(n()),
                     CV = abs(sd_dCp/av_dCp)*100,
                     SD.Ct.prop = mean(SD.Ct.prop), SE.Ct.prop = mean(SE.Ct.prop), #using mean() bc same value for each sample
                     SD.comb.tech = sqrt(sum(SD.tech.propGenes^2)),
                     SE.comb.tech = sqrt(sum(SE.tech.propGenes^2))) %>%
    dplyr::ungroup()

  return(dCt.sumstat)
}

# Calculate delta delta Ct (test dCt minus control dCt) per gene.
#
# Args:
#   dCt.sumstat:      dCt summary statistics for all samples.
#   dCt.sumstat.ctrl: the same kind of data frame restricted to the control
#                     sample only.
#
# Returns:
#   An ungrouped data frame with GOI, Timepoint, ddCp and six error columns.
#   The error columns are taken from the test sample's values; the control-side
#   values are renamed but not used in them.
calc_ddCt <- function(dCt.sumstat, dCt.sumstat.ctrl){
  #pair every row with the control row(s) of the same gene; columns present in
  #both inputs come back suffixed .x (test side) and .y (control side)
  paired <- dplyr::inner_join(dplyr::group_by(dCt.sumstat, GOI),
                              dCt.sumstat.ctrl,
                              by = "GOI")

  #give the suffixed columns explicit test / control names
  labelled <- dplyr::rename(
    paired,
    Timepoint = Timepoint.x, control.tp = Timepoint.y,
    av_dCp.test = av_dCp.x, av_dCp.ctrl = av_dCp.y,
    sd_dCp.test = sd_dCp.x, sd_dCp.ctrl = sd_dCp.y,
    se_dCp.test = se_dCp.x, se_dCp.ctrl = se_dCp.y,
    SD.Ct.prop.test = SD.Ct.prop.x, SD.Ct.prop.ctrl = SD.Ct.prop.y,
    SE.Ct.prop.test = SE.Ct.prop.x, SE.Ct.prop.ctrl = SE.Ct.prop.y,
    SD.comb.tech.test = SD.comb.tech.x, SD.comb.tech.ctrl = SD.comb.tech.y,
    SE.comb.tech.test = SE.comb.tech.x, SE.comb.tech.ctrl = SE.comb.tech.y
  )

  #ddCt itself plus the error estimates carried over from the test sample
  scored <- dplyr::mutate(
    labelled,
    ddCp = (av_dCp.test - av_dCp.ctrl),
    ddCp.calc.sd = sd_dCp.test,
    ddCp.calc.se = se_dCp.test,
    ddCp.prop.sd = SD.Ct.prop.test,
    ddCp.prop.se = SE.Ct.prop.test,
    ddCp.comb.tech.sd = SD.comb.tech.test,
    ddCp.comb.tech.se = SE.comb.tech.test
  )

  #keep only the reporting columns and drop the grouping
  ddCt <- dplyr::ungroup(
    dplyr::select(scored,
                  GOI, Timepoint, ddCp,
                  ddCp.calc.sd, ddCp.calc.se,
                  ddCp.prop.sd, ddCp.prop.se,
                  ddCp.comb.tech.sd, ddCp.comb.tech.se)
  )

  return(ddCt)
}

# Convert ddCt to fold change (2^-ddCt) for visualization, with error limits
# built from ddCp.calc.se (the standard error calculated on delta Ct values).
#
# Args:
#   ddCt: data frame with the columns GOI, ddCp and ddCp.calc.se.
#
# Returns:
#   The input plus upper.lim, lower.lim, relExp.FC, relExp.FC.low and
#   relExp.FC.high, with GOI converted to a factor.
foldChange_ddCt_calcSE <- function(ddCt){
  #upper.lim is ddCp minus one SE: a smaller ddCp gives the larger fold change
  fold.tbl <- dplyr::mutate(
    ddCt,
    upper.lim = (ddCp - ddCp.calc.se),
    lower.lim = (ddCp + ddCp.calc.se),
    relExp.FC = 2^(-ddCp),
    relExp.FC.low = 2^(-lower.lim),
    relExp.FC.high = 2^(-upper.lim)
  )
  fold.tbl$GOI <- factor(fold.tbl$GOI)
  return(fold.tbl)
}

# Convert ddCt to fold change (2^-ddCt) for visualization, with error limits
# built from ddCp.prop.se (the standard error propagated from Ct values).
#
# Args:
#   ddCt: data frame with the columns GOI, ddCp and ddCp.prop.se.
#
# Returns:
#   The input plus upper.lim, lower.lim, relExp.FC, relExp.FC.low and
#   relExp.FC.high, with GOI converted to a factor.
foldChange_ddCt_propSE <- function(ddCt) {
  #upper.lim is ddCp minus one SE: a smaller ddCp gives the larger fold change
  fold.tbl <- dplyr::mutate(
    ddCt,
    upper.lim = (ddCp - ddCp.prop.se),
    lower.lim = (ddCp + ddCp.prop.se),
    relExp.FC = 2^(-ddCp),
    relExp.FC.low = 2^(-lower.lim),
    relExp.FC.high = 2^(-upper.lim)
  )
  fold.tbl$GOI <- factor(fold.tbl$GOI)
  return(fold.tbl)
}

###########################################################################################################

#set paths, wd, and load/clean data
path <- "/Users/hdingwall/Documents/Research/gradSchool/GallowayLab/qPCR_data/LC480/proliferationPaper"
plots_dir <- "/Users/hdingwall/Documents/Research/gradSchool/GallowayLab/qPCR_data/LC480/proliferationPaper/plots/"
stats_dir <- "/Users/hdingwall/Documents/Research/gradSchool/GallowayLab/qPCR_data/LC480/proliferationPaper/stats/"
#the relative file names used for reading and for write.csv() below resolve against this directory
setwd(path)

df1 <- read.table("2017-08-14_timeSeries-plate1_cp_exported04-04-18.txt", header = TRUE, na.strings = "NA")
df2 <- read.table("2017-08-15_timeSeries-plate2_cp_exported04-04-18.txt", header = TRUE, na.strings = "NA")
df3 <- read.table("Ken_P7_Scx-redo_7-19-18_full_cp_data.txt", header = TRUE, na.strings = "NA")

#filter out bad samples known a priori
#had to re-do P7 Scx due to high variance; removed all Tnmd due to spurious results (likely mispriming)
#the conditions are row-wise, so no grouping is needed; missing Cp is tested with is.na()
df1 <- df1 %>%
  dplyr::filter(GeneName != "Tnmd", !is.na(Cp)) %>%
  #filter out bad P7 Scx values
  dplyr::filter(GeneName != "Scx" | Timepoint != "P7")

#Acan and Cx43 are not relevant to this project, remove
df2 <- df2 %>%
  dplyr::filter(GeneName != "Acan", GeneName != "Cx43", !is.na(Cp))
#rename Col3 to Col3a1
df2$GeneName <- dplyr::recode_factor(df2$GeneName, "Col3" = "Col3a1")

#make list of plate dataframes for easy iteration
plates <- list(plate1 = df1, plate2 = df2, plate3 = df3)

#compute stats on technical and biological replicates for all plates
techAvs <- lapply(plates, techrep_filt_stats)
biorepVars <- lapply(techAvs, biorep_stats)

#select reference gene values from all plates
ref.gene <- "Gapdh"
bioVars.ref <- lapply(biorepVars, function(x) dplyr::filter(x, GeneName == ref.gene))

#calculate dCt for all plates individually
#this ensures normalizing within a plate only
#(ref.gene has length one and is recycled by Map() for every plate)
dCt <- Map(calc_dCt, biorepVars, bioVars.ref, ref.gene)

#merge all dCt data into one df for easy stats
dCt.merge <- dplyr::bind_rows(dCt, .id = "plate")

#save as table for importing into stats rmd file
write.csv(dCt.merge, file = "prolif_paper_rt-qpcr_dCtMerge.csv")

#calculate summary statistics for dCt
dCt.sumstat <- lapply(dCt, dCt_stats)

#summary stats merged df
dCt.sumstat.merge <- dplyr::bind_rows(dCt.sumstat, .id = "plate")

#########################################################################
### compute ddCt for each bio rep using merged dataframe, normalize to P0
###

#select reference sample
ref.sample <- "P0"
#control rows only, grouped by gene
dCt.sumstat.ctrl <- dCt.sumstat.merge %>%
  dplyr::filter(Timepoint == ref.sample) %>%
  dplyr::group_by(GOI)

#calculate ddCt
ddCt <- calc_ddCt(dCt.sumstat.merge, dCt.sumstat.ctrl)

#calculate ddCt fold change for visualization
##with error calculated from dCt
ddCt.FC.calc.se <- foldChange_ddCt_calcSE(ddCt)

# ##with propagated error from Ct values
# ddCt.FC.prop.se <- foldChange_ddCt_propSE(ddCt)


################################################################
##### check distribution/variance
##### compute statistics -> export to ugly tables
##nice tables are made in "2018-12-06_prolifPaper-stats-tables.Rmd"

library(FSA)
library(pwr)
library(lawstat)
library(fitdistrplus)

##test equality of variances
dCt.merge.levene <- levene.test(dCt.merge$dCp, dCt.merge$Timepoint)

#each gene of interest once; looping over dCt.merge$GOI directly would repeat
#the whole analysis (and rewrite the same files) once per data row
goi.names <- unique(as.character(dCt.merge$GOI))

#loop over genes and check distribution of data
for (gene in goi.names){
  dCt.gene <- dCt.merge %>%
    dplyr::group_by(GOI) %>%
    dplyr::filter(GOI == gene)
  dCt.gene.lev <- levene.test(dCt.gene$dCp, dCt.gene$Timepoint)
  dCt.gene.desc <- descdist(dCt.gene$dCp, discrete = FALSE, boot = 100, method = "sample")
  gene.fit.norm1 <- fitdist(dCt.gene$dCp, "norm")
  gene.plt.dist <- plotdist(dCt.gene$dCp)
  
  #output to file (the first cat() has no append, so the file is started afresh)
  current_date <- date()
  check_dist_file <- paste0(stats_dir, "_", gene, "_dCt_check_dist.txt")
  cat(paste0(gene, "qPCR Check Variance and Distribution of Data\n\n##############################\n###\n",
             current_date),
      file = check_dist_file)
  cat("\n\n\n\n##############################\n###\n## Levene Test for Normality of Variance \n###\n##############################\n\n\n\n",
      file = check_dist_file, append = TRUE)
  capture.output(dCt.gene.lev, file = check_dist_file, append = TRUE)
  # cat("\n\n\n\n##############################\n###\n## Plot Distribution of Data\n###\n##############################\n\n\n\n",
  #     file = check_dist_file, append = TRUE)
  # capture.output(gene.plt.dist, file = check_dist_file, append = TRUE)
  cat("\n\n\n\n##############################\n###\n## Fit Normal Distribution\n###\n##############################\n\n\n\n",
      file = check_dist_file, append = TRUE)
  capture.output(gene.fit.norm1, file = check_dist_file, append = TRUE)
}

# loop over each GOI, calculate stats & export to file
for (gene in goi.names) {
  print(gene)
  dCt.gene <- dplyr::filter(dCt.merge, GOI == gene)
  #one-way ANOVA of dCt across timepoints, followed by Tukey HSD post hoc test
  dCt.aov.gene <- aov(dCp ~ Timepoint, data = dCt.gene)
  dCt.aov.gene.res <- summary(dCt.aov.gene)
  dCt.gene.tukey <- TukeyHSD(dCt.aov.gene, conf.level = 0.95)
  
  #output statistics to file (the first cat() has no append, so the file is started afresh)
  out_file <- paste0(stats_dir, "2018-12-5_", gene, "_dCt_stats.txt")
  current_date <- date()
  cat(paste0(gene, "qPCR Statistical Analysis Output\n\n##############################\n###\n", current_date),
      file = out_file)
  
  #export ANOVA output
  cat("## One-way ANOVA results\n###\n##############################\n\n\n", file = out_file, append = TRUE)
  capture.output(dCt.aov.gene.res, file = out_file, append = TRUE)
  
  #export post hoc results
  cat("\n\n\n\n##############################\n###\n## Post hoc tests\n###\n##############################\n\n\n\n",
      file = out_file, append = TRUE)
  cat("\n\n\t2. Tukey HSD test\n\n", file = out_file, append = TRUE)
  capture.output(dCt.gene.tukey, file = out_file, append = TRUE)
}


##############################################
##### make plots
##############################################
library(ggpubr)
library(logspline)
library(ggrepel)
library(scales)
library(cowplot)

#plot with calculated SE
#keep only the columns the plot needs, grouped by gene
ddCt.FC.calc.plt <- dplyr::select(dplyr::group_by(ddCt.FC.calc.se, GOI),
                                  GOI, Timepoint, relExp.FC, relExp.FC.low, relExp.FC.high)

matrix.genes <- c("Col1a2", "Col3a1", "Fmod")

#matrix genes, all labelled with the class "matrix"
ddCt.FC.calc.plt.matrix <- dplyr::filter(ddCt.FC.calc.plt, GOI %in% matrix.genes)
ddCt.FC.calc.plt.matrix <- droplevels(ddCt.FC.calc.plt.matrix)
ddCt.FC.calc.plt.matrix <- dplyr::mutate(ddCt.FC.calc.plt.matrix, gene.class = "matrix")
ddCt.FC.calc.plt.matrix$GOI <- factor(ddCt.FC.calc.plt.matrix$GOI, levels = matrix.genes)

#all remaining genes: Ki67 is labelled "proliferation", everything else "tendon"
ddCt.FC.calc.plt.cell <- dplyr::group_by(ddCt.FC.calc.plt, GOI)
ddCt.FC.calc.plt.cell <- dplyr::filter(ddCt.FC.calc.plt.cell, !GOI %in% matrix.genes)
ddCt.FC.calc.plt.cell <- droplevels(ddCt.FC.calc.plt.cell)
ddCt.FC.calc.plt.cell <- dplyr::mutate(ddCt.FC.calc.plt.cell,
                                       gene.class = if_else(GOI == "Ki67", "proliferation", "tendon"))
ddCt.FC.calc.plt.cell$GOI <- factor(ddCt.FC.calc.plt.cell$GOI, levels = c("Ki67", "Scx", "Mkx"))

#fill colours and log2-spaced y axis ticks (2^-6 ... 2^6 in steps of 2^2)
plt.colors <- list(prolif.blue = "#20409A", tend.green = "#39B54A", mat.gray = "#606060")
plt.yticks <- 2^seq(-6, 6, by = 2)
plt.ytick.labs <- c(expression(2^-6), expression(2^-4),
                    expression(2^-2), 1, expression(2^2),
                    expression(2^4), expression(2^6))

#plot using facet wrap
#stack both gene sets and fix the order of classes (fill legend) and genes (panels)
ddCt.all.plt.facet <- rbind(ddCt.FC.calc.plt.cell, ddCt.FC.calc.plt.matrix)
ddCt.all.plt.facet$gene.class <- factor(ddCt.all.plt.facet$gene.class,
                                        levels = c("proliferation", "tendon", "matrix"))
ddCt.all.plt.facet$GOI <- factor(ddCt.all.plt.facet$GOI,
                                 levels = c("Ki67", "Scx", "Mkx", matrix.genes))

plt.all.facet <- ggplot(ddCt.all.plt.facet,
                        aes(x = Timepoint, y = relExp.FC, fill = gene.class)) +
  #bars first, error bars drawn on top of them
  ggplot2::geom_bar(stat = "identity", position = "dodge", alpha = 0.5) +
  ggplot2::geom_errorbar(aes(ymin = relExp.FC.low, ymax = relExp.FC.high), width = 0.1,
                         position = position_dodge(0.9), color = "grey5", size = 0.7) +
  #unnamed colour vector, in the order proliferation, tendon, matrix
  scale_fill_manual(values = unlist(plt.colors, use.names = FALSE)) +
  facet_wrap(. ~ GOI, ncol = 3) +
  ggplot2::scale_y_log10(limits = c(0.003, 50),
                         breaks = plt.yticks,
                         labels = plt.ytick.labs) +
  labs(x = "",
       y = expression("Relative Expression " (2^{~-Delta~Delta ~"C"[T]})),
       fill = "") +
  theme(panel.background = element_rect(fill = "transparent"), # bg of the panel
        plot.background = element_rect(fill = "transparent"), # bg of the plot
        strip.background = element_rect(fill = NA),
        strip.text = element_text(colour = 'black', face = 'italic', size = 16),
        axis.title.y = element_text(size = 18, margin = margin(t = 0, r = 10)),
        axis.text.x = element_text(size = 16, angle = 45, margin = margin(t = 10, r = 0, b = 0, l = 0)),
        axis.text.y = element_text(size = 14, margin = margin(t = 1, r = 4, b = 1, l = 0)),
        legend.text = element_text(size = 16),
        axis.line = element_line(size = 4, linetype = "solid"),
        axis.ticks = element_line(size = 1, linetype = "solid"))

# save plots as .pdf
ggsave(filename = "2018-12-3_facetAll_ddCt-FCwStdErr_v2.pdf",
       plot = plt.all.facet,
       path = plots_dir,
       scale = 1,
       bg = "transparent",
       width = 11, height = 8.5)