remove(list = ls())


# R packages utilized
library(tidyverse)
library(MASS)
library(epiR)
library(lubridate)
library(readxl, quietly = TRUE)
library(dplyr)
library(ggplot2)
library(reshape2)
library(hrbrthemes)
library(data.table, quietly = TRUE)
library(foreach, quietly = TRUE)
library(doBy)
library(ggpmisc)
library(gridExtra)
library(ggpubr)
library(MatchIt)
library(gtsummary)
library(lme4)
library(sjPlot)
library(stargazer)
library(cAIC4)
library(MuMIn)
library(fmsb)
library(jtools)
library(broom)
library(ggstance)
library(pROC)
library(InformationValue)
library(caret)

# load functions
cor_est <- function(x,y,...){
  obj <- cor.test(x,y,...)
  val <- obj$estimate
  return(round(as.numeric(val),digits = 3))
}
cor_pval <- function(x,y,...){
  obj <- cor.test(x,y,...)
  pval <- obj$p.value
  return(as.numeric(pval))
}
corr_fun <- function(dat,outcomes,new_markers,cohort,...){
    dat_est <- sapply(outcomes,function(ychar){
    sapply(new_markers,function(xchar){
      cor_est(dat[[xchar]],dat[[ychar]],method="spearman")
    })
  })
  
  
  dat_pval <- sapply(outcomes,function(ychar){sapply(new_markers,function(xchar){
    cor_pval(dat[[xchar]],dat[[ychar]],method="spearman")})})
  
  
  dat_pval <- as_tibble(dat_pval) %>% 
    mutate_all(p.adjust,method="fdr") %>%
    mutate(marker = new_markers,type=paste("p-value")) %>%  
    select(marker,type,everything()) 
  
  dat_est <- as_tibble(dat_est) %>% 
    mutate(marker = new_markers,type=paste("spearman")) %>% 
    select(marker,type,everything())
  
  dat_results <- bind_rows(dat_est,dat_pval)
  dat_results <- dat_results %>% 
    gather(key,value,outcomes) 
  
  
  dat_results <- dat_results %>% 
    mutate(key = factor(key,levels=unique(dat_results$key))) %>% 
    mutate(type=factor(type,levels=c(paste("spearman"),paste("p-value")))) %>%  
    arrange(marker,key,type) %>%    
    mutate(key = paste(as.character(key),type)) %>%
    select(-type) 
  
  dat_results <- dat_results %>% 
    mutate(key = factor(key,levels=unique(dat_results$key))) %>% 
    spread(key,value) 
  
  
  return(dat_results)
}


radarchart2 <- function (df, axistype = 0, seg = 4, pty = 16, pcol = 1:8, plty = 1:6, 
                         plwd = 1, pdensity = NULL, pangle = 45, pfcol = NA, cglty = 3, 
                         cglwd = 1, cglcol = "navy", axislabcol = "blue", vlabcol = "black", title = "", 
                         maxmin = TRUE, na.itp = TRUE, centerzero = FALSE, vlabels = NULL, 
                         vlcex = NULL, caxislabels = NULL, calcex = NULL, paxislabels = NULL, 
                         palcex = NULL, ...) 
{
  if (!is.data.frame(df)) {
    cat("The data must be given as dataframe.\n")
    return()
  }
  if ((n <- length(df)) < 3) {
    cat("The number of variables must be 3 or more.\n")
    return()
  }
  if (maxmin == FALSE) {
    dfmax <- apply(df, 2, max)
    dfmin <- apply(df, 2, min)
    df <- rbind(dfmax, dfmin, df)
  }
  plot(c(-1.2, 1.2), c(-1.2, 1.2), type = "n", frame.plot = FALSE, 
       axes = FALSE, xlab = "", ylab = "", main = title, asp = 1, 
       ...)
  theta <- seq(90, 450, length = n + 1) * pi/180
  theta <- theta[1:n]
  xx <- cos(theta)
  yy <- sin(theta)
  CGap <- ifelse(centerzero, 0, 1)
  for (i in 0:seg) {
    polygon(xx * (i + CGap)/(seg + CGap), yy * (i + CGap)/(seg + 
                                                             CGap), lty = cglty, lwd = cglwd, border = cglcol)
    if (axistype == 1 | axistype == 3) 
      CAXISLABELS <- paste(i/seg * 100, "(%)")
    if (axistype == 4 | axistype == 5) 
      CAXISLABELS <- sprintf("%3.2f", i/seg)
    if (!is.null(caxislabels) & (i < length(caxislabels))) 
      CAXISLABELS <- caxislabels[i + 1]
    if (axistype == 1 | axistype == 3 | axistype == 4 | 
        axistype == 5) {
      if (is.null(calcex)) 
        text(-0.05, (i + CGap)/(seg + CGap), CAXISLABELS, 
             col = axislabcol)
      else text(-0.05, (i + CGap)/(seg + CGap), CAXISLABELS, 
                col = axislabcol, cex = calcex)
    }
  }
  if (centerzero) {
    arrows(0, 0, xx * 1, yy * 1, lwd = cglwd, lty = cglty, 
           length = 0, col = cglcol)
  }
  else {
    arrows(xx/(seg + CGap), yy/(seg + CGap), xx * 1, yy * 
             1, lwd = cglwd, lty = cglty, length = 0, col = cglcol)
  }
  PAXISLABELS <- df[1, 1:n]
  if (!is.null(paxislabels)) 
    PAXISLABELS <- paxislabels
  if (axistype == 2 | axistype == 3 | axistype == 5) {
    if (is.null(palcex)) 
      text(xx[1:n], yy[1:n], PAXISLABELS, col = axislabcol)
    else text(xx[1:n], yy[1:n], PAXISLABELS, col = axislabcol, 
              cex = palcex)
  }
  VLABELS <- colnames(df)
  if (!is.null(vlabels)) 
    VLABELS <- vlabels
  if (is.null(vlcex)) 
    text(xx * 1.2, yy * 1.2, VLABELS, col = vlabcol)
  else text(xx * 1.2, yy * 1.2, VLABELS, cex = vlcex, col = vlabcol)
  series <- length(df[[1]])
  SX <- series - 2
  if (length(pty) < SX) {
    ptys <- rep(pty, SX)
  }
  else {
    ptys <- pty
  }
  if (length(pcol) < SX) {
    pcols <- rep(pcol, SX)
  }
  else {
    pcols <- pcol
  }
  if (length(plty) < SX) {
    pltys <- rep(plty, SX)
  }
  else {
    pltys <- plty
  }
  if (length(plwd) < SX) {
    plwds <- rep(plwd, SX)
  }
  else {
    plwds <- plwd
  }
  if (length(pdensity) < SX) {
    pdensities <- rep(pdensity, SX)
  }
  else {
    pdensities <- pdensity
  }
  if (length(pangle) < SX) {
    pangles <- rep(pangle, SX)
  }
  else {
    pangles <- pangle
  }
  if (length(pfcol) < SX) {
    pfcols <- rep(pfcol, SX)
  }
  else {
    pfcols <- pfcol
  }
  for (i in 3:series) {
    xxs <- xx
    yys <- yy
    scale <- CGap/(seg + CGap) + (df[i, ] - df[2, ])/(df[1, 
                                                         ] - df[2, ]) * seg/(seg + CGap)
    if (sum(!is.na(df[i, ])) < 3) {
      cat(sprintf("[DATA NOT ENOUGH] at %d\n%g\n", i, 
                  df[i, ]))
    }
    else {
      for (j in 1:n) {
        if (is.na(df[i, j])) {
          if (na.itp) {
            left <- ifelse(j > 1, j - 1, n)
            while (is.na(df[i, left])) {
              left <- ifelse(left > 1, left - 1, n)
            }
            right <- ifelse(j < n, j + 1, 1)
            while (is.na(df[i, right])) {
              right <- ifelse(right < n, right + 1, 
                              1)
            }
            xxleft <- xx[left] * CGap/(seg + CGap) + 
              xx[left] * (df[i, left] - df[2, left])/(df[1, 
                                                         left] - df[2, left]) * seg/(seg + CGap)
            yyleft <- yy[left] * CGap/(seg + CGap) + 
              yy[left] * (df[i, left] - df[2, left])/(df[1, 
                                                         left] - df[2, left]) * seg/(seg + CGap)
            xxright <- xx[right] * CGap/(seg + CGap) + 
              xx[right] * (df[i, right] - df[2, right])/(df[1, 
                                                            right] - df[2, right]) * seg/(seg + 
                                                                                            CGap)
            yyright <- yy[right] * CGap/(seg + CGap) + 
              yy[right] * (df[i, right] - df[2, right])/(df[1, 
                                                            right] - df[2, right]) * seg/(seg + 
                                                                                            CGap)
            if (xxleft > xxright) {
              xxtmp <- xxleft
              yytmp <- yyleft
              xxleft <- xxright
              yyleft <- yyright
              xxright <- xxtmp
              yyright <- yytmp
            }
            xxs[j] <- xx[j] * (yyleft * xxright - yyright * 
                                 xxleft)/(yy[j] * (xxright - xxleft) - 
                                            xx[j] * (yyright - yyleft))
            yys[j] <- (yy[j]/xx[j]) * xxs[j]
          }
          else {
            xxs[j] <- 0
            yys[j] <- 0
          }
        }
        else {
          xxs[j] <- xx[j] * CGap/(seg + CGap) + xx[j] * 
            (df[i, j] - df[2, j])/(df[1, j] - df[2, 
                                                 j]) * seg/(seg + CGap)
          yys[j] <- yy[j] * CGap/(seg + CGap) + yy[j] * 
            (df[i, j] - df[2, j])/(df[1, j] - df[2, 
                                                 j]) * seg/(seg + CGap)
        }
      }
      if (is.null(pdensities)) {
        polygon(xxs, yys, lty = pltys[i - 2], lwd = plwds[i - 
                                                            2], border = pcols[i - 2], col = pfcols[i - 
                                                                                                      2])
      }
      else {
        polygon(xxs, yys, lty = pltys[i - 2], lwd = plwds[i - 
                                                            2], border = pcols[i - 2], density = pdensities[i - 
                                                                                                              2], angle = pangles[i - 2], col = pfcols[i - 
                                                                                                                                                         2])
      }
      points(xx * scale, yy * scale, pch = ptys[i - 2], 
             col = pcols[i - 2])
    }
  }
}
####################################################################################################################
####################################################################################################################
####################################################################################################################

# load clean dataset

data_sub <-  read_csv("./input/data_all_split.csv")
data_sub <- data_sub %>% 
  filter(cohort %in% c("train","val"))

data_sub$log_nfl_csf_age_adj <- log((data_sub$nfl_csf_age_adj - min(data_sub$nfl_csf_age_adj,rm.na=TRUE))+1,10)

data_sub$log_nfl_serum_age_adj <- log((data_sub$nfl_serum_age_adj - min(data_sub$nfl_serum_age_adj,rm.na=TRUE))+1,10)

# generate cel column

data_sub$cel <- log(data_sub$cel_exact_number +1)

# generate new and enlarging lesions colum
data_sub$t2_new_large <- log(data_sub$t2_lesion_new_and_enlarging +1)

# generate medulla_ll_atrophy column

data_sub$medulla_ll_atrophy <- data_sub$medulla_ll + data_sub$medulla_atrophy

data_sub$sdmt_age <- data_sub$sdmt / data_sub$age

# generate panel11_17 column

data_sub$panel_11_17 <- data_sub$panel_11 + data_sub$panel_17


# isolate training cohort

data_sub_train <- data_sub %>%
  filter(cohort=="train")

# isolate validation cohort

data_sub_val <- data_sub %>%
  filter(cohort=="val")

# plot HD vs age


pred <- "nfl"
obs <- "age"

pred_nice <- "Log10 NFL"
obs_nice <- "Age"

for(i in 1:length(pred)){
  # i <- 2
  
  
  dfr <- data_sub %>% 
    filter(diagnosis=="Healthy Donor")
  
  dfr_s <- dfr %>% 
    select(age,log_nfl_serum,patientcode)
  names(dfr_s)[2] <- "nfl"
  dfr_s$source <- "serum"
  
  dfr_c <- dfr %>% 
    select(age,log_nfl_csf,patientcode)
  names(dfr_c)[2] <- "nfl"
  dfr_c$source <- "csf"
  
  df <- rbind(dfr_s,dfr_c)
  
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(aes(color=source),alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
   
    ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
           max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_color_manual(values = c("#0171C0","#C00000"),
                       name="NFL source",
                       breaks = c("csf","serum"),
                       label = c("CSF","Serum"))+
    
    stat_cor(aes(label=..rr.label..,color=source), size=4,label.y.npc = .99,label.x.npc = 0)+
    stat_cor(aes(label=..p.label..,color=source), size=4,label.y.npc = .99, label.x.npc = 0.4)+
    geom_smooth(method="lm",se=TRUE, size=1, aes(color=source)) +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "black"),
          axis.title.y = element_text(colour = "black"))
  
  assign(paste("p",i,sep = ""),p)
}

p1

hd_serum <- lm(dfr$log_nfl_serum~dfr$age)
hd_csf <- lm(dfr$log_nfl_csf~dfr$age)

summary(hd_serum)
summary(hd_csf)

ggsave(plot = p1,filename = "./output/age_vs_nfl_hd.png", 
       width =5,height = 3.5,units = "in",dpi = 300)

# plot MS vs age


pred <- "nfl"
obs <- "age"

pred_nice <- "Log10 NFL"
obs_nice <- "Age"

for(i in 1:length(pred)){
  # i <- 2
  
  
  dfr <- data_sub_train %>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  
  dfr_s <- dfr %>% 
    select(age,log_nfl_serum,patientcode)
  names(dfr_s)[2] <- "nfl"
  dfr_s$source <- "serum"
  
  dfr_c <- dfr %>% 
    select(age,log_nfl_csf,patientcode)
  names(dfr_c)[2] <- "nfl"
  dfr_c$source <- "csf"
  
  df <- rbind(dfr_s,dfr_c)
  
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(aes(color=source),alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    
    ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
           max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    
    stat_cor(aes(label=..rr.label..,color=source), size=4,label.y.npc = .99,label.x.npc = 0)+
    stat_cor(aes(label=..p.label..,color=source), size=4,label.y.npc = .99, label.x.npc = 0.4)+
    geom_smooth(method="lm",se=TRUE, size=1, aes(color=source)) +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#0171C0"),
          axis.title.y = element_text(colour = "#C00000"))
  
  assign(paste("p",i,sep = ""),p)
}

p1

hd_serum <- lm(dfr$log_nfl_serum~dfr$age)
hd_csf <- lm(dfr$log_nfl_csf~dfr$age)

summary(hd_serum)
summary(hd_csf)


#plot CSF vs Serum NFL 

pred <- "log_nfl_serum"
obs <- "log_nfl_csf"

pred_nice <- "Log10 sNFL"
obs_nice <- "Log10 cNFL"

for(i in 1:length(pred)){
  # i <- 2
  test_train <- data_sub_train
  confound <- c("age","bmi_height","bmi_weight","estbloodvolume","bmi","serum_alt","serum_ast",
                "serum_creatinine","serum_egfr_calc","serum_ap","serum_bun")
  
  # generate clean dataset - no NAs

  
  test_train <- test_train[complete.cases(test_train[confound]),]
  
  
  df <- test_train
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+.03*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
           max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#008F00",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#008F00",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#008F00",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1, color="#008F00") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#0171C0"),
          axis.title.y = element_text(colour = "#C00000"))
  
  assign(paste("p",i,sep = ""),p)
}

ggsave(plot = p1,filename = "./output/log_nfl_csf_vs_log_nfl_serum_training_cohort.png", 
       width =3,height = 2.5,units = "in",dpi = 300)


# generate residuals

nfl_orig_mod <- lm(log_nfl_serum~log_nfl_csf,data = test_train)

#generate dummy dataset for dat_sub_train
test <- test_train

test$residuals <-nfl_orig_mod$residuals 

# plot residuals
#plot CSF vs Serum NFL 

pred <- "residuals"
obs <- "log_nfl_csf"

pred_nice <- "NFL residuals"
obs_nice <- "Log10 cNFL"

for(i in 1:length(pred)){
  # i <- 2
  df <- test
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+.03*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
           max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="black",label.y.npc = .99)+
    stat_cor(aes(label=..rr.label..), size=4,color="black",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="black",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="black") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#0171C0"),
          axis.title.y = element_text(colour = "#C55A11"))
  
  assign(paste("p",i,sep = ""),p)
}

ggsave(plot = p1,filename = "./output/log_nfl_csf_vs_nfl_residuals_training_cohort.png", 
       width =3,height = 2.5,units = "in",dpi = 300)


#plot CSF vs Serum NFL 

pred <- "residuals"
obs <- "age"

pred_nice <- "NFL residuals"
obs_nice <- "Age"

for(i in 1:length(pred)){
  # i <- 2
  df <- test
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+.03*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
           max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="black",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="black",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="black",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="black") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "red"),
          axis.title.y = element_text(colour = "#C55A11"))
  
  assign(paste("p",i,sep = ""),p)
}

ggsave(plot = p1,filename = "./output/age_vs_residuals_training_cohort.png", 
       width =3,height = 2.5,units = "in",dpi = 300)


#######################################################
#######################################################
#
# demographic data (samples with confounders used for model generation)
#
#######################################################
#######################################################


# TRAINING COHORT
training_demo <- data_sub %>% 
  filter(cohort=="train")
training_demo <- training_demo[complete.cases(training_demo[confound]),]

length(unique(training_demo$patientcode))

training <- training_demo %>% 
  # filter(cohort=="train") %>% 
  arrange(.,age)

# get number of samples per patient

x <- tibble::enframe(table(training$patientcode),name = "patientcode",value = "nr_samples")

# get first sample per patient

training_first <- training %>% 
  dplyr::group_by(patientcode) %>% 
  filter(row_number()==1)

training_first <- merge(training_first,x,by="patientcode")

#get couunts for 
training_first %>% 
  dplyr::group_by(diagnosis) %>% 
  dplyr::group_by(gender,.add=TRUE) %>% 
  count()

training_first %>% 
  dplyr::group_by(diagnosis) %>% 
  summarise_at(.,.vars = "age",.funs = c(mean,sd,min,max))

training_first %>% 
  dplyr::group_by(diagnosis) %>% 
  dplyr::group_by(ethnicity,.add=TRUE) %>% 
  count()

tab <- training_first %>% 
  dplyr::group_by(diagnosis) %>% 
  dplyr::group_by(race,.add=TRUE) %>% 
  count()

training_first %>% 
  dplyr::group_by(diagnosis) %>% 
  summarise_at(.,.vars = "nr_samples",.funs = c(sum,mean,sd,min,max))


training_demo_group <- training_demo %>% 
  group_by(diagnosis)

summarize_at(.tbl = training_demo_group,.vars = "nfl_csf",.funs = mean)
summarize_at(.tbl = training_demo_group,.vars = "nfl_csf",.funs = sd)

summarize_at(.tbl = training_demo_group,.vars = "nfl_serum",.funs = mean)
summarize_at(.tbl = training_demo_group,.vars = "nfl_serum",.funs = sd)

# VALIDATION COHORT

validation_demo <- data_sub %>% 
  filter(cohort=="val")
validation_demo <- validation_demo[complete.cases(validation_demo[confound]),]

length(unique(validation_demo$patientcode))

validation <- validation_demo %>% 
  # filter(cohort=="val") %>% 
  arrange(.,age)

# get number of samples per patient

x <- tibble::enframe(table(validation$patientcode),name = "patientcode",value = "nr_samples")

# get first sample per patient

validation_first <- validation %>% 
  dplyr::group_by(patientcode) %>% 
  filter(row_number()==1)

validation_first <- merge(validation_first,x,by="patientcode")

#get couunts for 
validation_first %>% 
  dplyr::group_by(diagnosis) %>% 
  dplyr::group_by(gender,.add=TRUE) %>% 
  count()


validation_first %>% 
  dplyr::group_by(diagnosis) %>% 
  dplyr::group_by(ethnicity,.add=TRUE) %>% 
  count()

tab <- validation_first %>% 
  dplyr::group_by(diagnosis) %>% 
  dplyr::group_by(race,.add=TRUE) %>% 
  count()


validation_first %>% 
  dplyr::group_by(diagnosis) %>% 
  summarise_at(.,.vars = "age",.funs = c(mean,sd,min,max))


validation_first %>% 
  dplyr::group_by(diagnosis) %>% 
  summarise_at(.,.vars = "nr_samples",.funs = c(sum,mean,sd,min,max))


validation_demo_group <- validation_demo %>% 
  group_by(diagnosis)

summarize_at(.tbl = validation_demo_group,.vars = "nfl_csf",.funs = mean)
summarize_at(.tbl = validation_demo_group,.vars = "nfl_csf",.funs = sd)

summarize_at(.tbl = validation_demo_group,.vars = "nfl_serum",.funs = mean)
summarize_at(.tbl = validation_demo_group,.vars = "nfl_serum",.funs = sd)


################################################################################################
################################################################################################
#
#
#    generate MLR model of Serum NFL in training cohort
#
#
################################################################################################
################################################################################################


# selected varaibles for building a model
vars_model <- c("log_nfl_csf","log_nfl_serum","age","bmi_height","bmi_weight","estbloodvolume","bmi","serum_alt","serum_ast",
                "serum_creatinine","serum_egfr_calc","serum_ap","serum_bun")

confound <- c("age","bmi_height","bmi_weight","estbloodvolume","bmi","serum_alt","serum_ast",
              "serum_creatinine","serum_egfr_calc","serum_ap","serum_bun")

# generate clean dataset - no NAs

data_sub_train_clean <- data_sub_train

data_sub_train_clean <- data_sub_train_clean[complete.cases(data_sub_train[confound]),]


#generate simple LM model in the clean cohort

mod_train_clean <- lm(log_nfl_serum~log_nfl_csf,data_sub_train_clean)
summary(mod_train_clean)

#build the model:

fit <- lm(log_nfl_serum~log_nfl_csf + age + bmi + bmi_height + bmi_weight + estbloodvolume + serum_alt +
            serum_ast + serum_ap + serum_bun + serum_creatinine + serum_egfr_calc, data_sub_train_clean)

summary(fit)
#stepwise regression
step <- stepAIC(fit, direction="both")

step$anova # display results

best_fit <- lm(log_nfl_serum ~ log_nfl_csf + age + bmi_weight + serum_ap + serum_bun + 
                 serum_creatinine, data_sub_train_clean)

summary(best_fit)

saveRDS(best_fit,"./output/best_fit_equation.RDS")


age_bmi_fit <- lm(log_nfl_serum ~ log_nfl_csf + age + bmi, data_sub_train_clean)


######
######
#### predict CSF from serum
# generate residuals in the CLEAN cohort

mod_train_clean_rev <- lm(log_nfl_csf~log_nfl_serum,data_sub_train_clean)

summary(mod_train_clean_rev)


#build the model:

fit_rev <- lm(log_nfl_csf~log_nfl_serum + age + bmi + bmi_height + bmi_weight + estbloodvolume + serum_alt +
                serum_ast + serum_ap + serum_bun + serum_creatinine + serum_egfr_calc, data_sub_train_clean)

#stepwise regression
step_rev <- stepAIC(fit_rev, direction="both")

step_rev$anova # display results

best_fit_rev <- lm(log_nfl_csf ~ log_nfl_serum + age + bmi + bmi_height + estbloodvolume + 
                     serum_alt + serum_ast + serum_bun + serum_egfr_calc, data_sub_train_clean)


summary(best_fit_rev)


#plot lolipop plot for t-values variables 

tidy_fit <- broom::tidy(fit)
tidy_fit <- tidy_fit[-c(1:2),]
tidy_fit <- tidy_fit %>% 
  arrange(desc(abs(statistic)))

t_values <- tidy_fit$statistic
t_names <- tidy_fit$term

data <- as.data.frame(matrix(tidy_fit$statistic,ncol=11))
colnames(data) <- c("Age","BUN","AP","Creatinine","Height","Weight","BMI","Est Blood Vol","ALT","AST","eGFR") 

data <- rbind(rep(max(t_values),11),rep(0,11),data)


#transpose dataframe
data_t <- t(data)
data_t <- data_t[,3]
data_t_names <- names(data_t)
data_t <- tibble::enframe(data_t,name = NULL,value = "statistic")
data_t <- cbind(data_t_names,data_t)
data_t <- data_t %>%
  arrange(desc(data_t_names))

data_t[12:15,] <- NA

data_t[12,1] <- "x"
data_t[13,1] <- "y"
data_t[14,1] <- "z"
data_t[15,1] <- "w"
data_names <- data_t$data_t_names


tidy_fit <- broom::tidy(best_fit)
tidy_fit <- tidy_fit[-c(1:2),]
tidy_fit <- tidy_fit %>% 
  select(term,statistic)
tidy_fit$term <- c("Age","Weight","AP","BUN","Creatinine")

names(tidy_fit)[1] <- "data_t_names"
names(tidy_fit)[2] <- "statistics_new"

data_t <- merge(data_t,tidy_fit,by="data_t_names",all.x = TRUE)

rownames(data_t)
#lolipop plot
t <- ggplot(data_t, aes(y=statistic, x=data_t_names)) +
  geom_segment( aes(y=0, yend=statistic,x=data_t_names, xend=data_t_names), color="magenta") +
  geom_point( color="magenta", size=3, alpha=0.6) +
  theme_light() +
  scale_x_discrete(limits = c("Age","x","y","eGFR","Creatinine","BUN","AST","AP","ALT","z","w","Weight","Height","Est Blood Vol","BMI")) +
  geom_hline(yintercept = 0,color = "black", linetype="dashed",size=.25) +
  coord_flip() +
  ylab("abs(t-statistics)") +
  xlab("")+
  ylim(c(-2.5,7.1))+
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.x = element_blank(),
    axis.text.y = element_text(margin = margin(r=15),size = 12, face = c("bold","plain","plain","plain","bold","bold","plain","bold","plain","plain","plain","bold","plain","plain","plain" ))
    
  )
t
ggsave(plot = ggarrange(t,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/vars_lolipop_MLM.png", 
       width =2.7,height = 3,units = "in",dpi = 300)

t <- ggplot(data_t, aes(y=statistics_new, x=data_t_names)) +
  geom_segment( aes(y=0, yend=statistics_new,x=data_t_names, xend=data_t_names), color="purple") +
  geom_point( color="purple", size=3, alpha=0.6) +
  theme_light() +
  scale_x_discrete(limits = c("Age","x","y","eGFR","Creatinine","BUN","AST","AP","ALT","z","w","Weight","Height","Est Blood Vol","BMI")) +
  geom_hline(yintercept = 0,color = "black", linetype="dashed",size=.25) +
  coord_flip() +
  ylab("abs(t-statistics)") +
  xlab("")+
  ylim(c(-8.7,8.2))+
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.x = element_blank(),
    axis.text.y = element_text(margin = margin(r=15),size = 12, face = c("bold","plain","plain","plain","bold","bold","plain","bold","plain","plain","plain","bold","plain","plain","plain" ))
    
  )
t
ggsave(plot = ggarrange(t,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/vars_lolipop_MLM_new.png", 
       width =2.7,height = 3,units = "in",dpi = 300)

#
#
#
#
##
#
#
#
#
#
#
#


#########################################################
#
#  THIS SHOULD GO INTO SUPPLEMENTARY
#
#########################################################

######
######
#### predict serum from CSF - mixed model

mod_train_clean_mix <- lmer(log_nfl_serum~log_nfl_csf + (1|patientcode),data = data_sub_train_clean)

summary(mod_train_clean_mix)

# R2 for mixed effect model
# https://stats.stackexchange.com/questions/7240/proportion-of-explained-variance-in-a-mixed-effects-model

r.squaredGLMM(mod_train_clean_mix) 

# R2m: marginal R squared value associated with fixed effects
# R2c conditional R2 value associated with fixed effects plus the random effects.

#build the model:

fit_mix <- lmer(log_nfl_serum~log_nfl_csf + age + bmi + bmi_height + estbloodvolume + serum_creatinine + 
                serum_egfr_calc + bmi_weight + serum_ap + serum_bun + (1|patientcode), data_sub_train_clean)


summary(fit_mix)
#stepwise regression
step_mix <- stepcAIC(fit_mix, direction="both", groupCandidates = "patientcode", trace=TRUE)

best_fit_mix <- lmer(log_nfl_serum ~ log_nfl_csf + age + bmi + bmi_height + estbloodvolume + serum_creatinine 
                     + serum_egfr_calc + bmi_weight + serum_ap + serum_bun + (1 | patientcode), 
                     data = data_sub_train_clean)
summary(best_fit_mix)

r.squaredGLMM(best_fit_mix)

# predict serum NFL levels in the trainig cohort

data_sub_train_clean$pred_serum_nfl_lm <- predict(mod_train_clean,newdata = data_sub_train_clean)

data_sub_train_clean$pred_serum_nfl_mlm <- predict(best_fit,newdata = data_sub_train_clean)

data_sub_train_clean$pred_serum_nfl_lmer <- predict(best_fit_mix,newdata = data_sub_train_clean)

data_sub_train_clean$pred_serum_nfl_mlm_age_bmi <- predict(age_bmi_fit,newdata = data_sub_train_clean)


# clean validation cohort:

data_sub_val_clean <- data_sub_val 

data_sub_val_clean <- data_sub_val_clean[complete.cases(data_sub_val[confound]),]


data_sub_val_clean$pred_serum_nfl_lm <- predict(mod_train_clean,newdata = data_sub_val_clean)

data_sub_val_clean$pred_serum_nfl_mlm <- predict(best_fit,newdata = data_sub_val_clean)

data_sub_val_clean$pred_serum_nfl_lmer <- predict(best_fit_mix,newdata = data_sub_val_clean,allow.new.levels=TRUE)

data_sub_val_clean$pred_serum_nfl_mlm_age_bmi <- predict(age_bmi_fit,newdata = data_sub_val_clean)

# clean validation cohort - first sample only:

data_sub_val_clean_first <- data_sub_val %>%
  dplyr::select(patientcode,lpdate,diagnosis,log_nfl_csf,log_nfl_serum,all_of(confound),cel)

data_sub_val_clean_first <- data_sub_val_clean_first[complete.cases(data_sub_val_clean_first[confound]),]

data_sub_val_clean_first <- data_sub_val_clean_first %>%
  group_by(patientcode) %>% 
  arrange(lpdate) %>% 
  slice(1)
  

data_sub_val_clean_first$pred_serum_nfl_lm <- predict(mod_train_clean,newdata = data_sub_val_clean_first)

data_sub_val_clean_first$pred_serum_nfl_mlm <- predict(best_fit,newdata = data_sub_val_clean_first)

data_sub_val_clean_first$pred_serum_nfl_lmer <- predict(best_fit_mix,newdata = data_sub_val_clean_first,allow.new.levels=TRUE)


# clean validation cohort - median sample:

data_sub_val_clean_med <- data_sub_val %>%
  dplyr::select(patientcode,lpdate,diagnosis,log_nfl_csf,log_nfl_serum,all_of(confound),cel)
data_sub_val_clean_med <- data_sub_val_clean_med %>%
  group_by(patientcode) %>% 
  mutate_at(.vars = c("log_nfl_csf","log_nfl_serum",all_of(confound)),.funs = median ,na.rm = TRUE) %>% 
  slice(1) %>% 
  ungroup()

data_sub_val_clean_med <- data_sub_val_clean_med[complete.cases(data_sub_val_clean_med[confound]),]


data_sub_val_clean_med$pred_serum_nfl_lm <- predict(mod_train_clean,newdata = data_sub_val_clean_med)

data_sub_val_clean_med$pred_serum_nfl_mlm <- predict(best_fit,newdata = data_sub_val_clean_med)

data_sub_val_clean_med$pred_serum_nfl_lmer <- predict(best_fit_mix,newdata = data_sub_val_clean_med,allow.new.levels=TRUE)


# plot

pred <- c("pred_serum_nfl_lm","pred_serum_nfl_mlm","pred_serum_nfl_lmer","pred_serum_nfl_mlm_age_bmi")
obs <- c("log_nfl_serum","log_nfl_serum","log_nfl_serum","log_nfl_serum")

pred_nice <- c("cNFL-predicted sNFL","cNFL-predicted sNFL","cNFL-predicted sNFL (LMER)","cNFL-predicted sNFL")
obs_nice <- c("measured Log10 sNFL","measured Log10 sNFL","measured Log10 sNFL","measured Log10 sNFL")


for(i in 1:length(pred)){
  # i <-2 
  df <- data_sub_train_clean
  
    p <- df %>% 
      ggplot(aes_string(x=obs[i], y=pred[i])) +
      geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
      annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
               x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
               y=min(df[obs[i]])+.12*(max(df[obs[i]])-min(df[obs[i]])),
               size=4,parse = TRUE) +
      annotate("text",label = paste("CCC==",
                                     as.numeric(round(epi.ccc(as.numeric(df[[obs[i]]]),as.numeric(df[[pred[i]]]),
                                                                     ci = "z-transform",conf.level = 0.95)$rho.c[1],
                                                      digits = 2)),sep=""),
               x=max(df[obs[i]])-.3*(max(df[obs[i]])-min(df[obs[i]])),
               y=max(df[obs[i]])-.01*(max(df[obs[i]])-min(df[obs[i]])),
               size=4,parse = TRUE,color="blue") +
      
      ylim(c(min(df[obs[i]]),max(df[obs[i]])))+
      xlab(obs_nice[i]) +
      ylab(pred_nice[i]) +
      geom_abline(slope = 1,intercept = 0,color="blue",size=0.5) +
      stat_cor(aes(label=..r.label..), size=4,color="#008F00",label.y.npc = 1)+
      stat_cor(aes(label=..rr.label..), size=4,color="#008F00",label.y.npc = .90)+
      stat_cor(aes(label=..p.label..), size=4,color="#008F00",label.y.npc = .77)+
      geom_smooth(method="lm",se=TRUE, size=1,color="#008F00") +
      theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#C00000"),
          axis.title.y = element_text(colour = "#008F00"))
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),filename = "./output/LM_predicted_sNFL_training.png", 
       width =3,height = 2.2,units = "in",dpi = 300)

ggsave(plot = ggarrange(p2,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),filename = "./output/MLM_predicted_sNFL_training.png", 
       width =3,height = 2.2,units = "in",dpi = 300)

ggsave(plot = ggarrange(p4,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),filename = "./output/MLM_age_bmi_predicted_sNFL_training.png", 
       width =3,height = 2.2,units = "in",dpi = 300)


pred <- c("pred_serum_nfl_lm","pred_serum_nfl_mlm","pred_serum_nfl_lmer","pred_serum_nfl_mlm_age_bmi")
obs <- c("log_nfl_serum","log_nfl_serum","log_nfl_serum","log_nfl_serum")

pred_nice <- c("cNFL-predicted sNFL","cNFL-predicted sNFL","cNFL-predicted sNFL (LMER)","cNFL-predicted sNFL")
obs_nice <- c("measured Log10 sNFL","measured Log10 sNFL","measured Log10 sNFL","measured Log10 sNFL")


for(i in 1:length(pred)){
  # i <- 1
  df <- data_sub_val_clean
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[obs[i]])+.12*(max(df[obs[i]])-min(df[obs[i]])),
             size=4,parse = TRUE) +
    annotate("text",label = paste("CCC==",
                                  as.numeric(round(epi.ccc(as.numeric(df[[obs[i]]]),as.numeric(df[[pred[i]]]),
                                                           ci = "z-transform",conf.level = 0.95)$rho.c[1],
                                                   digits = 2)),sep=""),
             x=max(df[obs[i]])-.3*(max(df[obs[i]])-min(df[obs[i]])),
             y=max(df[obs[i]])-.01*(max(df[obs[i]])-min(df[obs[i]])),
             size=4,parse = TRUE,color="blue") +
    # ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
    #        max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    ylim(c(min(df[obs[i]]),max(df[obs[i]])))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    geom_abline(slope = 1,intercept = 0,color="blue",size=0.5) +
    stat_cor(aes(label=..r.label..), size=4,color="#008F00",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#008F00",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#008F00",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#008F00") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#C00000"),
          axis.title.y = element_text(colour = "#008F00"))
  
  assign(paste("v",i,sep = ""),p)
}

ggsave(plot = ggarrange(v1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),filename = "./output/LM_predicted_sNFL_validation.png", 
       width =3,height = 2.2,units = "in",dpi = 300)

ggsave(plot = ggarrange(v2,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),filename = "./output/MLM_predicted_sNFL_validation.png", 
       width =3,height = 2.2,units = "in",dpi = 300)

ggsave(plot = ggarrange(v4,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),filename = "./output/MLM_age_bmi_predicted_sNFL_validation.png", 
       width =3,height = 2.2,units = "in",dpi = 300)

#plot firts LP


pred <- c("pred_serum_nfl_lm","pred_serum_nfl_mlm","pred_serum_nfl_lmer")
obs <- c("log_nfl_serum","log_nfl_serum","log_nfl_serum")

pred_nice <- c("cNFL-predicted sNFL","cNFL-predicted sNFL","predicted sNFL (LMER)")
obs_nice <- c("measured Log10 sNFL","measured Log10 sNFL","measured Log10 sNFL")


for(i in 1:length(pred)){
  # i <- 1
  df <- data_sub_val_clean_first
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+.03*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
           max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#008F00",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#008F00",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#008F00",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#008F00") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#C00000"),
          axis.title.y = element_text(colour = "#008F00"))
  
  assign(paste("f",i,sep = ""),p)
}

ggsave(plot = ggarrange(v1,v2,f1,f2,ncol = 2,nrow = 2, common.legend=TRUE, legend="bottom"),filename = "./output/predicted_sNFL_validation_first_sample.png", 
       width =6,height = 5,units = "in",dpi = 300)

#plot median value LP


pred <- c("pred_serum_nfl_lm","pred_serum_nfl_mlm","pred_serum_nfl_lmer")
obs <- c("log_nfl_serum","log_nfl_serum","log_nfl_serum")

pred_nice <- c("cNFL-predicted sNFL","cNFL-predicted sNFL","predicted sNFL (LMER)")
obs_nice <- c("measured Log10 sNFL","measured Log10 sNFL","measured Log10 sNFL")


for(i in 1:length(pred)){
  # i <- 1
  df <- data_sub_val_clean_med
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+.03*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
           max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#008F00",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#008F00",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#008F00",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#008F00") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#C00000"),
          axis.title.y = element_text(colour = "#008F00"))
  
  assign(paste("m",i,sep = ""),p)
}


ggsave(plot = ggarrange(m1,m2,ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),filename = "./output/predicted_sNFL_validation_median_sample.png", 
       width =6,height = 2.5,units = "in",dpi = 300)

#
#
#
#
##
#
#
#
##
#

########################################################################################################################################################
########################################################################################################################################################
#
#             correlation with CEL - adjusted serum NFL
#
########################################################################################################################################################
########################################################################################################################################################

# predict csf NFL from serum NFL and cofounders
# best_fit


intercept <- as.numeric(best_fit$coefficients[1])
log_nfl_csf_slope <- as.numeric(best_fit$coefficients[2])
age_slope <- as.numeric(best_fit$coefficients[3])
bmi_weight_slope <- as.numeric(best_fit$coefficients[4])
serum_ap_slope <- as.numeric(best_fit$coefficients[5])
serum_bun_slope <- as.numeric(best_fit$coefficients[6])
serum_creatinine_slope <- as.numeric(best_fit$coefficients[7])

data_sub_train_clean$predicted_csf_nfl <- (data_sub_train_clean$log_nfl_serum - age_slope * data_sub_train_clean$age 
                                           - bmi_weight_slope * data_sub_train_clean$bmi_weight 
                                           - serum_ap_slope * data_sub_train_clean$serum_ap 
                                           - serum_bun_slope * data_sub_train_clean$serum_bun 
                                           - serum_creatinine_slope * data_sub_train_clean$serum_creatinine
                                           - intercept) / log_nfl_csf_slope

data_sub_val_clean$predicted_csf_nfl <- (data_sub_val_clean$log_nfl_serum - age_slope * data_sub_val_clean$age 
                                         - bmi_weight_slope * data_sub_val_clean$bmi_weight
                                         - serum_ap_slope * data_sub_val_clean$serum_ap 
                                         - serum_bun_slope * data_sub_val_clean$serum_bun 
                                         - serum_creatinine_slope * data_sub_val_clean$serum_creatinine
                                         - intercept) / log_nfl_csf_slope


######
#  generate age-adjusted predicted_csf_nfl
######

# merge train and val


data_sub_clean <- rbind(data_sub_train_clean[,c("sampleid","diagnosis","age","predicted_csf_nfl")], 
                        data_sub_val_clean[,c("sampleid","diagnosis","age","predicted_csf_nfl")])
data_sub_clean_hd <- data_sub_clean %>% 
  filter(diagnosis=="Healthy Donor")

# generate equation in HD
hd_mod <- lm(predicted_csf_nfl~age,data_sub_clean_hd)
summary(hd_mod)

# predict age-adjusted cNFL
data_sub_clean$predicted_csf_nfl_age_adj <- data_sub_clean$predicted_csf_nfl - predict(object = hd_mod,newdata = data_sub_clean)

# merge with data_sub_train_clean and data_sub_val_clean


data_sub_clean_to_merge <- data_sub_clean %>% 
  select(sampleid,predicted_csf_nfl_age_adj)


data_sub_train_clean <- merge(data_sub_train_clean,data_sub_clean_to_merge,by="sampleid",all.x = TRUE)
data_sub_val_clean <- merge(data_sub_val_clean,data_sub_clean_to_merge,by="sampleid",all.x = TRUE)


#plot CEL vs CSF NFL

obs <- c("log_nfl_csf","log_nfl_csf","log_nfl_csf_age_adj")
obs_nice <- c("Log10 cNFL","Log10 cNFL","Log10 age-adj cNFL")


pred <- c("cel","t2_new_large","cel")
pred_nice <- c("Ln(CEL+1)","Ln(#New+Enlarging T2 +1)","Ln(CEL+1)")

for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))

  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1.5, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm", se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#0171C0"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_csf_nfl_training.png", width =2.5,height = 2.5,units = "in",dpi = 300)

# ggsave(plot = ggarrange(p2,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
#        filename = "./output/T2NE_vs_csf_nfl_training.png", width =2.5,height = 2.5,units = "in",dpi = 300)

ggsave(plot = ggarrange(p3,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_age_adj_csf_nfl_training.png", width =2.5,height = 2.5,units = "in",dpi = 300)


#plot CEL vs serum NFL

obs <- c("log_nfl_serum","log_nfl_serum","log_nfl_serum_age_adj")
obs_nice <- c("Log10 sNFL","Log10 sNFL","Log age-adj sNFL")


pred <- c("cel","t2_new_large","cel")
pred_nice <- c("Ln(CEL+1)","Ln(#New+Enlarging T2 +1)","Ln(CEL+1)")

for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#C00000"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_serum_nfl_training.png", width =2.5,height = 2.5,units = "in",dpi = 300)
# ggsave(plot = ggarrange(p2,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
#        filename = "./output/T2NE_vs_serum_nfl_training.png", width =2.5,height = 2.5,units = "in",dpi = 300)

ggsave(plot = ggarrange(p3,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_age_adj_serum_nfl_training.png", width =2.5,height = 2.5,units = "in",dpi = 300)


#plot CEL vs predicted CSF NFL

obs <- c("predicted_csf_nfl","predicted_csf_nfl","predicted_csf_nfl_age_adj")
obs_nice <- c("sNFL-predicted cNFL","sNFL-predicted cNFL","age-adj sNFL-predicted cNFL")

pred <- c("cel","t2_new_large","cel")
pred_nice <- c("Ln(CEL+1)","Ln(#New+Enlarging T2 +1)","Ln(CEL+1)")

for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#008F00"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_predicted_nfl_training.png", width =2.5,height = 2.5,units = "in",dpi = 300)
# ggsave(plot = ggarrange(p2,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
#        filename = "./output/T2NE_vs_predicted_nfl_training.png", width =2.5,height = 2.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p3,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_age_adj_predicted_nfl_training.png", width =2.5,height = 2.5,units = "in",dpi = 300)
#


# calculate residuals of serum vs CEL and adjusted serum vs CEL

data <- data_sub_train_clean
df <- data %>% 
  filter(.,!is.na(data$cel) & !is.na(data$predicted_csf_nfl))

mod_cel_serum <- lm(cel~log_nfl_serum,df)
summary(mod_cel_serum)

df$serum_cel_res <- mod_cel_serum$residuals
hist(mod_cel_serum$residuals)

mod_cel_adj_serum <- lm(cel~predicted_csf_nfl,df)
summary(mod_cel_adj_serum)

df$adj_serum_cel_res <- mod_cel_adj_serum$residuals
hist(df$adj_serum_cel_res)

wilcox.test(df$serum_cel_res,df$adj_serum_cel_res,paired = TRUE, alternative = "two.sided")


df_new <- tibble::enframe(df$serum_cel_res,name = NULL,value = "serum")
df_new <- df_new %>% 
  mutate(subclass=row_number()) %>% 
  mutate(group="serum")
df_new2 <- tibble::enframe(df$adj_serum_cel_res,name = NULL,value = "serum")
df_new2 <- df_new2 %>% 
  mutate(subclass=row_number()) %>% 
  mutate(group="adj-serum")


df_all <- rbind(df_new,df_new2)

# plot
obs <- c("group")
obs_nice <- c("residuals")
pred <- c("serum")
pred_nice <- c("serum-CEL residuals")

i <- 1
p4 <- df_all %>% 
  ggplot(aes_string(x=obs[i], y=pred[i], color = "group")) +
  geom_boxplot(outlier.colour = "transparent") +
  geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(width = 0.07,height=0)) +
  geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
  ylim(c(min(df_all[pred[i]])-0.25*(max(df_all[pred[i]])-min(df_all[pred[i]])),max(df_all[pred[i]])+0.2*(max(df_all[pred[i]])-min(df_all[pred[i]]))))+
  xlab(obs_nice[i]) +
  ylab(pred_nice[i]) +
  geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
  scale_colour_discrete(labels = c("adjusted serum", "measured serum"))+
  scale_x_discrete(labels = c("adjusted serum", "measured serum")) +
  stat_compare_means(method = "t.test", paired = TRUE,
                     label.x = "adj-serum", 
                     label.y = max(df_all[pred[i]])+0.1*(max(df_all[pred[i]])-min(df_all[pred[i]])) )+
  theme_bw(base_size = 12) +
  stat_summary(fun.data = function(x) {
    return(
      data.frame(
        y = min(df_all[pred[i]])-0.05*(max(df_all[pred[i]])-min(df_all[pred[i]])),
        label = paste('n =',
                      format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                      '\n',
                      'median =', 
                      format(round(median(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
      )
    )
  },geom = "text", 
  hjust = 0.5,
  vjust = 1, size =3) +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")+
  theme(axis.title.y = element_text( colour = "blue" ))


ggsave(plot = ggarrange(p4,ncol = 1,nrow = 1),
       filename = "./output/CEL_residuals_plot_t.test.png", width =2.7,height = 2.5,units = "in",dpi = 300)

####  VALIDATION cohort


#plot CEL vs CSF NFL

obs <- c("log_nfl_csf","log_nfl_csf","log_nfl_csf_age_adj")
obs_nice <- c("Log10 cNFL","Log10 cNFL","Log age-adj cNFL")


pred <- c("cel","t2_new_large","cel")
pred_nice <- c("Ln(CEL+1)","Ln(#New+Enlarging T2 +1)","Ln(CEL+1)")

for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_val_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#0171C0"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_csf_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)
# ggsave(plot = ggarrange(p2,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
#        filename = "./output/T2NE_cel_vs_csf_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p3,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_age_adj_csf_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)


#plot CEL vs serum NFL

obs <- c("log_nfl_serum","log_nfl_serum","log_nfl_serum_age_adj")
obs_nice <- c("Log10 sNFL","Log10 sNFL","Log age-adj sNFL")


pred <- c("cel","t2_new_large","cel")
pred_nice <- c("Ln(CEL+1)","Ln(#New+Enlarging T2 +1)","Ln(CEL+1)")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_val_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#C00000"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_serum_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)
# ggsave(plot = ggarrange(p2,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
#        filename = "./output/T2NE_cel_vs_serum_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p3,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_age_adj_serum_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)


#plot CEL vs predicted CSF NFL

obs <- c("predicted_csf_nfl","predicted_csf_nfl","predicted_csf_nfl_age_adj")
obs_nice <- c("sNFL-predicted cNFL","sNFL-predicted cNFL","age-adj sNFL-predicted cNFL")

pred <- c("cel","t2_new_large","cel")
pred_nice <- c("Ln(CEL+1)","Ln(#New+Enlarging T2 +1)","Ln(CEL+1)")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_val_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#008F00"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_predicted_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)
# ggsave(plot = ggarrange(p2,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
#        filename = "./output/T2NE_cel_vs_adjusted_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p3,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_age_adj_predicted_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)


# try Poisson regression in training cohort

summary(m1 <- glm(cel~log_nfl_csf, family = "poisson", data = data_sub_train_clean))

summary(m2 <- glm(cel~log_nfl_serum, family = "poisson", data = data_sub_train_clean))

summary(m3 <- glm(cel~predicted_csf_nfl, family = "poisson", data = data_sub_train_clean))


with(m1, cbind(res.deviance = deviance, df=df.residual, p= pchisq(deviance, df.residual, lower.tail = FALSE)))

with(m2, cbind(res.deviance = deviance, df=df.residual, p= pchisq(deviance, df.residual, lower.tail = FALSE)))

with(m3, cbind(res.deviance = deviance, df=df.residual, p= pchisq(deviance, df.residual, lower.tail = FALSE)))

data_sub_train_clean$poison_csf <- predict(object = m1,newdata = data_sub_train_clean)
data_sub_train_clean$poison_serum <- predict(object = m2,newdata = data_sub_train_clean)
data_sub_train_clean$poison_adj_serum <- predict(object = m3,newdata = data_sub_train_clean)


poison_res <- tibble::enframe(m1$residuals,name = NULL,value = "csf_res")
poison_res$serum_res <- m2$residuals
poison_res$adj_serum_res <- m3$residuals


wilcox.test(poison_res$serum_res,poison_res$adj_serum_res,paired = TRUE, alternative = "two.sided")

# plot comparison between residuals

df_new <- tibble::enframe(poison_res$serum_res,name = NULL,value = "serum")
df_new <- df_new %>% 
  mutate(subclass=row_number()) %>% 
  mutate(group="serum")
df_new2 <- tibble::enframe(poison_res$adj_serum_res,name = NULL,value = "serum")
df_new2 <- df_new2 %>% 
  mutate(subclass=row_number()) %>% 
  mutate(group="adj-serum")


df_all <- rbind(df_new,df_new2)

# plot
obs <- c("group")
obs_nice <- c("residuals")
pred <- c("serum")
pred_nice <- c("serum-CEL residuals")

i <- 1
p4 <- df_all %>% 
  ggplot(aes_string(x=obs[i], y=pred[i], color = "group")) +
  geom_boxplot(outlier.colour = "transparent") +
  geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(width = 0.07,height=0)) +
  geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
  ylim(c(min(df_all[pred[i]])-0.25*(max(df_all[pred[i]])-min(df_all[pred[i]])),max(df_all[pred[i]])+0.2*(max(df_all[pred[i]])-min(df_all[pred[i]]))))+
  xlab(obs_nice[i]) +
  ylab(pred_nice[i]) +
  geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
  scale_colour_discrete(labels = c("adjusted serum", "measured serum"))+
  # scale_x_discrete(labels = c("adjusted serum", "measured serum")) +
  stat_compare_means(method = "wilcox.test", paired = TRUE,
                     label.x = "adj-serum", 
                     label.y = max(df_all[pred[i]])+0.1*(max(df_all[pred[i]])-min(df_all[pred[i]])) )+
  theme_bw(base_size = 12) +
  stat_summary(fun.data = function(x) {
    return(
      data.frame(
        y = min(df_all[pred[i]])-0.05*(max(df_all[pred[i]])-min(df_all[pred[i]])),
        label = paste('n =',
                      format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                      '\n',
                      'median =', 
                      format(round(median(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
      )
    )
  },geom = "text", 
  hjust = 0.5,
  vjust = 1, size =3) +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")+
  theme(axis.title.y = element_text( colour = "blue" ))


ggsave(plot = ggarrange(p4,ncol = 1,nrow = 1),
       filename = "./output/Poison_CEL_residuals_plot_t.test.png", width =2.7,height = 2.5,units = "in",dpi = 300)

# try poisson regression in validation cohort

summary(m1 <- glm(cel~log_nfl_csf, family = "poisson", data = data_sub_val_clean))

summary(m2 <- glm(cel~log_nfl_serum, family = "poisson", data = data_sub_val_clean))

summary(m3 <- glm(cel~predicted_csf_nfl, family = "poisson", data = data_sub_val_clean))


with(m1, cbind(res.deviance = deviance, df=df.residual, p= pchisq(deviance, df.residual, lower.tail = FALSE)))

with(m2, cbind(res.deviance = deviance, df=df.residual, p= pchisq(deviance, df.residual, lower.tail = FALSE)))

with(m3, cbind(res.deviance = deviance, df=df.residual, p= pchisq(deviance, df.residual, lower.tail = FALSE)))

data_sub_val_clean$poison_csf <- predict(object = m1,newdata = data_sub_val_clean)
data_sub_val_clean$poison_serum <- predict(object = m2,newdata = data_sub_val_clean)
data_sub_val_clean$poison_adj_serum <- predict(object = m3,newdata = data_sub_val_clean)


poison_res <- tibble::enframe(m1$residuals,name = NULL,value = "csf_res")
poison_res$serum_res <- m2$residuals
poison_res$adj_serum_res <- m3$residuals


wilcox.test(poison_res$serum_res,poison_res$adj_serum_res,paired = TRUE, alternative = "two.sided")

# plot comparison between residuals

df_new <- tibble::enframe(poison_res$serum_res,name = NULL,value = "serum")
df_new <- df_new %>% 
  mutate(subclass=row_number()) %>% 
  mutate(group="serum")
df_new2 <- tibble::enframe(poison_res$adj_serum_res,name = NULL,value = "serum")
df_new2 <- df_new2 %>% 
  mutate(subclass=row_number()) %>% 
  mutate(group="adj-serum")


df_all <- rbind(df_new,df_new2)

# plot
obs <- c("group")
obs_nice <- c("residuals")
pred <- c("serum")
pred_nice <- c("serum-CEL residuals")

i <- 1
p4 <- df_all %>% 
  ggplot(aes_string(x=obs[i], y=pred[i], color = "group")) +
  geom_boxplot(outlier.colour = "transparent") +
  geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(width = 0.07,height=0)) +
  geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
  ylim(c(min(df_all[pred[i]])-0.25*(max(df_all[pred[i]])-min(df_all[pred[i]])),max(df_all[pred[i]])+0.2*(max(df_all[pred[i]])-min(df_all[pred[i]]))))+
  xlab(obs_nice[i]) +
  ylab(pred_nice[i]) +
  geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
  scale_colour_discrete(labels = c("adjusted serum", "measured serum"))+
  # scale_x_discrete(labels = c("adjusted serum", "measured serum")) +
  stat_compare_means(method = "wilcox.test", paired = TRUE,
                     label.x = "adj-serum", 
                     label.y = max(df_all[pred[i]])+0.1*(max(df_all[pred[i]])-min(df_all[pred[i]])) )+
  theme_bw(base_size = 12) +
  stat_summary(fun.data = function(x) {
    return(
      data.frame(
        y = min(df_all[pred[i]])-0.05*(max(df_all[pred[i]])-min(df_all[pred[i]])),
        label = paste('n =',
                      format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                      '\n',
                      'median =', 
                      format(round(median(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
      )
    )
  },geom = "text", 
  hjust = 0.5,
  vjust = 1, size =3) +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")+
  theme(axis.title.y = element_text( colour = "blue" ))

p4
ggsave(plot = ggarrange(p4,ncol = 1,nrow = 1),
       filename = "./output/Poison_CEL_residuals_plot_val.test.png", width =2.7,height = 2.5,units = "in",dpi = 300)


# POISSON REGRESSION in combined training and validation cohort

data_train <- data_sub_train_clean %>% 
  select(patientcode,cel,log_nfl_csf,log_nfl_serum,predicted_csf_nfl)

data_val <- data_sub_val_clean %>% 
  select(patientcode,cel,log_nfl_csf,log_nfl_serum,predicted_csf_nfl)

data_both <- rbind(data_train,data_val)

summary(m1 <- glm(cel~log_nfl_csf, family = "poisson", data = data_both))

summary(m2 <- glm(cel~log_nfl_serum, family = "poisson", data = data_both))

summary(m3 <- glm(cel~predicted_csf_nfl, family = "poisson", data = data_both))


with(m1, cbind(res.deviance = deviance, df=df.residual, p= pchisq(deviance, df.residual, lower.tail = FALSE)))

with(m2, cbind(res.deviance = deviance, df=df.residual, p= pchisq(deviance, df.residual, lower.tail = FALSE)))

with(m3, cbind(res.deviance = deviance, df=df.residual, p= pchisq(deviance, df.residual, lower.tail = FALSE)))


poison_res <- tibble::enframe(m1$residuals,name = NULL,value = "csf_res")
poison_res$serum_res <- m2$residuals
poison_res$adj_serum_res <- m3$residuals


wilcox.test(poison_res$serum_res,poison_res$adj_serum_res,paired = TRUE, alternative = "two.sided")

# plot comparison between residuals

df_new <- tibble::enframe(poison_res$serum_res,name = NULL,value = "serum")
df_new <- df_new %>% 
  mutate(subclass=row_number()) %>% 
  mutate(group="serum")
df_new2 <- tibble::enframe(poison_res$adj_serum_res,name = NULL,value = "serum")
df_new2 <- df_new2 %>% 
  mutate(subclass=row_number()) %>% 
  mutate(group="adj-serum")


df_all <- rbind(df_new,df_new2)

# plot
obs <- c("group")
obs_nice <- c("residuals")
pred <- c("serum")
pred_nice <- c("serum-CEL residuals")

i <- 1
p4 <- df_all %>% 
  ggplot(aes_string(x=obs[i], y=pred[i], color = "group")) +
  geom_boxplot(outlier.colour = "transparent") +
  geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(width = 0.07,height=0)) +
  geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
  ylim(c(min(df_all[pred[i]])-0.25*(max(df_all[pred[i]])-min(df_all[pred[i]])),max(df_all[pred[i]])+0.2*(max(df_all[pred[i]])-min(df_all[pred[i]]))))+
  xlab(obs_nice[i]) +
  ylab(pred_nice[i]) +
  geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
  scale_colour_discrete(labels = c("adjusted serum", "measured serum"))+
  # scale_x_discrete(labels = c("adjusted serum", "measured serum")) +
  stat_compare_means(method = "wilcox.test", paired = TRUE,
                     label.x = "adj-serum", 
                     label.y = max(df_all[pred[i]])+0.1*(max(df_all[pred[i]])-min(df_all[pred[i]])) )+
  theme_bw(base_size = 12) +
  stat_summary(fun.data = function(x) {
    return(
      data.frame(
        y = min(df_all[pred[i]])-0.05*(max(df_all[pred[i]])-min(df_all[pred[i]])),
        label = paste('n =',
                      format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                      '\n',
                      'median =', 
                      format(round(median(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
      )
    )
  },geom = "text", 
  hjust = 0.5,
  vjust = 1, size =3) +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")+
  theme(axis.title.y = element_text( colour = "blue" ))

p4
ggsave(plot = ggarrange(p4,ncol = 1,nrow = 1),
       filename = "./output/Poison_CEL_residuals_plot_train+val.test.png", width =2.7,height = 2.5,units = "in",dpi = 300)


# try poisson regression for T2 and New lesions

summary(m1 <- glm(t2_new_large~log_nfl_csf, family = "poisson", data = data_sub_train_clean))

summary(m2 <- glm(t2_new_large~log_nfl_serum, family = "poisson", data = data_sub_train_clean))

summary(m3 <- glm(t2_new_large~predicted_csf_nfl, family = "poisson", data = data_sub_train_clean))


with(m1, cbind(res.deviance = deviance, df=df.residual, p= pchisq(deviance, df.residual, lower.tail = FALSE)))

with(m2, cbind(res.deviance = deviance, df=df.residual, p= pchisq(deviance, df.residual, lower.tail = FALSE)))

with(m3, cbind(res.deviance = deviance, df=df.residual, p= pchisq(deviance, df.residual, lower.tail = FALSE)))

data_sub_train_clean$poison_csf <- predict(object = m1,newdata = data_sub_train_clean)
data_sub_train_clean$poison_serum <- predict(object = m2,newdata = data_sub_train_clean)
data_sub_train_clean$poison_adj_serum <- predict(object = m3,newdata = data_sub_train_clean)


poison_res <- tibble::enframe(m1$residuals,name = NULL,value = "csf_res")
poison_res$serum_res <- m2$residuals
poison_res$adj_serum_res <- m3$residuals


wilcox.test(poison_res$serum_res,poison_res$adj_serum_res,paired = TRUE, alternative = "two.sided")

test1 <- data_sub_train_clean %>% 
  filter(!is.na(cel) & !is.na(t2_new_large)) %>% 
  select(patientcode,log_nfl_csf,log_nfl_serum,predicted_csf_nfl,cel,t2_new_large)

test2 <- data_sub_val_clean %>% 
  filter(!is.na(cel) & !is.na(t2_new_large)) %>% 
  select(patientcode,log_nfl_csf,log_nfl_serum,predicted_csf_nfl,cel,t2_new_large)


test <- rbind(test1,test2)
test$composite_cel <- test$cel + test$t2_new_large

plot(composite_cel~predicted_csf_nfl,test)

# plot

obs <- c("log_nfl_csf","log_nfl_serum","predicted_csf_nfl")
obs_nice <- c("measured cNFL","measured sNFL","adjusted sNFL")

pred <- c("composite_cel","composite_cel","composite_cel")
pred_nice <- c("CEL+T2 new&enlarge","CEL+T2 new&enlarge","CEL+T2 new&enlarge")


for(i in 1:length(obs)){
  # i <- 2
  df <- test
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#008F00"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}

ggsave(plot = ggarrange(p1,p2,p3,ncol = 3,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/CEL+T2NE_vs_nfl_train+val.png", width =7.5,height = 2.5,units = "in",dpi = 300)


####################
####################
##  exponential CEL
####################
####################

########  TRAINING cohort ############

#plot CEL vs CSF NFL

obs <- c("log_nfl_csf")
obs_nice <- c("Log10 cNFL")


pred <- c("cel")
pred_nice <- c("Ln(CEL+1)")

for(i in 1:length(obs)){
  # i <- 1
  data <- data_sub_train_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  mod <- summary(lm(df[[pred[i]]]~exp(df[[obs[i]]])))
  lb1 <- paste("R^2==", round(mod$r.squared,3))
  lp1 <- paste("p=", format.pval(mod$coefficients[2,4]))
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1.5, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    annotate("text",label = lb1,
             x=min(df[obs[i]])+.2*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.3*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE,color="#521B93") +
    annotate("text",label = lp1,
             x=min(df[obs[i]])+.27*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.1*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = FALSE,color="#521B93") +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    # stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    # stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    # stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm", formula = y~exp(x), se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#0171C0"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/EXP_cel_vs_csf_nfl_training.png", width =2.5,height = 2.5,units = "in",dpi = 300)


#plot CEL vs serum NFL

obs <- c("log_nfl_serum")
obs_nice <- c("Log10 sNFL")


pred <- c("cel")
pred_nice <- c("Ln(CEL+1)")

for(i in 1:length(obs)){
  # i <- 1
  data <- data_sub_train_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  mod <- summary(lm(df[[pred[i]]]~exp(df[[obs[i]]])))
  lb1 <- paste("R^2==", round(mod$r.squared,3))
  lp1 <- paste("p=", format.pval(mod$coefficients[2,4]))
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1.5, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    annotate("text",label = lb1,
             x=min(df[obs[i]])+.2*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.3*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE,color="#521B93") +
    annotate("text",label = lp1,
             x=min(df[obs[i]])+.27*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.1*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = FALSE,color="#521B93") +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    # stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    # stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    # stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm", formula = y~exp(x), se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#C00000"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/EXP_cel_vs_serum_nfl_training.png", width =2.5,height = 2.5,units = "in",dpi = 300)

#plot CEL vs predicted CSF NFL

obs <- c("predicted_csf_nfl")
obs_nice <- c("sNFL-predicted cNFL")

pred <- c("cel")
pred_nice <- c("Ln(CEL+1)")

for(i in 1:length(obs)){
  # i <- 1
  data <- data_sub_train_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  mod <- summary(lm(df[[pred[i]]]~exp(df[[obs[i]]])))
  lb1 <- paste("R^2==", round(mod$r.squared,3))
  lp1 <- paste("p=", format.pval(mod$coefficients[2,4]))
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1.5, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    annotate("text",label = lb1,
             x=min(df[obs[i]])+.2*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.3*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE,color="#521B93") +
    annotate("text",label = lp1,
             x=min(df[obs[i]])+.27*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.1*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = FALSE,color="#521B93") +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    # stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    # stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    # stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm", formula = y~exp(x), se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#008F00"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/EXP_cel_vs_predicted_nfl_training.png", width =2.5,height = 2.5,units = "in",dpi = 300)


############ VALIDATION COHORT ############################
#plot CEL vs CSF NFL

obs <- c("log_nfl_csf")
obs_nice <- c("Log10 cNFL")


pred <- c("cel")
pred_nice <- c("Ln(CEL+1)")

for(i in 1:length(obs)){
  # i <- 1
  data <- data_sub_val_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  mod <- summary(lm(df[[pred[i]]]~exp(df[[obs[i]]])))
  lb1 <- paste("R^2==", round(mod$r.squared,3))
  lp1 <- paste("p=", format.pval(mod$coefficients[2,4]))
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1.5, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    annotate("text",label = lb1,
             x=min(df[obs[i]])+.2*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.3*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE,color="#521B93") +
    annotate("text",label = lp1,
             x=min(df[obs[i]])+.27*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.1*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = FALSE,color="#521B93") +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    # stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    # stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    # stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm", formula = y~exp(x), se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#0171C0"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/EXP_cel_vs_csf_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)


#plot CEL vs serum NFL

obs <- c("log_nfl_serum")
obs_nice <- c("Log10 sNFL")


pred <- c("cel")
pred_nice <- c("Ln(CEL+1)")

for(i in 1:length(obs)){
  # i <- 1
  data <- data_sub_val_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  mod <- summary(lm(df[[pred[i]]]~exp(df[[obs[i]]])))
  lb1 <- paste("R^2==", round(mod$r.squared,3))
  lp1 <- paste("p=", format.pval(mod$coefficients[2,4]))
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1.5, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    annotate("text",label = lb1,
             x=min(df[obs[i]])+.2*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.3*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE,color="#521B93") +
    annotate("text",label = lp1,
             x=min(df[obs[i]])+.27*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.1*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = FALSE,color="#521B93") +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    # stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    # stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    # stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm", formula = y~exp(x), se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#C00000"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/EXP_cel_vs_serum_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)

#plot CEL vs predicted CSF NFL

obs <- c("predicted_csf_nfl")
obs_nice <- c("sNFL-predicted cNFL")

pred <- c("cel")
pred_nice <- c("Ln(CEL+1)")

for(i in 1:length(obs)){
  # i <- 1
  data <- data_sub_val_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  mod <- summary(lm(df[[pred[i]]]~exp(df[[obs[i]]])))
  lb1 <- paste("R^2==", round(mod$r.squared,3))
  lp1 <- paste("p=", format.pval(mod$coefficients[2,4]))
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1.5, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    annotate("text",label = lb1,
             x=min(df[obs[i]])+.2*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.3*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE,color="#521B93") +
    annotate("text",label = lp1,
             x=min(df[obs[i]])+.27*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.1*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = FALSE,color="#521B93") +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    # stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    # stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    # stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm", formula = y~exp(x), se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#008F00"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/EXP_cel_vs_predicted_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)


# calculate residuals of serum vs CEL and adjusted serum vs CEL in the exponential models

data <- data_sub_train_clean
df <- data %>% 
  filter(.,!is.na(data$cel) & !is.na(data$predicted_csf_nfl))

mod_cel_serum <- lm(cel~exp(log_nfl_serum),df)
summary(mod_cel_serum)

df$serum_cel_res <- mod_cel_serum$residuals
hist(mod_cel_serum$residuals)

mod_cel_adj_serum <- lm(cel~exp(predicted_csf_nfl),df)
summary(mod_cel_adj_serum)

df$adj_serum_cel_res <- mod_cel_adj_serum$residuals
hist(df$adj_serum_cel_res)

wilcox.test(df$serum_cel_res,df$adj_serum_cel_res,paired = TRUE, alternative = "two.sided")


df_new <- tibble::enframe(df$serum_cel_res,name = NULL,value = "serum")
df_new <- df_new %>% 
  mutate(subclass=row_number()) %>% 
  mutate(group="serum")
df_new2 <- tibble::enframe(df$adj_serum_cel_res,name = NULL,value = "serum")
df_new2 <- df_new2 %>% 
  mutate(subclass=row_number()) %>% 
  mutate(group="adj-serum")


df_all <- rbind(df_new,df_new2)

# plot
obs <- c("group")
obs_nice <- c("residuals")
pred <- c("serum")
pred_nice <- c("serum-CEL residuals")

i <- 1
p4 <- df_all %>% 
  ggplot(aes_string(x=obs[i], y=pred[i], color = "group")) +
  geom_boxplot(outlier.colour = "transparent") +
  geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(width = 0.07,height=0)) +
  geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
  ylim(c(min(df_all[pred[i]])-0.25*(max(df_all[pred[i]])-min(df_all[pred[i]])),max(df_all[pred[i]])+0.2*(max(df_all[pred[i]])-min(df_all[pred[i]]))))+
  xlab(obs_nice[i]) +
  ylab(pred_nice[i]) +
  geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
  scale_colour_discrete(labels = c("adjusted serum", "measured serum"))+
  scale_x_discrete(labels = c("adjusted serum", "measured serum")) +
  stat_compare_means(method = "t.test", paired = TRUE,
                     label.x = "adj-serum", 
                     label.y = max(df_all[pred[i]])+0.1*(max(df_all[pred[i]])-min(df_all[pred[i]])) )+
  theme_bw(base_size = 12) +
  stat_summary(fun.data = function(x) {
    return(
      data.frame(
        y = min(df_all[pred[i]])-0.05*(max(df_all[pred[i]])-min(df_all[pred[i]])),
        label = paste('n =',
                      format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                      '\n',
                      'median =', 
                      format(round(median(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
      )
    )
  },geom = "text", 
  hjust = 0.5,
  vjust = 1, size =3) +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")+
  theme(axis.title.y = element_text( colour = "blue" ))


ggsave(plot = ggarrange(p4,ncol = 1,nrow = 1),
       filename = "./output/CEL_residuals_plot_t.test.png", width =2.7,height = 2.5,units = "in",dpi = 300)

####  VALIDATION cohort


#plot CEL vs CSF NFL

obs <- c("log_nfl_csf","log_nfl_csf")
obs_nice <- c("Log10 cNFL","Log10 cNFL")


pred <- c("cel","t2_new_large")
pred_nice <- c("Ln(CEL+1)","Ln(#New+Enlarging T2 +1)")

for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_val_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#0171C0"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_csf_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p2,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/T2NE_cel_vs_csf_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)


#plot CEL vs serum NFL

obs <- c("log_nfl_serum","log_nfl_serum")
obs_nice <- c("Log10 sNFL","Log10 sNFL")


pred <- c("cel","t2_new_large")
pred_nice <- c("Ln(CEL+1)","Ln(#New+Enlarging T2 +1)")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_val_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#C00000"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_serum_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p2,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/T2NE_cel_vs_serum_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)


#plot CEL vs predicted CSF NFL

obs <- c("predicted_csf_nfl","predicted_csf_nfl")
obs_nice <- c("adjusted sNFL","adjusted sNFL")

pred <- c("cel","t2_new_large")
pred_nice <- c("Ln(CEL+1)","Ln(#New+Enlarging T2 +1)")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_val_clean
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]) & !is.na(data$predicted_csf_nfl))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#008F00"),
          axis.title.y = element_text(colour = "#FF9300"))
  
  assign(paste("p",i,sep = ""),p)
}
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/cel_vs_predicted_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p2,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/T2NE_cel_vs_adjusted_nfl_validation.png", width =2.5,height = 2.5,units = "in",dpi = 300)


########################################################################################################################################################
########################################################################################################################################################
#             logistic regression - predction of CEL status (yes/no) from sNFL, cNFL, pred cNFL
########################################################################################################################################################
########################################################################################################################################################

# generate dichotomized CEL column
data_sub_train_clean$cel_dich <- NA

data_sub_train_clean$cel_dich[which(data_sub_train_clean$cel==0)] <- 0

data_sub_train_clean$cel_dich[which(data_sub_train_clean$cel>0)] <- 1


# logistic regression model 1 - using measured cNFL
model1 <- glm(cel_dich~log_nfl_csf,family=binomial(link="logit"),data = data_sub_train_clean)

# logistic regression model 2 - using measured sNFL
model2 <- glm(cel_dich~log_nfl_serum,family=binomial(link="logit"),data = data_sub_train_clean)

# logistic regression model 3 - using predicted cNFL
model3 <- glm(cel_dich~predicted_csf_nfl,family=binomial(link="logit"),data = data_sub_train_clean)


# calculate predicted probabilities of cCEL for each model
data_sub_train_clean$glm_1 <- plogis(predict(model1,data_sub_train_clean))
data_sub_train_clean$glm_2 <- plogis(predict(model2,data_sub_train_clean))
data_sub_train_clean$glm_3 <- plogis(predict(model3,data_sub_train_clean))


plot.roc(roc(data_sub_train_clean$cel_dich,data_sub_train_clean$glm_1,percent=TRUE, plot=FALSE, ci=TRUE),print.auc = TRUE)
plot.roc(roc(data_sub_train_clean$cel_dich,data_sub_train_clean$glm_2,percent=TRUE, plot=FALSE, ci=TRUE),print.auc = TRUE)
plot.roc(roc(data_sub_train_clean$cel_dich,data_sub_train_clean$glm_3,percent=TRUE, plot=FALSE, ci=TRUE),print.auc = TRUE)


# generate cut-off for NFL that best differentiates CEL vs noCEL in training cohort

cutoff_1 <- optimalCutoff(data_sub_train_clean$cel_dich,data_sub_train_clean$glm_1)[1]
cutoff_2 <- optimalCutoff(data_sub_train_clean$cel_dich,data_sub_train_clean$glm_2)[1]
cutoff_3 <- optimalCutoff(data_sub_train_clean$cel_dich,data_sub_train_clean$glm_3)[1]

##############################
# NFL cutoff for model 1

# identify the lowest NFL from category 1
test <- data_sub_train_clean %>% 
  filter(glm_1>cutoff_1) 
min <-   min(test$nfl_csf)

# identify the highest NFL from category 0
test <- data_sub_train_clean %>% 
  filter(glm_1<cutoff_1) 
max <-   max(test$nfl_csf)

#calculate average betwen min and max
cnfl_cutoff <- mean(c(min,max))

##############################
# NFL cutoff for model 2

# identify the lowest NFL from category 1
test <- data_sub_train_clean %>% 
  filter(glm_2>cutoff_2) 
min <-   min(test$nfl_serum)

# identify the highest NFL from category 0
test <- data_sub_train_clean %>% 
  filter(glm_2<cutoff_2) 
max <-   max(test$nfl_serum)

#calculate average betwen min and max
snfl_cutoff <- mean(c(min,max))

##############################
# NFL cutoff for model 3

# identify the lowest NFL from category 1
test <- data_sub_train_clean %>% 
  filter(glm_3>cutoff_3) 
min <-   min(test$predicted_csf_nfl)

# identify the highest NFL from category 0
test <- data_sub_train_clean %>% 
  filter(glm_3<cutoff_3) 
max <-   max(test$predicted_csf_nfl)

#calculate average betwen min and max
predicted_cnfl_cutoff <- 10^mean(c(min,max))


# sensitivity and specificity

#glm1
data_train_recoded <- data_sub_train_clean

data_train_recoded$glm_1_recoded <- ifelse(data_train_recoded$glm_1 > cutoff_1, 1,0)
data_train_recoded$glm_2_recoded <- ifelse(data_train_recoded$glm_2 > cutoff_2, 1,0)
data_train_recoded$glm_3_recoded <- ifelse(data_train_recoded$glm_3 > cutoff_3, 1,0)


library(caret)
gl1_confmx <- table(factor(data_train_recoded$glm_1_recoded, levels=c(1,0)),factor(data_train_recoded$cel_dich,levels = c(1,0)))

gl1_sens <- round(caret::sensitivity(gl1_confmx)*100,digits = 1)
gl1_spec <- round(caret::specificity(gl1_confmx)*100,digits = 1)


gl2_confmx <- table(factor(data_train_recoded$glm_2_recoded,levels = c(1,0)),factor(data_train_recoded$cel_dich,levels = c(1,0)))

gl2_sens <- round(caret::sensitivity(gl2_confmx)*100,digits = 1)
gl2_spec <- round(caret::specificity(gl2_confmx)*100,digits = 1)

gl3_confmx <- table(factor(data_train_recoded$glm_3_recoded,levels = c(1,0)),factor(data_train_recoded$cel_dich,levels = c(1,0)))

gl3_sens <- round(caret::sensitivity(gl3_confmx)*100,digits = 1)
gl3_spec <- round(caret::specificity(gl3_confmx)*100,digits = 1)


p1 <- data_sub_train_clean %>%
  mutate_at(.,.vars="cel_dich",.funs = as.factor) %>% 
  filter(!is.na(cel_dich)) %>% 
  ggplot(aes(x=cel_dich, y=glm_1, color = cel_dich)) +
  geom_violin(color="black",alpha = 0.7,size=0.3,aes(fill=cel_dich)) +
  geom_jitter( shape = 21, size = 3,stroke = 0.6,position=position_jitter(0.07),color="black",aes(fill=cel_dich))+
  scale_fill_manual(values = c("orange","red"),breaks = c(0,1))+
  geom_hline(yintercept = cutoff_1,color="black",linetype="dotted",size=.5)+
  # ggtitle("TRAINING cohort \npredictors: Age + Gender") +
  xlab("") +
  ylab("predicted probability of CEL") +
  ggtitle(paste0("AUC: 78.4% \nSens = ",gl1_sens,"%"," Spec = ",gl1_spec,"%"))+
  scale_y_continuous(limits = c(0,1.02),labels = scales::percent) +
  scale_x_discrete(label=c("No","Yes"))+
  stat_summary(fun="median", geom = "crossbar",color="black",size=.5,width=0.5)+
  stat_compare_means(method = "wilcox.test", paired = FALSE,label.x = "0",size=4)+
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(size=12),
        axis.text = element_text(size=12),
        axis.title = element_text(size=14))

p1


p2 <- data_sub_train_clean %>%
  mutate_at(.,.vars="cel_dich",.funs = as.factor) %>% 
  filter(!is.na(cel_dich)) %>% 
  ggplot(aes(x=cel_dich, y=glm_2, color = cel_dich)) +
  geom_violin(color="black",alpha = 0.7,size=0.3,aes(fill=cel_dich)) +
  geom_jitter( shape = 21, size = 3,stroke = 0.6,position=position_jitter(0.07),color="black",aes(fill=cel_dich))+
  scale_fill_manual(values = c("orange","red"),breaks = c(0,1))+
  geom_hline(yintercept = cutoff_2,color="black",linetype="dotted",size=.5)+
  # ggtitle("TRAINING cohort \npredictors: Age + Gender") +
  xlab("") +
  ylab("predicted probability of CEL") +
  ggtitle(paste0("AUC: 61.8%  \nSens = ",gl2_sens,"%"," Spec = ",gl2_spec,"%"))+
  scale_y_continuous(limits = c(0,1.02),labels = scales::percent) +
  scale_x_discrete(label=c("No","Yes"))+
  stat_summary(fun="median", geom = "crossbar",color="black",size=.5,width=0.5)+
  stat_compare_means(method = "wilcox.test", paired = FALSE,label.x = "0")+
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(size=12),
        axis.text = element_text(size=12),
        axis.title = element_text(size=14))

p2

p3 <- data_sub_train_clean %>%
  mutate_at(.,.vars="cel_dich",.funs = as.factor) %>% 
  filter(!is.na(cel_dich)) %>% 
  ggplot(aes(x=cel_dich, y=glm_3, color = cel_dich)) +
  geom_violin(color="black",alpha = 0.7,size=0.3,aes(fill=cel_dich)) +
  geom_jitter( shape = 21, size = 3,stroke = 0.6,position=position_jitter(0.07),color="black",aes(fill=cel_dich))+
  scale_fill_manual(values = c("orange","red"),breaks = c(0,1))+
  geom_hline(yintercept = cutoff_3,color="black",linetype="dotted",size=.5)+
  # ggtitle("TRAINING cohort \npredictors: Age + Gender") +
  xlab("") +
  ylab("predicted probability of CEL") +
  ggtitle(paste0("AUC: 69.2%  \nSens = ",gl3_sens,"%"," Spec = ",gl3_spec,"%"))+
  scale_y_continuous(limits = c(0,1.02),labels = scales::percent) +
  scale_x_discrete(label=c("No","Yes"))+
  stat_summary(fun="median", geom = "crossbar",color="black",size=.5,width=0.5)+
  stat_compare_means(method = "wilcox.test", paired = FALSE,label.x = "0")+
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(size=12),
        axis.text = element_text(size=12),
        axis.title = element_text(size=14))

p3


# predict in validation cohort

data_sub_val_clean$cel_dich <- ifelse(data_sub_val_clean$cel >0,1,0)
# data_sub_val_clean$cel_dich[which(data_sub_val_clean$Clinical_Disease_Activity=="Yes")] <- 1


# calculate predicted probabilities of cCEL for each model
data_sub_val_clean$glm_1 <- plogis(predict(model1,data_sub_val_clean))
data_sub_val_clean$glm_2 <- plogis(predict(model2,data_sub_val_clean))
data_sub_val_clean$glm_3 <- plogis(predict(model3,data_sub_val_clean))


plot.roc(roc(data_sub_val_clean$cel_dich,data_sub_val_clean$glm_1,percent=TRUE, plot=FALSE, ci=TRUE),print.auc = TRUE)
plot.roc(roc(data_sub_val_clean$cel_dich,data_sub_val_clean$glm_2,percent=TRUE, plot=FALSE, ci=TRUE),print.auc = TRUE)
plot.roc(roc(data_sub_val_clean$cel_dich,data_sub_val_clean$glm_3,percent=TRUE, plot=FALSE, ci=TRUE),print.auc = TRUE)


#glm1
data_val_recoded <- data_sub_val_clean

data_val_recoded$glm_1_recoded <- ifelse(data_val_recoded$glm_1 > cutoff_1, 1,0)
data_val_recoded$glm_2_recoded <- ifelse(data_val_recoded$glm_2 > cutoff_2, 1,0)
data_val_recoded$glm_3_recoded <- ifelse(data_val_recoded$glm_3 > cutoff_3, 1,0)


gl1_confmx <- table(factor(data_val_recoded$glm_1_recoded,levels = c(1,0)),factor(data_val_recoded$cel_dich,levels = c(1,0)))

gl1_sens <- round(caret::sensitivity(gl1_confmx)*100,digits = 1)
gl1_spec <- round(caret::specificity(gl1_confmx)*100,digits = 1)

gl2_confmx <- table(factor(data_val_recoded$glm_2_recoded,levels = c(1,0)),factor(data_val_recoded$cel_dich,levels = c(1,0)))

gl2_sens <- round(caret::sensitivity(gl2_confmx)*100,digits = 1)
gl2_spec <- round(caret::specificity(gl2_confmx)*100,digits = 1)

gl3_confmx <- table(factor(data_val_recoded$glm_3_recoded,levels = c(1,0)),factor(data_val_recoded$cel_dich,levels = c(1,0)))

gl3_sens <- round(caret::sensitivity(gl3_confmx)*100,digits = 1)
gl3_spec <- round(caret::specificity(gl3_confmx)*100,digits = 1)


p1_v <- data_sub_val_clean %>%
  mutate_at(.,.vars="cel_dich",.funs = as.factor) %>% 
  filter(!is.na(cel_dich)) %>% 
  ggplot(aes(x=cel_dich, y=glm_1, color = cel_dich)) +
  geom_violin(color="black",alpha = 0.7,size=0.3,aes(fill=cel_dich)) +
  geom_jitter( shape = 21, size = 3,stroke = 0.6,position=position_jitter(0.07),color="black",aes(fill=cel_dich))+
  scale_fill_manual(values = c("orange","red"),breaks = c(0,1))+
  geom_hline(yintercept = cutoff_1,color="black",linetype="dotted",size=.5)+
  # ggtitle("TRAINING cohort \npredictors: Age + Gender") +
  xlab("") +
  ylab("predicted probability of CEL") +
  ggtitle(paste0("AUC: 73.5%  \nSens = ",gl1_sens,"%"," Spec = ",gl1_spec,"%"))+
  scale_y_continuous(limits = c(0,1.02),labels = scales::percent) +
  scale_x_discrete(label=c("No","Yes"))+
  stat_summary(fun="median", geom = "crossbar",color="black",size=.5,width=0.5)+
  stat_compare_means(method = "wilcox.test", paired = FALSE,label.x = "0")+
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(size=12),
        axis.text = element_text(size=12),
        axis.title = element_text(size=14))

p1_v


p2_v <- data_sub_val_clean %>%
  mutate_at(.,.vars="cel_dich",.funs = as.factor) %>% 
  filter(!is.na(cel_dich)) %>% 
  ggplot(aes(x=cel_dich, y=glm_2, color = cel_dich)) +
  geom_violin(color="black",alpha = 0.7,size=0.3,aes(fill=cel_dich)) +
  geom_jitter( shape = 21, size = 3,stroke = 0.6,position=position_jitter(0.07),color="black",aes(fill=cel_dich))+
  scale_fill_manual(values = c("orange","red"),breaks = c(0,1))+
  geom_hline(yintercept = cutoff_2,color="black",linetype="dotted",size=.5)+
  # ggtitle("TRAINING cohort \npredictors: Age + Gender") +
  xlab("") +
  ylab("predicted probability of CEL") +
  ggtitle(paste0("AUC: 65.1%  \nSens = ",gl2_sens,"%"," Spec = ",gl2_spec,"%"))+
  scale_y_continuous(limits = c(0,1.02),labels = scales::percent) +
  scale_x_discrete(label=c("No","Yes"))+
  stat_summary(fun="median", geom = "crossbar",color="black",size=.5,width=0.5)+
  stat_compare_means(method = "wilcox.test", paired = FALSE,label.x = "0")+
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(size=12),
        axis.text = element_text(size=12),
        axis.title = element_text(size=14))
p2_v

p3_v <- data_sub_val_clean %>%
  mutate_at(.,.vars="cel_dich",.funs = as.factor) %>% 
  filter(!is.na(cel_dich)) %>% 
  ggplot(aes(x=cel_dich, y=glm_3, color = cel_dich)) +
  geom_violin(color="black",alpha = 0.7,size=0.3,aes(fill=cel_dich)) +
  geom_jitter( shape = 21, size = 3,stroke = 0.6,position=position_jitter(0.07),color="black",aes(fill=cel_dich))+
  scale_fill_manual(values = c("orange","red"),breaks = c(0,1))+
  geom_hline(yintercept = cutoff_3,color="black",linetype="dotted",size=.5)+
  # ggtitle("TRAINING cohort \npredictors: Age + Gender") +
  xlab("") +
  ylab("predicted probability of CEL") +
  ggtitle(paste0("AUC: 75.3%  \nSens = ",gl3_sens,"%"," Spec = ",gl3_spec,"%"))+
  scale_y_continuous(limits = c(0,1.02),labels = scales::percent) +
  scale_x_discrete(label=c("No","Yes"))+
  stat_summary(fun="median", geom = "crossbar",color="black",size=.5,width=0.5)+
  stat_compare_means(method = "wilcox.test", paired = FALSE,label.x = "0")+
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(size=12),
        axis.text = element_text(size=12),
        axis.title = element_text(size=14))

p3_v

ggsave(plot = ggarrange(p1,p2,p3,p1_v,p2_v,p3_v,
                        ncol = 3,nrow = 2, common.legend=TRUE, legend="none"),
       filename = "./output/cel_dichotomized_train+val.png", 
       width=15,height=10,units = "in",dpi = 150)


ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="none"),
       filename = "./output/dichotomized_cel_csf_nfl_training.png", width =3.5,height = 3.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p2,ncol = 1,nrow = 1, common.legend=TRUE, legend="none"),
       filename = "./output/dichotomized_cel_serum_nfl_training.png", width =3.5,height = 3.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p3,ncol = 1,nrow = 1, common.legend=TRUE, legend="none"),
       filename = "./output/dichotomized_cel_predicted_csf_nfl_training.png", width =3.5,height =3.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p1_v,ncol = 1,nrow = 1, common.legend=TRUE, legend="none"),
       filename = "./output/dichotomized_cel_csf_nfl_validation.png", width =3.5,height = 3.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p2_v,ncol = 1,nrow = 1, common.legend=TRUE, legend="none"),
       filename = "./output/dichotomized_cel_serum_nfl_validation.png", width =3.5,height = 3.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p3_v,ncol = 1,nrow = 1, common.legend=TRUE, legend="none"),
       filename = "./output/dichotomized_cel_predicted_csf_nfl_validation.png", width =3.5,height = 3.5,units = "in",dpi = 300)


########################################################################################################################################################

########################################################################################################################################################
########################################################################################################################################################
#             logistic regression - predction of CEL status (yes/no) from HD age-adjusted sNFL, cNFL, pred cNFL
########################################################################################################################################################
########################################################################################################################################################

########################################################################################################################################################


# generate dichotomized CEL column
data_sub_train_clean$cel_dich <- NA

data_sub_train_clean$cel_dich[which(data_sub_train_clean$cel==0)] <- 0

data_sub_train_clean$cel_dich[which(data_sub_train_clean$cel>0)] <- 1


# logistic regression model 1 - using measured cNFL
model1 <- glm(cel_dich~log_nfl_csf_age_adj,family=binomial(link="logit"),data = data_sub_train_clean)

# logistic regression model 2 - using measured sNFL
model2 <- glm(cel_dich~log_nfl_serum_age_adj,family=binomial(link="logit"),data = data_sub_train_clean)

# logistic regression model 3 - using predicted cNFL
model3 <- glm(cel_dich~predicted_csf_nfl_age_adj,family=binomial(link="logit"),data = data_sub_train_clean)


# calculate predicted probabilities of cCEL for each model
data_sub_train_clean$glm_1 <- plogis(predict(model1,data_sub_train_clean))
data_sub_train_clean$glm_2 <- plogis(predict(model2,data_sub_train_clean))
data_sub_train_clean$glm_3 <- plogis(predict(model3,data_sub_train_clean))


plot.roc(roc(data_sub_train_clean$cel_dich,data_sub_train_clean$glm_1,percent=TRUE, plot=FALSE, ci=TRUE),print.auc = TRUE)
plot.roc(roc(data_sub_train_clean$cel_dich,data_sub_train_clean$glm_2,percent=TRUE, plot=FALSE, ci=TRUE),print.auc = TRUE)
plot.roc(roc(data_sub_train_clean$cel_dich,data_sub_train_clean$glm_3,percent=TRUE, plot=FALSE, ci=TRUE),print.auc = TRUE)


# generate cut-off for NFL that best differentiates CEL vs noCEL in training cohort

cutoff_1 <- optimalCutoff(data_sub_train_clean$cel_dich,data_sub_train_clean$glm_1)[1]
cutoff_2 <- optimalCutoff(data_sub_train_clean$cel_dich,data_sub_train_clean$glm_2)[1]
cutoff_3 <- optimalCutoff(data_sub_train_clean$cel_dich,data_sub_train_clean$glm_3)[1]

##############################
# NFL cutoff for model 1

# identify the lowest NFL from category 1
test <- data_sub_train_clean %>% 
  filter(glm_1>cutoff_1) 
min <-   min(test$nfl_csf)

# identify the highest NFL from category 0
test <- data_sub_train_clean %>% 
  filter(glm_1<cutoff_1) 
max <-   max(test$nfl_csf)

#calculate average betwen min and max
cnfl_cutoff <- mean(c(min,max))

##############################
# NFL cutoff for model 2

# identify the lowest NFL from category 1
test <- data_sub_train_clean %>% 
  filter(glm_2>cutoff_2) 
min <-   min(test$nfl_serum)

# identify the highest NFL from category 0
test <- data_sub_train_clean %>% 
  filter(glm_2<cutoff_2) 
max <-   max(test$nfl_serum)

#calculate average betwen min and max
snfl_cutoff <- mean(c(min,max))

##############################
# NFL cutoff for model 3

# identify the lowest NFL from category 1
test <- data_sub_train_clean %>% 
  filter(glm_3>cutoff_3) 
min <-   min(test$predicted_csf_nfl)

# identify the highest NFL from category 0
test <- data_sub_train_clean %>% 
  filter(glm_3<cutoff_3) 
max <-   max(test$predicted_csf_nfl)

#calculate average betwen min and max
predicted_cnfl_cutoff <- 10^mean(c(min,max))


# sensitivity and specificity

#glm1
data_train_recoded <- data_sub_train_clean

data_train_recoded$glm_1_recoded <- ifelse(data_train_recoded$glm_1 > cutoff_1, 1,0)
data_train_recoded$glm_2_recoded <- ifelse(data_train_recoded$glm_2 > cutoff_2, 1,0)
data_train_recoded$glm_3_recoded <- ifelse(data_train_recoded$glm_3 > cutoff_3, 1,0)


gl1_confmx <- table(factor(data_train_recoded$glm_1_recoded, levels=c(1,0)),factor(data_train_recoded$cel_dich,levels = c(1,0)))

gl1_sens <- round(caret::sensitivity(gl1_confmx)*100,digits = 1)
gl1_spec <- round(caret::specificity(gl1_confmx)*100,digits = 1)


gl2_confmx <- table(factor(data_train_recoded$glm_2_recoded,levels = c(1,0)),factor(data_train_recoded$cel_dich,levels = c(1,0)))

gl2_sens <- round(caret::sensitivity(gl2_confmx)*100,digits = 1)
gl2_spec <- round(caret::specificity(gl2_confmx)*100,digits = 1)

gl3_confmx <- table(factor(data_train_recoded$glm_3_recoded,levels = c(1,0)),factor(data_train_recoded$cel_dich,levels = c(1,0)))

gl3_sens <- round(caret::sensitivity(gl3_confmx)*100,digits = 1)
gl3_spec <- round(caret::specificity(gl3_confmx)*100,digits = 1)


p1 <- data_sub_train_clean %>%
  mutate_at(.,.vars="cel_dich",.funs = as.factor) %>% 
  filter(!is.na(cel_dich)) %>% 
  ggplot(aes(x=cel_dich, y=glm_1, color = cel_dich)) +
  geom_violin(color="black",alpha = 0.7,size=0.3,aes(fill=cel_dich)) +
  geom_jitter( shape = 21, size = 3,stroke = 0.6,position=position_jitter(0.07),color="black",aes(fill=cel_dich))+
  scale_fill_manual(values = c("orange","red"),breaks = c(0,1))+
  geom_hline(yintercept = cutoff_1,color="black",linetype="dotted",size=.5)+
  # ggtitle("TRAINING cohort \npredictors: Age + Gender") +
  xlab("MRI CEL") +
  ylab("predicted probability of CEL") +
  ggtitle(paste0("AUC: 82.7%  Sens = ",gl1_sens,"%"," Spec = ",gl1_spec,"%"))+
  scale_y_continuous(limits = c(0,1.02),labels = scales::percent) +
  scale_x_discrete(label=c("No","Yes"))+
  stat_summary(fun="median", geom = "crossbar",color="black",size=.5,width=0.5)+
  stat_compare_means(method = "wilcox.test", paired = FALSE,label.x = "0")+
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(size=12),
        axis.text = element_text(size=12))

p1


p2 <- data_sub_train_clean %>%
  mutate_at(.,.vars="cel_dich",.funs = as.factor) %>% 
  filter(!is.na(cel_dich)) %>% 
  ggplot(aes(x=cel_dich, y=glm_2, color = cel_dich)) +
  geom_violin(color="black",alpha = 0.7,size=0.3,aes(fill=cel_dich)) +
  geom_jitter( shape = 21, size = 3,stroke = 0.6,position=position_jitter(0.07),color="black",aes(fill=cel_dich))+
  scale_fill_manual(values = c("orange","red"),breaks = c(0,1))+
  geom_hline(yintercept = cutoff_2,color="black",linetype="dotted",size=.5)+
  # ggtitle("TRAINING cohort \npredictors: Age + Gender") +
  xlab("MRI CEL") +
  ylab("predicted probability of CEL") +
  ggtitle(paste0("AUC: 74.8%  Sens = ",gl2_sens,"%"," Spec = ",gl2_spec,"%"))+
  scale_y_continuous(limits = c(0,1.02),labels = scales::percent) +
  scale_x_discrete(label=c("No","Yes"))+
  stat_summary(fun="median", geom = "crossbar",color="black",size=.5,width=0.5)+
  stat_compare_means(method = "wilcox.test", paired = FALSE,label.x = "0")+
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(size=12),
        axis.text = element_text(size=12))

p2

p3 <- data_sub_train_clean %>%
  mutate_at(.,.vars="cel_dich",.funs = as.factor) %>% 
  filter(!is.na(cel_dich)) %>% 
  ggplot(aes(x=cel_dich, y=glm_3, color = cel_dich)) +
  geom_violin(color="black",alpha = 0.7,size=0.3,aes(fill=cel_dich)) +
  geom_jitter( shape = 21, size = 3,stroke = 0.6,position=position_jitter(0.07),color="black",aes(fill=cel_dich))+
  scale_fill_manual(values = c("orange","red"),breaks = c(0,1))+
  geom_hline(yintercept = cutoff_3,color="black",linetype="dotted",size=.5)+
  # ggtitle("TRAINING cohort \npredictors: Age + Gender") +
  xlab("MRI CEL") +
  ylab("predicted probability of CEL") +
  ggtitle(paste0("AUC: 78.4%  Sens = ",gl3_sens,"%"," Spec = ",gl3_spec,"%"))+
  scale_y_continuous(limits = c(0,1.02),labels = scales::percent) +
  scale_x_discrete(label=c("No","Yes"))+
  stat_summary(fun="median", geom = "crossbar",color="black",size=.5,width=0.5)+
  stat_compare_means(method = "wilcox.test", paired = FALSE,label.x = "0")+
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(size=12),
        axis.text = element_text(size=12))

p3


# predict in validation cohort

data_sub_val_clean$cel_dich <- ifelse(data_sub_val_clean$cel >0,1,0)


# calculate predicted probabilities of cCEL for each model
data_sub_val_clean$glm_1 <- plogis(predict(model1,data_sub_val_clean))
data_sub_val_clean$glm_2 <- plogis(predict(model2,data_sub_val_clean))
data_sub_val_clean$glm_3 <- plogis(predict(model3,data_sub_val_clean))


plot.roc(roc(data_sub_val_clean$cel_dich,data_sub_val_clean$glm_1,percent=TRUE, plot=FALSE, ci=TRUE),print.auc = TRUE)
plot.roc(roc(data_sub_val_clean$cel_dich,data_sub_val_clean$glm_2,percent=TRUE, plot=FALSE, ci=TRUE),print.auc = TRUE)
plot.roc(roc(data_sub_val_clean$cel_dich,data_sub_val_clean$glm_3,percent=TRUE, plot=FALSE, ci=TRUE),print.auc = TRUE)


#glm1
data_val_recoded <- data_sub_val_clean

data_val_recoded$glm_1_recoded <- ifelse(data_val_recoded$glm_1 > cutoff_1, 1,0)
data_val_recoded$glm_2_recoded <- ifelse(data_val_recoded$glm_2 > cutoff_2, 1,0)
data_val_recoded$glm_3_recoded <- ifelse(data_val_recoded$glm_3 > cutoff_3, 1,0)


gl1_confmx <- table(factor(data_val_recoded$glm_1_recoded,levels = c(1,0)),factor(data_val_recoded$cel_dich,levels = c(1,0)))

gl1_sens <- round(caret::sensitivity(gl1_confmx)*100,digits = 1)
gl1_spec <- round(caret::specificity(gl1_confmx)*100,digits = 1)

gl2_confmx <- table(factor(data_val_recoded$glm_2_recoded,levels = c(1,0)),factor(data_val_recoded$cel_dich,levels = c(1,0)))

gl2_sens <- round(caret::sensitivity(gl2_confmx)*100,digits = 1)
gl2_spec <- round(caret::specificity(gl2_confmx)*100,digits = 1)

gl3_confmx <- table(factor(data_val_recoded$glm_3_recoded,levels = c(1,0)),factor(data_val_recoded$cel_dich,levels = c(1,0)))

gl3_sens <- round(caret::sensitivity(gl3_confmx)*100,digits = 1)
gl3_spec <- round(caret::specificity(gl3_confmx)*100,digits = 1)


p1_v <- data_sub_val_clean %>%
  mutate_at(.,.vars="cel_dich",.funs = as.factor) %>% 
  filter(!is.na(cel_dich)) %>% 
  ggplot(aes(x=cel_dich, y=glm_1, color = cel_dich)) +
  geom_violin(color="black",alpha = 0.7,size=0.3,aes(fill=cel_dich)) +
  geom_jitter( shape = 21, size = 3,stroke = 0.6,position=position_jitter(0.07),color="black",aes(fill=cel_dich))+
  scale_fill_manual(values = c("orange","red"),breaks = c(0,1))+
  geom_hline(yintercept = cutoff_1,color="black",linetype="dotted",size=.5)+
  # ggtitle("TRAINING cohort \npredictors: Age + Gender") +
  xlab("MRI CEL") +
  ylab("predicted probability of CEL") +
  ggtitle(paste0("AUC: 78.3%  Sens = ",gl1_sens,"%"," Spec = ",gl1_spec,"%"))+
  scale_y_continuous(limits = c(0,1.02),labels = scales::percent) +
  scale_x_discrete(label=c("No","Yes"))+
  stat_summary(fun="median", geom = "crossbar",color="black",size=.5,width=0.5)+
  stat_compare_means(method = "wilcox.test", paired = FALSE,label.x = "0")+
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(size=12),
        axis.text = element_text(size=12))

p1_v


p2_v <- data_sub_val_clean %>%
  mutate_at(.,.vars="cel_dich",.funs = as.factor) %>% 
  filter(!is.na(cel_dich)) %>% 
  ggplot(aes(x=cel_dich, y=glm_2, color = cel_dich)) +
  geom_violin(color="black",alpha = 0.7,size=0.3,aes(fill=cel_dich)) +
  geom_jitter( shape = 21, size = 3,stroke = 0.6,position=position_jitter(0.07),color="black",aes(fill=cel_dich))+
  scale_fill_manual(values = c("orange","red"),breaks = c(0,1))+
  geom_hline(yintercept = cutoff_2,color="black",linetype="dotted",size=.5)+
  # ggtitle("TRAINING cohort \npredictors: Age + Gender") +
  xlab("MRI CEL") +
  ylab("predicted probability of CEL") +
  ggtitle(paste0("AUC: 78.6%  Sens = ",gl2_sens,"%"," Spec = ",gl2_spec,"%"))+
  scale_y_continuous(limits = c(0,1.02),labels = scales::percent) +
  scale_x_discrete(label=c("No","Yes"))+
  stat_summary(fun="median", geom = "crossbar",color="black",size=.5,width=0.5)+
  stat_compare_means(method = "wilcox.test", paired = FALSE,label.x = "0")+
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(size=12),
        axis.text = element_text(size=12))

p2_v

p3_v <- data_sub_val_clean %>%
  mutate_at(.,.vars="cel_dich",.funs = as.factor) %>% 
  filter(!is.na(cel_dich)) %>% 
  ggplot(aes(x=cel_dich, y=glm_3, color = cel_dich)) +
  geom_violin(color="black",alpha = 0.7,size=0.3,aes(fill=cel_dich)) +
  geom_jitter( shape = 21, size = 3,stroke = 0.6,position=position_jitter(0.07),color="black",aes(fill=cel_dich))+
  scale_fill_manual(values = c("orange","red"),breaks = c(0,1))+
  geom_hline(yintercept = cutoff_3,color="black",linetype="dotted",size=.5)+
  # ggtitle("TRAINING cohort \npredictors: Age + Gender") +
  xlab("MRI CEL") +
  ylab("predicted probability of CEL") +
  ggtitle(paste0("AUC: 84.7%  Sens = ",gl3_sens,"%"," Spec = ",gl3_spec,"%"))+
  scale_y_continuous(limits = c(0,1.02),labels = scales::percent) +
  scale_x_discrete(label=c("No","Yes"))+
  stat_summary(fun="median", geom = "crossbar",color="black",size=.5,width=0.5)+
  stat_compare_means(method = "wilcox.test", paired = FALSE,label.x = "0")+
  theme_classic2() +
  theme(legend.position = "none",
        plot.title = element_text(size=12),
        axis.text = element_text(size=12))

p3_v


ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="none"),
       filename = "./output/dichotomized_cel_csf_nfl_age_adj_training.png", width =4.5,height = 3.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p2,ncol = 1,nrow = 1, common.legend=TRUE, legend="none"),
       filename = "./output/dichotomized_cel_serum_nfl_age_adj_training.png", width =4.5,height = 3.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p3,ncol = 1,nrow = 1, common.legend=TRUE, legend="none"),
       filename = "./output/dichotomized_cel_predicted_csf_nfl_age_adj_training.png", width =4.5,height =3.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p1_v,ncol = 1,nrow = 1, common.legend=TRUE, legend="none"),
       filename = "./output/dichotomized_cel_csf_nfl_age_adj_validation.png", width =4.5,height = 3.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p2_v,ncol = 1,nrow = 1, common.legend=TRUE, legend="none"),
       filename = "./output/dichotomized_cel_serum_nfl_age_adj_validation.png", width =4.5,height = 3.5,units = "in",dpi = 300)
ggsave(plot = ggarrange(p3_v,ncol = 1,nrow = 1, common.legend=TRUE, legend="none"),
       filename = "./output/dichotomized_cel_predicted_csf_nfl_age_adj_validation.png", width =4.5,height = 3.5,units = "in",dpi = 300)


########################################################################################################################################################
########################################################################################################################################################
#             plot SIMOA vs ELISA NFL
########################################################################################################################################################
########################################################################################################################################################

simoa_data <- read_csv("./input/simoa_vs_elisa.csv")

#plot CEL vs CSF NFL

obs <- c("simoa")
obs_nice <- c("cNFL (pg/ml) - SIMOA")


pred <- c("elisa")
pred_nice <- c("cNFL (pg/ml) - ELISA")

for(i in 1:length(obs)){
  # i <- 2
  data <- simoa_data
  df <- data 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1.5, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="darkred",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="darkred",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="darkred",label.y.npc = .77)+
    geom_smooth(method="lm", se=TRUE, size=0.75,color="darkred") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#0171C0"),
          axis.title.y = element_text(colour = "darkblue"))
  
  assign(paste("p",i,sep = ""),p)
}
p1
# Save all plots 4x3

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/simao_vs_elisa.png", width =3,height = 2.5,units = "in",dpi = 300)


########################################################################################################################################################
########################################################################################################################################################
#             plot sNFL-predicted cNFL with diagnoses distinguished in the validation cohort
########################################################################################################################################################
########################################################################################################################################################


# plot residuals
#plot CSF vs Serum NFL 
 
pred <- c("log_nfl_serum","predicted_csf_nfl")
obs <- c("log_nfl_csf","log_nfl_csf")

pred_nice <- c("Log10 sNFL","sNFL-predicted cNFL")
obs_nice <- c("Log10 cNFL", "Log10 cNFL")

col <- c("#C00000","blue")
for(i in 1:length(pred)){
  # i <- 2
  df <- data_sub_val_clean
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(aes(color=diagnosis_simple),alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+.03*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
           max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_color_manual(breaks = c("MS","non-MS"), values = c("orange","darkgray"))+
    # stat_cor(aes(label=..r.label..), size=4,color="black",label.y.npc = 0.99)+
    stat_cor(aes(label=..rr.label..,color=diagnosis_simple), size=4,label.y.npc = .99,label.x.npc = 0)+
    stat_cor(aes(label=..p.label..,color=diagnosis_simple), size=4,label.y.npc = .99, label.x.npc = 0.4)+
    geom_smooth(aes(color=diagnosis_simple),method="lm",se=FALSE, size=1) +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#0171C0"),
          axis.title.y = element_text(colour = col[i]),
          legend.position = "none")
  
  assign(paste("p",i,sep = ""),p)
}

p1
p2


ggsave(plot = p1,filename = "./output/validation_csf_vs_serum_nfl.png", 
       width =3,height = 3,units = "in",dpi = 300)


ggsave(plot = p2,filename = "./output/validation_csf_vs_serum_predicted_cNFL_nfl.png", 
       width =3,height = 3,units = "in",dpi = 300)


pred <- c("log_nfl_serum","predicted_csf_nfl")
obs <- c("log_nfl_csf","log_nfl_csf")

pred_nice <- c("Log10 sNFL","sNFL-predicted cNFL")
obs_nice <- c("Log10 cNFL", "Log10 cNFL")

for(i in 1:length(pred)){
  # i <- 2
  df <- data_sub_val_clean
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="black",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+.03*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
           max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    # stat_cor(aes(label=..r.label..), size=4,color="black",label.y.npc = 0.99)+
    stat_cor(aes(label=..rr.label..),color="brown", size=4,label.y.npc = .99,label.x.npc = 0)+
    stat_cor(aes(label=..p.label..),color="brown", size=4,label.y.npc = .99, label.x.npc = 0.4)+
    geom_smooth(color="brown",method="lm",se=FALSE, size=1) +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#0171C0"),
          axis.title.y = element_text(colour = col[i]))
  
  assign(paste("p",i,sep = ""),p)
}

p1
p2


ggsave(plot = p1,filename = "./output/validation_csf_vs_serum_nfl_all.png", 
       width =3,height = 3,units = "in",dpi = 300)


ggsave(plot = p2,filename = "./output/validation_csf_vs_serum_predicted_cNFL_nfl_all.png", 
       width =3,height = 3,units = "in",dpi = 300)


########################################################################################################################################################
########################################################################################################################################################
#             correlation with severity outcomes in TRAINING cohort
########################################################################################################################################################
########################################################################################################################################################

pred <- c("msdss","msss","armss")
               
pred_nice <- c("MS-DSS","MSSS","ARMSS")
                    
obs <- c("log_nfl_csf","log_nfl_csf","log_nfl_csf")

obs_nice <- c("Log10 cNFL","Log10 cNFL","Log10 cNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean %>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]])) 
  # %>% 
  #   group_by(patientcode) %>% 
  #   arrange(lpdate) %>% 
  #   slice_head() %>% 
  #   ungroup()
  # 
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/csf_nfl_vs_severity_outcomes_training.png", 
       width=2.5,height=7.5,units = "in",dpi = 150)


# serum NFL

pred <- c("msdss","msss","armss")
pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("log_nfl_serum","log_nfl_serum","log_nfl_serum")
obs_nice <- c("Log10 sNFL","Log10 sNFL","Log10 sNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean%>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]])) 
  # %>% 
  #   group_by(patientcode) %>% 
  #   arrange(lpdate) %>% 
  #   slice_head() %>% 
  #   ungroup()
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/serum_nfl_vs_severity_outcomes_training.png", 
       width=2.5,height=7.5,units = "in",dpi = 150)


# predicted cNFL

pred <- c("msdss","msss","armss")

pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("predicted_csf_nfl","predicted_csf_nfl","predicted_csf_nfl")

obs_nice <- c("sNFL-predicted cNFL","sNFL-predicted cNFL","sNFL-predicted cNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean %>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]])) 
  # %>% 
  #   group_by(patientcode) %>% 
  #   arrange(lpdate) %>% 
  #   slice_head() %>% 
  #   ungroup()
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/predicted_csf_nfl_vs_severity_outcomes_training.png", 
       width=2.5,height=7.5,units = "in",dpi = 150)


# age-adjusted cNFL

pred <- c("msdss","msss","armss")

pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("log_nfl_csf_age_adj","log_nfl_csf_age_adj","log_nfl_csf_age_adj")

obs_nice <- c("age-adjusted cNFL","age-adjusted cNFL","age-adjusted cNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean %>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/age-adjusted_csf_nfl_vs_severity_outcomes_training.png", 
       width=2.5,height=7.5,units = "in",dpi = 150)


# age-adjusted sNFL

pred <- c("msdss","msss","armss")

pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("log_nfl_serum_age_adj","log_nfl_serum_age_adj","log_nfl_serum_age_adj")

obs_nice <- c("age-adjusted cNFL","age-adjusted cNFL","age-adjusted cNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean %>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/age-adjusted_serum_nfl_vs_severity_outcomes_training.png", 
       width=2.5,height=7.5,units = "in",dpi = 150)

# age-adjusted sNFL

pred <- c("msdss","msss","armss")

pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("predicted_csf_nfl_age_adj","predicted_csf_nfl_age_adj","predicted_csf_nfl_age_adj")

obs_nice <- c("age-adj sNFL-predicted cNFL","age-adj sNFL-predicted cNFL","age-adj sNFL-predicted cNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean %>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/age-adjusted_snfl_predicted_cNFL_vs_severity_outcomes_training.png", 
       width=2.5,height=7.5,units = "in",dpi = 150)


###################################
#
#  with diagnosis
#
###################################


pred <- c("msdss","msss","armss")

pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("log_nfl_csf","log_nfl_csf","log_nfl_csf")

obs_nice <- c("Log10 cNFL","Log10 cNFL","Log10 cNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean %>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  data$diagnosis <- str_replace_all(string = data$diagnosis,pattern = "PP-MS",replacement = "P-MS")
  data$diagnosis <- str_replace_all(string = data$diagnosis,pattern = "SP-MS",replacement = "P-MS")
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(aes(color=diagnosis),alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.28*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_color_manual(breaks = c("P-MS","RR-MS"), values = c("darkblue","orange"))+
    # stat_cor(aes(label=..r.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..p.label..,color=diagnosis), size=4,label.x.npc = .35, label.y.npc = 1)+
    geom_smooth(aes(color=diagnosis),method="lm",se=TRUE, size=1) +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="none"),
       filename = "./output/csf_nfl_vs_severity_outcomes_training_with_diagnosis.png", 
       width=3.5,height=10,units = "in",dpi = 150)


# serum NFL

pred <- c("msdss","msss","armss")
pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("log_nfl_serum","log_nfl_serum","log_nfl_serum")
obs_nice <- c("Log10 sNFL","Log10 sNFL","Log10 sNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean %>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  
  data$diagnosis <- str_replace_all(string = data$diagnosis,pattern = "PP-MS",replacement = "P-MS")
  data$diagnosis <- str_replace_all(string = data$diagnosis,pattern = "SP-MS",replacement = "P-MS")
  
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(aes(color=diagnosis),alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.28*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_color_manual(breaks = c("P-MS","RR-MS"), values = c("darkblue","orange"))+
    # stat_cor(aes(label=..r.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..p.label..,color=diagnosis), size=4,label.x.npc = .35, label.y.npc = 1)+
    geom_smooth(aes(color=diagnosis),method="lm",se=TRUE, size=1) +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="none"),
       filename = "./output/serum_nfl_vs_severity_outcomes_training_with_diagnosis.png", 
       width=3.5,height=10,units = "in",dpi = 150)

# predicted cNFL

pred <- c("msdss","msss","armss")

pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("predicted_csf_nfl","predicted_csf_nfl","predicted_csf_nfl")

obs_nice <- c("predicted cNFL","predicted cNFL","predicted cNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean %>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  data$diagnosis <- str_replace_all(string = data$diagnosis,pattern = "PP-MS",replacement = "P-MS")
  data$diagnosis <- str_replace_all(string = data$diagnosis,pattern = "SP-MS",replacement = "P-MS")
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(aes(color=diagnosis),alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.28*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_color_manual(breaks = c("P-MS","RR-MS"), values = c("darkblue","orange"))+
    # stat_cor(aes(label=..r.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..p.label..,color=diagnosis), size=4,label.x.npc = .35, label.y.npc = 1)+
    geom_smooth(aes(color=diagnosis),method="lm",se=TRUE, size=1) +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="none"),
       filename = "./output/predicted_csf_nfl_vs_severity_outcomes_training_with_diagnosis.png", 
       width=3.5,height=10,units = "in",dpi = 150)


# serum NFL - SPMS only

pred <- c("msdss","msss","armss")
pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("log_nfl_serum","log_nfl_serum","log_nfl_serum")
obs_nice <- c("Log10 sNFL","Log10 sNFL","Log10 sNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean %>% 
    filter(diagnosis %in% c("SP-MS"))
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]])) 
  # %>% 
  #   filter(log_nfl_serum<2)
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(aes(color=diagnosis),alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..,color=diagnosis), size=4,label.x.npc = .25, label.y.npc = 1)+
    stat_cor(aes(label=..p.label..,color=diagnosis), size=4,label.x.npc = .5, label.y.npc = 1)+
    geom_smooth(aes(color=diagnosis),method="lm",se=TRUE, size=1) +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("p",i,sep = ""),p)
}


# removed top three sNFL samples

for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train_clean %>% 
    filter(diagnosis %in% c("SP-MS"))
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]])) %>% 
    filter(log_nfl_serum<2 & msdss<5)
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(aes(color=diagnosis),alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..,color=diagnosis), size=4,label.x.npc = .25, label.y.npc = 1)+
    stat_cor(aes(label=..p.label..,color=diagnosis), size=4,label.x.npc = .5, label.y.npc = 1)+
    geom_smooth(aes(color=diagnosis),method="lm",se=TRUE, size=1) +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("q",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/serum_nfl_vs_severity_outcomes_training_with_diagnosis_SPMS_only.png", 
       width=5,height=15,units = "in",dpi = 150)


ggsave(plot = ggarrange(q1,q2,q3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/serum_nfl_vs_severity_outcomes_training_with_diagnosis_SPMS_only_noout.png", 
       width=5,height=15,units = "in",dpi = 150)


# corelation of severity outcomes with Upper SC atrophy

pred <- c("msdss","msss","armss")

pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("medulla_ll_atrophy","medulla_ll_atrophy","medulla_ll_atrophy")

obs_nice <- c("Medulla LL&atrophy","Medulla LL&atrophy","Medulla LL&atrophy")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train %>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/medulla_atrophy_vs_severity_outcomes_training.png", 
       width=2.5,height=7.5,units = "in",dpi = 150)


########################################################################################################################################################
########################################################################################################################################################
#             correlation with severity outcomes in VALIDATION cohort
########################################################################################################################################################
########################################################################################################################################################


pred <- c("msdss","msss","armss")
pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("log_nfl_csf","log_nfl_csf","log_nfl_csf")
obs_nice <- c("Log10 cNFL","Log10 cNFL","Log10 cNFL")


for(i in 1:length(obs)){
  # i <- 1
  data <- data_sub_val_clean%>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]])) 
  # %>% 
  #   group_by(patientcode) %>% 
  #   arrange(lpdate) %>% 
  #   slice_head() %>% 
  #   ungroup()
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/csf_nfl_vs_severity_outcomes_validation.png", 
       width=2.5,height=7.5,units = "in",dpi = 150)


# serum NFL

pred <- c("msdss","msss","armss")
pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("log_nfl_serum","log_nfl_serum","log_nfl_serum")
obs_nice <- c("Log10 sNFL","Log10 sNFL","Log10 sNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_val_clean%>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]])) 
  # %>% 
  #   group_by(patientcode) %>% 
  #   arrange(lpdate) %>% 
  #   slice_head() %>% 
  #   ungroup()
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/serum_nfl_vs_severity_outcomes_validation.png", 
       width=2.5,height=7.5,units = "in",dpi = 150)


# predicted cNFL

pred <- c("msdss","msss","armss")

pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("predicted_csf_nfl","predicted_csf_nfl","predicted_csf_nfl")

obs_nice <- c("predicted cNFL","predicted cNFL","predicted cNFL")

for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_val_clean%>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]])) 
  # %>% 
  #   group_by(patientcode) %>% 
  #   arrange(lpdate) %>% 
  #   slice_head() %>% 
  #   ungroup()
  # 
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/predicted_csf_nfl_vs_severity_outcomes_validation.png", 
       width=2.5,height=7.5,units = "in",dpi = 150)


###################################
#
#  with diagnosis
#
###################################


pred <- c("msdss","msss","armss")

pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("log_nfl_csf","log_nfl_csf","log_nfl_csf")

obs_nice <- c("Log10 cNFL","Log10 cNFL","Log10 cNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_val_clean %>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  data$diagnosis <- str_replace_all(string = data$diagnosis,pattern = "PP-MS",replacement = "P-MS")
  data$diagnosis <- str_replace_all(string = data$diagnosis,pattern = "SP-MS",replacement = "P-MS")
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(aes(color=diagnosis),alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.28*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_color_manual(breaks = c("P-MS","RR-MS"), values = c("darkblue","orange"))+
    # stat_cor(aes(label=..r.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..p.label..,color=diagnosis), size=4,label.x.npc = .35, label.y.npc = 1)+
    geom_smooth(aes(color=diagnosis),method="lm",se=TRUE, size=1) +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="none"),
       filename = "./output/csf_nfl_vs_severity_outcomes_validation_with_diagnosis.png", 
       width=3.5,height=10,units = "in",dpi = 150)


# serum NFL

pred <- c("msdss","msss","armss")
pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("log_nfl_serum","log_nfl_serum","log_nfl_serum")
obs_nice <- c("Log10 sNFL","Log10 sNFL","Log10 sNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_val_clean %>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  data$diagnosis <- str_replace_all(string = data$diagnosis,pattern = "PP-MS",replacement = "P-MS")
  data$diagnosis <- str_replace_all(string = data$diagnosis,pattern = "SP-MS",replacement = "P-MS")
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(aes(color=diagnosis),alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.28*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_color_manual(breaks = c("P-MS","RR-MS"), values = c("darkblue","orange"))+
    # stat_cor(aes(label=..r.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..p.label..,color=diagnosis), size=4,label.x.npc = .35, label.y.npc = 1)+
    geom_smooth(aes(color=diagnosis),method="lm",se=TRUE, size=1) +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="none"),
       filename = "./output/serum_nfl_vs_severity_outcomes_validation_with_diagnosis.png", 
       width=3.5,height=10,units = "in",dpi = 150)

# predicted cNFL

pred <- c("msdss","msss","armss")

pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("predicted_csf_nfl","predicted_csf_nfl","predicted_csf_nfl")

obs_nice <- c("predicted cNFL","predicted cNFL","predicted cNFL")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_val_clean %>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  
  data$diagnosis <- str_replace_all(string = data$diagnosis,pattern = "PP-MS",replacement = "P-MS")
  data$diagnosis <- str_replace_all(string = data$diagnosis,pattern = "SP-MS",replacement = "P-MS")
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(aes(color=diagnosis),alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.28*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_color_manual(breaks = c("P-MS","RR-MS"), values = c("darkblue","orange"))+
    # stat_cor(aes(label=..r.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..,color=diagnosis), size=4,label.x.npc = 0, label.y.npc = 1)+
    stat_cor(aes(label=..p.label..,color=diagnosis), size=4,label.x.npc = .35, label.y.npc = 1)+
    geom_smooth(aes(color=diagnosis),method="lm",se=TRUE, size=1) +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())
  
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="none"),
       filename = "./output/predicted_csf_nfl_vs_severity_outcomes_validation_with_diagnosis.png", 
       width=3.5,height=10,units = "in",dpi = 150)


# corelation of severity outcomes with Upper SC atrophy

pred <- c("msdss","msss","armss")

pred_nice <- c("MS-DSS","MSSS","ARMSS")

obs <- c("medulla_ll_atrophy","medulla_ll_atrophy","medulla_ll_atrophy")

obs_nice <- c("Medulla LL&atrophy","Medulla LL&atrophy","Medulla LL&atrophy")


for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_val%>% 
    filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,p3,
                        ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/medulla_atrophy_vs_severity_outcomes_validation.png", 
       width=2.5,height=7.5,units = "in",dpi = 150)


################################################################################
################################################################################
##
##
##  MLR model prediciting MS severity from sNFL
##
##
################################################################################
################################################################################

#######
#MSDSS
#######
severity_data_msdss_train <- data_sub_train %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(msdss))

severity_data_msdss_train_clean <- severity_data_msdss_train[complete.cases(severity_data_msdss_train[confound]),]
#build the model:

fit <- lm(msdss~log_nfl_serum + age + bmi + bmi_weight +
             serum_ap + serum_bun + serum_creatinine, severity_data_msdss_train_clean)

fit_msdss <- lm(msdss~log_nfl_serum, severity_data_msdss_train_clean)
plot(msdss~log_nfl_serum, severity_data_msdss_train_clean)
abline(lm(msdss~log_nfl_serum, severity_data_msdss_train_clean))
summary(fit_msdss)

#stepwise regression
step <- stepAIC(fit, direction="both")

step$anova # display results

best_fit_msdss <- lm(msdss ~ log_nfl_serum + age + bmi + bmi_weight + serum_ap, severity_data_msdss_train_clean)

summary(best_fit_msdss)

# predict msdss from LM and MLM

severity_data_msdss_train_clean$msdss_lm <- fit_msdss$fitted.values
severity_data_msdss_train_clean$msdss_mlm <- predict(object = best_fit_msdss,newdata = severity_data_msdss_train_clean)

# plot


pred <- c("msdss_lm","msdss_mlm")

pred_nice <- c("sNFL-predicted MS-DSS","sNFL-predicted MS-DSS")

obs <- c("msdss","msdss")

obs_nice <- c("measured MS-DSS","measured MS-DSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_msdss_train_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/msdss_LM_vs_MLM_prediction_training.png", 
       width=6,height=2.5,units = "in",dpi = 150)


#VALIDATION cohort

severity_data_msdss_val <- data_sub_val %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(msdss))

severity_data_msdss_val_clean <- severity_data_msdss_val[complete.cases(severity_data_msdss_val[confound]),]

# predict msdss from LM and MLM in the VALIDATION cohort

severity_data_msdss_val_clean$msdss_lm <- predict(object = fit_msdss,newdata = severity_data_msdss_val_clean)
severity_data_msdss_val_clean$msdss_mlm <- predict(object = best_fit_msdss,newdata = severity_data_msdss_val_clean)

# plot


pred <- c("msdss_lm","msdss_mlm")

pred_nice <- c("sNFL-predicted MS-DSS","sNFL-predicted MS-DSS")

obs <- c("msdss","msdss")

obs_nice <- c("measured MS-DSS","measured MS-DSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_msdss_val_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/msdss_LM_vs_MLM_prediction_validation.png", 
       width=6,height=2.5,units = "in",dpi = 150)

########
# MSSS
########


severity_data_msss_train <- data_sub_train %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(msss))

severity_data_msss_train_clean <- severity_data_msss_train[complete.cases(severity_data_msss_train[confound]),]
#build the model:

fit <- lm(msss~log_nfl_serum + age + bmi  + bmi_weight +
            serum_ap + serum_bun + serum_creatinine , severity_data_msss_train_clean)

fit_msss <- lm(msss~log_nfl_serum, severity_data_msss_train_clean)
plot(msss~log_nfl_serum, severity_data_msss_train_clean)
abline(lm(msss~log_nfl_serum, severity_data_msss_train_clean))
summary(fit_msss)

#stepwise regression
step <- stepAIC(fit, direction="both")

step$anova # display results

best_fit_msss <- lm(msss ~ log_nfl_serum + bmi_weight + serum_ap + serum_bun + serum_creatinine, severity_data_msss_train_clean)

summary(best_fit_msss)

# predict msss from LM and MLM

severity_data_msss_train_clean$msss_lm <- fit_msss$fitted.values
severity_data_msss_train_clean$msss_mlm <- predict(object = best_fit_msss,newdata = severity_data_msss_train_clean)

# plot


pred <- c("msss_lm","msss_mlm")

pred_nice <- c("sNFL-predicted MSSS","sNFL-predicted MSSS")

obs <- c("msss","msss")

obs_nice <- c("measured MSSS","measured MSSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_msss_train_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/msss_LM_vs_MLM_prediction_training.png", 
       width=6,height=2.5,units = "in",dpi = 150)


#VALIDATION cohort

severity_data_msss_val <- data_sub_val %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(msss))

severity_data_msss_val_clean <- severity_data_msss_val[complete.cases(severity_data_msss_val[confound]),]

# predict msss from LM and MLM in the VALIDATION cohort

severity_data_msss_val_clean$msss_lm <- predict(object = fit_msss,newdata = severity_data_msss_val_clean)
severity_data_msss_val_clean$msss_mlm <- predict(object = best_fit_msss,newdata = severity_data_msss_val_clean)

# plot


pred <- c("msss_lm","msss_mlm")

pred_nice <- c("sNFL-predicted MSSS","sNFL-predicted MSSS")

obs <- c("msss","msss")

obs_nice <- c("measured MSSS","measured MSSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_msss_val_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/msss_LM_vs_MLM_prediction_validation.png", 
       width=6,height=2.5,units = "in",dpi = 150)


########
# ARMSS
########


severity_data_armss_train <- data_sub_train %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(armss))

severity_data_armss_train_clean <- severity_data_armss_train[complete.cases(severity_data_armss_train[confound]),]
#build the model:

fit <- lm(armss~log_nfl_serum + age + bmi + bmi_weight + serum_ap + serum_bun + serum_creatinine, severity_data_armss_train_clean)

fit_armss <- lm(armss~log_nfl_serum, severity_data_armss_train_clean)
plot(armss~log_nfl_serum, severity_data_armss_train_clean)
abline(lm(armss~log_nfl_serum, severity_data_armss_train_clean))
summary(fit_armss)

#stepwise regression
step <- stepAIC(fit, direction="both")

step$anova # display results

best_fit_armss <- lm(armss ~ log_nfl_serum + age + bmi + bmi_weight + serum_ap + serum_bun + 
                       serum_creatinine, severity_data_armss_train_clean)

summary(best_fit_armss)

# predict armss from LM and MLM

severity_data_armss_train_clean$armss_lm <- fit_armss$fitted.values
severity_data_armss_train_clean$armss_mlm <- predict(object = best_fit_armss,newdata = severity_data_armss_train_clean)

# plot


pred <- c("armss_lm","armss_mlm")

pred_nice <- c("sNFL-predicted ARMSS","sNFL-predicted ARMSS")

obs <- c("armss","armss")

obs_nice <- c("measured ARMSS","measured ARMSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_armss_train_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/armss_LM_vs_MLM_prediction_training.png", 
       width=6,height=2.5,units = "in",dpi = 150)


#VALIDATION cohort

severity_data_armss_val <- data_sub_val %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(armss))

severity_data_armss_val_clean <- severity_data_armss_val[complete.cases(severity_data_armss_val[confound]),]

# predict armss from LM and MLM in the VALIDATION cohort

severity_data_armss_val_clean$armss_lm <- predict(object = fit_armss,newdata = severity_data_armss_val_clean)
severity_data_armss_val_clean$armss_mlm <- predict(object = best_fit_armss,newdata = severity_data_armss_val_clean)

# plot


pred <- c("armss_lm","armss_mlm")

pred_nice <- c("sNFL-predicted ARMSS","sNFL-predicted ARMSS")

obs <- c("armss","armss")

obs_nice <- c("measured ARMSS","measured ARMSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_armss_val_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/armss_LM_vs_MLM_prediction_validation.png", 
       width=6,height=2.5,units = "in",dpi = 150)


###########################
# add PNS variables


#######
#MSDSS
#######
severity_data_msdss_train <- data_sub_train %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(msdss))

confound_pns <- c(confound,"panel_11_17","medulla_ll_atrophy")

severity_data_msdss_train_clean <- severity_data_msdss_train[complete.cases(severity_data_msdss_train[confound_pns]),]
#build the model:

fit <- lm(msdss~log_nfl_serum + age + bmi + bmi_weight +
             + serum_ap + serum_bun + serum_creatinine + panel_11_17 +
            medulla_ll_atrophy, severity_data_msdss_train_clean)

fit_msdss <- lm(msdss~log_nfl_serum, severity_data_msdss_train_clean)
plot(msdss~log_nfl_serum, severity_data_msdss_train_clean)
abline(lm(msdss~log_nfl_serum, severity_data_msdss_train_clean))
summary(fit_msdss)

#stepwise regression
step <- stepAIC(fit, direction="both")

step$anova # display results

best_fit_msdss_pns <- lm(msdss ~ log_nfl_serum + age + bmi + bmi_weight + serum_ap + medulla_ll_atrophy, severity_data_msdss_train_clean)

summary(best_fit_msdss_pns)

# predict msdss from LM and MLM

severity_data_msdss_train_clean$msdss_lm <- fit_msdss$fitted.values
severity_data_msdss_train_clean$msdss_mlm <- predict(object = best_fit_msdss_pns,newdata = severity_data_msdss_train_clean)

# plot


pred <- c("msdss_lm","msdss_mlm")

pred_nice <- c("sNFL-predicted MS-DSS","sNFL-predicted MS-DSS")

obs <- c("msdss","msdss")

obs_nice <- c("measured MS-DSS","measured MS-DSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_msdss_train_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/msdss_LM_vs_MLM_prediction_wPNS_training.png", 
       width=6,height=2.5,units = "in",dpi = 150)


#VALIDATION cohort

severity_data_msdss_val <- data_sub_val %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(msdss))

severity_data_msdss_val_clean <- severity_data_msdss_val[complete.cases(severity_data_msdss_val[confound_pns]),]

# predict msdss from LM and MLM in the VALIDATION cohort

severity_data_msdss_val_clean$msdss_lm <- predict(object = fit_msdss,newdata = severity_data_msdss_val_clean)
severity_data_msdss_val_clean$msdss_mlm <- predict(object = best_fit_msdss_pns,newdata = severity_data_msdss_val_clean)

# plot


pred <- c("msdss_lm","msdss_mlm")

pred_nice <- c("sNFL-predicted MS-DSS","sNFL-predicted MS-DSS")

obs <- c("msdss","msdss")

obs_nice <- c("measured MS-DSS","measured MS-DSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_msdss_val_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/msdss_LM_vs_MLM_prediction_wPNS_validation.png", 
       width=6,height=2.5,units = "in",dpi = 150)

########
# MSSS
########


severity_data_msss_train <- data_sub_train %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(msss))

severity_data_msss_train_clean <- severity_data_msss_train[complete.cases(severity_data_msss_train[confound_pns]),]
#build the model:

fit <- lm(msss~log_nfl_serum + age + bmi + bmi_weight +
             serum_ap + serum_bun + serum_creatinine +
            panel_11_17 + medulla_ll_atrophy, severity_data_msss_train_clean)

fit_msss <- lm(msss~log_nfl_serum, severity_data_msss_train_clean)
plot(msss~log_nfl_serum, severity_data_msss_train_clean)
abline(lm(msss~log_nfl_serum, severity_data_msss_train_clean))
summary(fit_msss)

#stepwise regression
step <- stepAIC(fit, direction="both")

step$anova # display results

best_fit_msss_pns <- lm(msss ~ log_nfl_serum + bmi_weight + serum_ap + serum_bun + serum_creatinine + 
                          panel_11_17 + medulla_ll_atrophy, severity_data_msss_train_clean)

summary(best_fit_msss_pns)

# predict msss from LM and MLM

severity_data_msss_train_clean$msss_lm <- fit_msss$fitted.values
severity_data_msss_train_clean$msss_mlm <- predict(object = best_fit_msss_pns,newdata = severity_data_msss_train_clean)

# plot


pred <- c("msss_lm","msss_mlm")

pred_nice <- c("sNFL-predicted MSSS","sNFL-predicted MSSS")

obs <- c("msss","msss")

obs_nice <- c("measured MSSS","measured MSSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_msss_train_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/msss_LM_vs_MLM_prediction_wPNS_training.png", 
       width=6,height=2.5,units = "in",dpi = 150)


#VALIDATION cohort

severity_data_msss_val <- data_sub_val %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(msss))

severity_data_msss_val_clean <- severity_data_msss_val[complete.cases(severity_data_msss_val[confound_pns]),]

# predict msss from LM and MLM in the VALIDATION cohort

severity_data_msss_val_clean$msss_lm <- predict(object = fit_msss,newdata = severity_data_msss_val_clean)
severity_data_msss_val_clean$msss_mlm <- predict(object = best_fit_msss_pns,newdata = severity_data_msss_val_clean)

# plot


pred <- c("msss_lm","msss_mlm")

pred_nice <- c("sNFL-predicted MSSS","sNFL-predicted MSSS")

obs <- c("msss","msss")

obs_nice <- c("measured MSSS","measured MSSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_msss_val_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/msss_LM_vs_MLM_prediction_wPNS_validation.png", 
       width=6,height=2.5,units = "in",dpi = 150)


########
# ARMSS
########


severity_data_armss_train <- data_sub_train %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(armss))

severity_data_armss_train_clean <- severity_data_armss_train[complete.cases(severity_data_armss_train[confound_pns]),]
#build the model:

fit <- lm(armss~log_nfl_serum + age + bmi + bmi_weight +
            serum_ap + serum_bun + serum_creatinine +
            panel_11_17 + medulla_ll_atrophy, severity_data_armss_train_clean)

fit_armss <- lm(armss~log_nfl_serum, severity_data_armss_train_clean)
plot(armss~log_nfl_serum, severity_data_armss_train_clean)
abline(lm(armss~log_nfl_serum, severity_data_armss_train_clean))
summary(fit_armss)

#stepwise regression
step <- stepAIC(fit, direction="both")

step$anova # display results

best_fit_armss_pns <- lm(armss ~ log_nfl_serum + age + bmi_weight + serum_ap + serum_creatinine + 
                           panel_11_17 + medulla_ll_atrophy, severity_data_armss_train_clean)

summary(best_fit_armss_pns)

# predict armss from LM and MLM

severity_data_armss_train_clean$armss_lm <- fit_armss$fitted.values
severity_data_armss_train_clean$armss_mlm <- predict(object = best_fit_armss_pns,newdata = severity_data_armss_train_clean)

# plot


pred <- c("armss_lm","armss_mlm")

pred_nice <- c("sNFL-predicted ARMSS","sNFL-predicted ARMSS")

obs <- c("armss","armss")

obs_nice <- c("measured ARMSS","measured ARMSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_armss_train_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/armss_LM_vs_MLM_prediction_wPNS_training.png", 
       width=6,height=2.5,units = "in",dpi = 150)


#VALIDATION cohort

severity_data_armss_val <- data_sub_val %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(armss))

severity_data_armss_val_clean <- severity_data_armss_val[complete.cases(severity_data_armss_val[confound_pns]),]

# predict armss from LM and MLM in the VALIDATION cohort

severity_data_armss_val_clean$armss_lm <- predict(object = fit_armss,newdata = severity_data_armss_val_clean)
severity_data_armss_val_clean$armss_mlm <- predict(object = best_fit_armss_pns,newdata = severity_data_armss_val_clean)

# plot


pred <- c("armss_lm","armss_mlm")

pred_nice <- c("sNFL-predicted ARMSS","sNFL-predicted ARMSS")

obs <- c("armss","armss")

obs_nice <- c("measured ARMSS","measured ARMSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_armss_val_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/armss_LM_vs_MLM_prediction_wPNS_validation.png", 
       width=6,height=2.5,units = "in",dpi = 150)

###############################################
############## lolipop plots of t statistics
###############################################


# get Tstat from the MLM model
# start by all variables
tidy_fit <- broom::tidy(fit)
tidy_vars <- tibble::enframe(tidy_fit$term,name = NULL,value = "term")

# MSDSS no PNS
tidy_fit <- broom::tidy(best_fit_msdss)
tidy_fit <- tidy_fit[,c(1,4)]
names(tidy_fit)[2] <- "msdss_nopns"
tidy_vars <- merge(tidy_vars,tidy_fit,by="term",all.x = TRUE)

# MSDSS w PNS
tidy_fit <- broom::tidy(best_fit_msdss_pns)
tidy_fit <- tidy_fit[,c(1,4)]
names(tidy_fit)[2] <- "msdss_wpns"
tidy_vars <- merge(tidy_vars,tidy_fit,by="term",all.x = TRUE)

# MSSS no PNS
tidy_fit <- broom::tidy(best_fit_msss)
tidy_fit <- tidy_fit[,c(1,4)]
names(tidy_fit)[2] <- "msss_nopns"
tidy_vars <- merge(tidy_vars,tidy_fit,by="term",all.x = TRUE)

# MSSS w PNS
tidy_fit <- broom::tidy(best_fit_msss_pns)
tidy_fit <- tidy_fit[,c(1,4)]
names(tidy_fit)[2] <- "msss_wpns"
tidy_vars <- merge(tidy_vars,tidy_fit,by="term",all.x = TRUE)

# ARMSS no PNS
tidy_fit <- broom::tidy(best_fit_armss)
tidy_fit <- tidy_fit[,c(1,4)]
names(tidy_fit)[2] <- "armss_nopns"
tidy_vars <- merge(tidy_vars,tidy_fit,by="term",all.x = TRUE)

# ARMSS w PNS
tidy_fit <- broom::tidy(best_fit_armss_pns)
tidy_fit <- tidy_fit[,c(1,4)]
names(tidy_fit)[2] <- "armss_wpns"
tidy_vars <- merge(tidy_vars,tidy_fit,by="term",all.x = TRUE)

sev_outcomes <- names(tidy_vars)[-1]
terms <- tidy_vars$term

# lolipop plotplot(# Horizontal version
for(i in 1:length(sev_outcomes)){
  # i <- 1
z <- ggplot(tidy_vars,aes_string(y=sev_outcomes[i], x="terms")) +
  geom_segment( aes_string(y=0, yend=sev_outcomes[i],x="terms", xend="terms"), color="skyblue") +
  geom_point( color="blue", size=4, alpha=0.6) +
  theme_light() +
  scale_x_discrete(limits = terms) +
  geom_hline(yintercept = 0,color = "black", linetype="dashed",size=.25) +
  coord_flip() +
  ggtitle(sev_outcomes[i])+
  ylab("t-statistics") +
  xlab("")+
  ylim(c(-8,15))+
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
    
  )
assign(paste("z",i,sep = ""),z)
}


ggsave(plot = ggarrange(z1,z3,z5,ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/vars_severity_predicitions_noPNS.png", 
       width =5,height = 9,units = "in",dpi = 300)


ggsave(plot = ggarrange(z2,z4,z6,ncol = 1,nrow = 3, common.legend=TRUE, legend="bottom"),
       filename = "./output/vars_severity_predicitions_wPNS.png", 
       width =5,height = 9,units = "in",dpi = 300)


###########################
# use PNS variables only to predict severity


#######
#MSDSS
#######
severity_data_msdss_train <- data_sub_train %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(msdss))

confound_pns <- c("panel_11_17","medulla_ll_atrophy")

severity_data_msdss_train_clean <- severity_data_msdss_train[complete.cases(severity_data_msdss_train[confound_pns]),]
#build the model:

fit <- lm(msdss~panel_11_17 +
            medulla_ll_atrophy, severity_data_msdss_train_clean)

fit_msdss <- lm(msdss~log_nfl_serum, severity_data_msdss_train_clean)
plot(msdss~log_nfl_serum, severity_data_msdss_train_clean)
abline(lm(msdss~log_nfl_serum, severity_data_msdss_train_clean))
summary(fit_msdss)

#stepwise regression
step <- stepAIC(fit, direction="both")

step$anova # display results

best_fit_msdss_pns <- lm(msdss ~ panel_11_17 + medulla_ll_atrophy, severity_data_msdss_train_clean)

summary(best_fit_msdss_pns)

# predict msdss from LM and MLM

severity_data_msdss_train_clean$msdss_lm <- fit_msdss$fitted.values
severity_data_msdss_train_clean$msdss_mlm <- predict(object = best_fit_msdss_pns,newdata = severity_data_msdss_train_clean)

# plot


pred <- c("msdss_lm","msdss_mlm")

pred_nice <- c("sNFL-predicted MS-DSS","sNFL-predicted MS-DSS")

obs <- c("msdss","msdss")

obs_nice <- c("measured MS-DSS","measured MS-DSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_msdss_train_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/msdss_LM_vs_MLM_prediction_PNSonly_training.png", 
       width=6,height=2.5,units = "in",dpi = 150)


#VALIDATION cohort

severity_data_msdss_val <- data_sub_val %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(msdss))

severity_data_msdss_val_clean <- severity_data_msdss_val[complete.cases(severity_data_msdss_val[confound_pns]),]

# predict msdss from LM and MLM in the VALIDATION cohort

severity_data_msdss_val_clean$msdss_lm <- predict(object = fit_msdss,newdata = severity_data_msdss_val_clean)
severity_data_msdss_val_clean$msdss_mlm <- predict(object = best_fit_msdss_pns,newdata = severity_data_msdss_val_clean)

# plot


pred <- c("msdss_lm","msdss_mlm")

pred_nice <- c("sNFL-predicted MS-DSS","sNFL-predicted MS-DSS")

obs <- c("msdss","msdss")

obs_nice <- c("measured MS-DSS","measured MS-DSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_msdss_val_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/msdss_LM_vs_MLM_prediction_PNSonly_validation.png", 
       width=6,height=2.5,units = "in",dpi = 150)

########
# MSSS
########


severity_data_msss_train <- data_sub_train %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(msss))

severity_data_msss_train_clean <- severity_data_msss_train[complete.cases(severity_data_msss_train[confound_pns]),]
#build the model:

fit <- lm(msss~
            panel_11_17 + medulla_ll_atrophy, severity_data_msss_train_clean)

fit_msss <- lm(msss~log_nfl_serum, severity_data_msss_train_clean)
plot(msss~log_nfl_serum, severity_data_msss_train_clean)
abline(lm(msss~log_nfl_serum, severity_data_msss_train_clean))
summary(fit_msss)

#stepwise regression
step <- stepAIC(fit, direction="both")

step$anova # display results

best_fit_msss_pns <- lm(msss ~ panel_11_17 + medulla_ll_atrophy, severity_data_msss_train_clean)

summary(best_fit_msss_pns)

# predict msss from LM and MLM

severity_data_msss_train_clean$msss_lm <- fit_msss$fitted.values
severity_data_msss_train_clean$msss_mlm <- predict(object = best_fit_msss_pns,newdata = severity_data_msss_train_clean)

# plot


pred <- c("msss_lm","msss_mlm")

pred_nice <- c("sNFL-predicted MSSS","sNFL-predicted MSSS")

obs <- c("msss","msss")

obs_nice <- c("measured MSSS","measured MSSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_msss_train_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/msss_LM_vs_MLM_prediction_PNSonly_training.png", 
       width=6,height=2.5,units = "in",dpi = 150)


#VALIDATION cohort

severity_data_msss_val <- data_sub_val %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(msss))

severity_data_msss_val_clean <- severity_data_msss_val[complete.cases(severity_data_msss_val[confound_pns]),]

# predict msss from LM and MLM in the VALIDATION cohort

severity_data_msss_val_clean$msss_lm <- predict(object = fit_msss,newdata = severity_data_msss_val_clean)
severity_data_msss_val_clean$msss_mlm <- predict(object = best_fit_msss_pns,newdata = severity_data_msss_val_clean)

# plot


pred <- c("msss_lm","msss_mlm")

pred_nice <- c("sNFL-predicted MSSS","sNFL-predicted MSSS")

obs <- c("msss","msss")

obs_nice <- c("measured MSSS","measured MSSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_msss_val_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/msss_LM_vs_MLM_prediction_PNSonly_validation.png", 
       width=6,height=2.5,units = "in",dpi = 150)


########
# ARMSS
########


severity_data_armss_train <- data_sub_train %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(armss))

severity_data_armss_train_clean <- severity_data_armss_train[complete.cases(severity_data_armss_train[confound_pns]),]
#build the model:

fit <- lm(armss~
            panel_11_17 + medulla_ll_atrophy, severity_data_armss_train_clean)

fit_armss <- lm(armss~log_nfl_serum, severity_data_armss_train_clean)
plot(armss~log_nfl_serum, severity_data_armss_train_clean)
abline(lm(armss~log_nfl_serum, severity_data_armss_train_clean))
summary(fit_armss)

#stepwise regression
step <- stepAIC(fit, direction="both")

step$anova # display results

best_fit_armss_pns <- lm(armss ~ panel_11_17 + medulla_ll_atrophy, severity_data_armss_train_clean)

summary(best_fit_armss_pns)

# predict armss from LM and MLM

severity_data_armss_train_clean$armss_lm <- fit_armss$fitted.values
severity_data_armss_train_clean$armss_mlm <- predict(object = best_fit_armss_pns,newdata = severity_data_armss_train_clean)

# plot


pred <- c("armss_lm","armss_mlm")

pred_nice <- c("sNFL-predicted ARMSS","sNFL-predicted ARMSS")

obs <- c("armss","armss")

obs_nice <- c("measured ARMSS","measured ARMSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_armss_train_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/armss_LM_vs_MLM_prediction_PNSonly_training.png", 
       width=6,height=2.5,units = "in",dpi = 150)


#VALIDATION cohort

severity_data_armss_val <- data_sub_val %>% 
  filter(diagnosis %in% c("RR-MS","PP-MS","SP-MS"))%>% 
  filter(!is.na(armss))

severity_data_armss_val_clean <- severity_data_armss_val[complete.cases(severity_data_armss_val[confound_pns]),]

# predict armss from LM and MLM in the VALIDATION cohort

severity_data_armss_val_clean$armss_lm <- predict(object = fit_armss,newdata = severity_data_armss_val_clean)
severity_data_armss_val_clean$armss_mlm <- predict(object = best_fit_armss_pns,newdata = severity_data_armss_val_clean)

# plot


pred <- c("armss_lm","armss_mlm")

pred_nice <- c("sNFL-predicted ARMSS","sNFL-predicted ARMSS")

obs <- c("armss","armss")

obs_nice <- c("measured ARMSS","measured ARMSS")


for(i in 1:length(obs)){
  # i <- 2
  df <- severity_data_armss_val_clean 
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.15*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+1.2*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]),
           max(df[pred[i]])+0.35*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="#521B93",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#521B93",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#521B93",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#521B93") +
    theme_bw(base_size = 12) 
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,p2,
                        ncol = 2,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/armss_LM_vs_MLM_prediction_PNSonly_validation.png", 
       width=6,height=2.5,units = "in",dpi = 150)


########################################################################################################################################################
########################################################################################################################################################
#
#            PROPENSITY score matching for brain atrophy
#
########################################################################################################################################################
########################################################################################################################################################


#####################################
# isolate training cohort

data_sub_train <- data_sub %>%
  filter(cohort=="train")%>% 
  filter(diagnosis %in% c("RR-MS","SP-MS","PP-MS"))

# generate lm model in the training cohort 

mod_train <- lm(log_nfl_csf~log_nfl_serum,data_sub_train)

# get residuals in the whole cohort

data_sub$serum_csf_nfl_res <- data_sub$log_nfl_csf - predict(mod_train,newdata = data_sub)

# isolate training cohort again

data_sub_train <- data_sub %>%
  filter(cohort=="train")%>% 
  filter(diagnosis %in% c("RR-MS","SP-MS","PP-MS"))

# isolate validation cohort

data_sub_val <- data_sub %>%
  filter(cohort=="val")%>% 
  filter(diagnosis %in% c("RR-MS","SP-MS","PP-MS"))


#plot CSF vs Serum CSF residualsNFL in the training and validation cohort

pred <- c("log_nfl_csf")
obs <- c("log_nfl_serum")

pred_nice <- c("Log10 cNFL")
obs_nice <- c("Log10 sNFL")

for(i in 1:length(obs)){
  # i <- 2
  data <- data_sub_train
  df <- data %>% 
    filter(.,!is.na(data[[pred[i]]]))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+.03*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
           max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="blue",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="blue",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="blue",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1) +
    theme_bw(base_size = 12) +
  theme(axis.title.x = element_text(colour = "#C00000"),
        axis.title.y = element_text(colour = "#0171C0"))

assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/propensity_csf_vs_serum_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)


# get quartiles

data <- data_sub_train

# calculate quartiles
quart <- (quantile(data$serum_csf_nfl_res))

# identify first quartile values
q1_ind <- which(data$serum_csf_nfl_res <= quart[2])

#identify third quartile values 
q3_ind <- which(data$serum_csf_nfl_res >= quart[4])

#create a new column for quartiless
data$quart <- NA
data$quart[q1_ind] <- "q1"
data$quart[q3_ind] <- "q3"

data$placeholder <- "A"


# plot q1 and q3 nly
#make sure this plot is made right after the previous one - otherwise the y-axis limits will not work. 
# WITHOUT IQR points

p <- data %>% 
  filter(!is.na(quart)) %>%
  ggplot(aes(x=log_nfl_serum, y= log_nfl_csf, color=quart)) +
  geom_point(alpha = 0.6, shape = 1, size = 2,stroke = 0.6) +
  ylab("Log10 cNFL ") +
  xlab("Log10 sNFL") +
  ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
         max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
  theme_bw(base_size = 12) +
  geom_smooth(method = "lm",mapping = aes(x=log_nfl_serum, y= log_nfl_csf),color="blue",se=TRUE,data = data_sub_train ) +
  theme(axis.title.y = element_text(colour = "#0171C0"),
        axis.title.x = element_text( colour = "#C00000" ),
        legend.position = "none")

ggsave(plot = ggarrange(p,ncol = 1,nrow = 1),
       filename = "./output/geom_point_propensity_csf_vs_serum_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)

#WITH IQR POINTS

p <- data %>% 
  ggplot(aes(x=log_nfl_serum, y= log_nfl_csf, color=quart)) +
  geom_point(alpha = 0.6, shape = 1, size = 2,stroke = 0.6) +
  ylab("Log10 cNFL ") +
  xlab("Log10 sNFL") +
  ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
         max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
  annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
           x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
           y=min(df[pred[i]])+.03*(max(df[pred[i]])-min(df[pred[i]])),
           size=4,parse = TRUE) +
  theme_bw(base_size = 12) +
  stat_cor(aes(label=..r.label..), size=4,color="black",label.y.npc = 1)+
  stat_cor(aes(label=..rr.label..), size=4,color="black",label.y.npc = .90)+
  stat_cor(aes(label=..p.label..), size=4,color="black",label.y.npc = .77)+
  geom_smooth(method = "lm",mapping = aes(x=log_nfl_serum, y= log_nfl_csf),color="black",size = 0.5,se=FALSE,data = data_sub_train ) +
  theme(axis.title.y = element_text(colour = "#0171C0"),
        axis.title.x = element_text( colour = "#C00000" ),
        legend.position = "none")

ggsave(plot = ggarrange(p,ncol = 1,nrow = 1),
       filename = "./output/geom_point_wIQR_propensity_csf_vs_serum_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)

#BOXPLOT
p1 <- data %>% 
  ggplot(aes(x=factor(0),y=serum_csf_nfl_res)) +
  geom_jitter(alpha = 0.6, shape = 1, size = 1.5,stroke = 0.4,position=position_jitter(0.3),aes(color=quart))+
  geom_boxplot(outlier.colour = "transparent",fill="transparent") +
  theme_bw()+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        legend.position = "none",panel.border = element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), axis.line = element_blank())

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1),
       filename = "./output/boxplot_wIQR_propensity_csf_vs_serum_training.png", width =1.3,height = 1.3,units = "in",dpi = 300)


# boxplots of clean quartiles


data_clean <- data %>%
  filter(!is.na(quart))


df <- data_clean

obs <- c("quart")
obs_nice <- c("NFL Residual Quartiles")
pred <- c("log_nfl_serum")
pred_nice <- c("Log10 sNFL")

for(i in 1:length(obs)){
  
p1 <- df %>% 
  ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
  geom_boxplot(outlier.colour = "transparent") +
  geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(0.07)) +
  # geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
  ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
  xlab(obs_nice[i]) +
  ylab(pred_nice[i]) +
  scale_colour_discrete(labels = c("<Q1", ">Q3"))+
  scale_x_discrete(labels = c("<Q1", ">Q3")) +
  stat_compare_means(method = "t.test", paired = FALSE,
                     label.x = "q1", 
                     label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
  theme_bw(base_size = 12) +
  stat_summary(fun.data = function(x) {
    return(
      data.frame(
        y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
        label = paste('n =',
                      format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                      '\n',
                      'mean =', 
                      format(round(mean(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
               hjust = 0.5,
               vjust = 1, size =3) +
  theme(axis.title.y = element_text( colour = "#C00000" ),
        axis.title.x = element_text( colour = "black" ),
                    legend.position = "none")
}

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1),
       filename = "./output/serum_propensity_csf_vs_serum_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)


# residuals
df <- data_clean

obs <- c("quart")
obs_nice <- c("NFL Residual Quartiles")
pred <- c("serum_csf_nfl_res")
pred_nice <- c("NFL residuals")

for(i in 1:length(obs)){
  
  p1 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(0.07)) +
    # geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    stat_compare_means(method = "t.test", paired = FALSE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'mean =', 
                        format(round(mean(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
  theme_bw(base_size = 12) +
  theme(axis.title.y = element_text( colour = "#C55A11" ),
        axis.title.x = element_text( colour = "black" ),
        legend.position = "none")
}

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1),
       filename = "./output/residulas_propensity_csf_vs_serum_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)


#
#
####
####  correlations of residuals with different outcomes in the training cohort
####
#
#
obs <- c("quart","quart","quart","quart")
obs_nice <- c("NFL Residual Quartiles","NFL Residual Quartiles","NFL Residual Quartiles","NFL Residual Quartiles")

pred <- c("bpfr","atrophy_severity_score","sdmt_age","sdmt")
pred_nice <- c("Brain Parenchymal Fraction","Total Brain Atrophy","SDMT/Age","SDMT") 

cohorts <- c("train","train","train","train")
cohort_nice <- c("TRAINING cohort","TRAINING cohort","TRAINING cohort","TRAINING cohort")

# isolate samples with outcome in training cohort

for(i in 1:length(pred)){
  
  # i <-2
  data <- data_sub %>%
    dplyr::filter(cohort %in% cohorts[i])
  data <- data %>% 
    filter(.,!is.na(data[pred[i]]))%>% 
    filter(diagnosis %in% c("RR-MS","SP-MS","PP-MS"))
  
  # calculate quartiles
  quart <- (quantile(data$serum_csf_nfl_res))
  
  # identify first quartile values
  q1_ind <- which(data$serum_csf_nfl_res <= quart[2])
  
  #identify third quartile values 
  q3_ind <- which(data$serum_csf_nfl_res >= quart[4])
  
  #create a new column for quartiless
  data$quart <- NA
  data$quart[q1_ind] <- "q1"
  data$quart[q3_ind] <- "q3"
  
  data_clean <- data %>%
    filter(!is.na(quart))
  
  
  df <- data_clean
  
  p3 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(width = 0.07,height=0)) +
    # geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    stat_compare_means(method = "wilcox.test", paired = FALSE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'median =', 
                        format(round(median(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
    theme_bw(base_size = 12) +
    theme(legend.position = "none") +
    theme(axis.title.y = element_text( colour = "blue" ))
  
  ggsave(plot = ggarrange(p3,ncol = 1,nrow = 1),
         filename = paste("./output/",pred[i],"_final_plot_",cohorts[i],".png",sep=""), width =2.7,height = 2.5,units = "in",dpi = 300)
  
  
  df <- data_clean
  df$group <- df$quart
  df$group <- df$group=="q1"
  
  df <- df %>% 
    filter(!is.na(df[pred[i]]))
  
  set.seed(1234)
  match.it <- matchit(group~log_nfl_serum,data = df, method="full")
  a <- summary(match.it)
  plot(match.it)
  
  df.match <- match.data(match.it)
  
  df_q1 <- df.match %>%
    dplyr::group_by(subclass) %>%
    arrange(desc(abs(serum_csf_nfl_res))) %>%
    arrange(weights) %>%
    filter(quart=="q1") %>%
    filter(row_number()==1)
  
  df_q3 <- df.match %>%
    dplyr::group_by(subclass) %>%
    arrange(desc(abs(serum_csf_nfl_res))) %>%
    arrange(weights) %>%
    filter(quart=="q3") %>%
    filter(row_number()==1)
  
  df <- rbind(df_q1,df_q3)
  
  df <- df %>%
    dplyr::select(subclass,quart,log_nfl_serum,serum_csf_nfl_res,pred[i]) %>%
    dplyr::arrange(.,subclass)
  
  #plot serum NFL in two quartile groups
  
    
    p4 <- df %>% 
      ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
      geom_boxplot(outlier.colour = "transparent") +
      geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(width = 0.07,height=0)) +
      geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
      ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
      xlab(obs_nice[i]) +
      ylab(pred_nice[i]) +
      geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
      scale_colour_discrete(labels = c("<Q1", ">Q3"))+
      scale_x_discrete(labels = c("<Q1", ">Q3")) +
      stat_compare_means(method = "wilcox.test", paired = TRUE,
                         label.x = "q1", 
                         label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
      theme_bw(base_size = 12) +
      stat_summary(fun.data = function(x) {
        return(
          data.frame(
            y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
            label = paste('n =',
                          format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                          '\n',
                          'median =', 
                          format(round(median(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
          )
        )
      },geom = "text", 
      hjust = 0.5,
      vjust = 1, size =3) +
      theme_bw(base_size = 12) +
      theme(legend.position = "none")+
      theme(axis.title.y = element_text( colour = "blue" ))
    
    ggsave(plot = ggarrange(p4,ncol = 1,nrow = 1),
           filename = paste("./output/",pred[i],"_final_plot_propensity_matched_",cohorts[i],".png",sep=""), width =2.7,height = 2.5,units = "in",dpi = 300)
    
    
}

# boxplots of matched quartiles


obs <- c("quart")
obs_nice <- c("NFL Residual Quartiles")
pred <- c("log_nfl_serum")
pred_nice <- c("Log10 sNFL")

for(i in 1:length(obs)){
  
  p1 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(0.07)) +
    # geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    stat_compare_means(method = "t.test", paired = TRUE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'mean =', 
                        format(round(mean(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
    theme(axis.title.y = element_text( colour = "#C00000" ),
          axis.title.x = element_text( colour = "black" ),
          legend.position = "none")
}

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1),
       filename = "./output/serum_propensity_matched_csf_vs_serum_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)


# residuals

obs <- c("quart")
obs_nice <- c("NFL Residual Quartiles")
pred <- c("serum_csf_nfl_res")
pred_nice <- c("NFL residuals")

for(i in 1:length(obs)){
  
  p1 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(0.07)) +
    # geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    stat_compare_means(method = "t.test", paired = TRUE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'mean =', 
                        format(round(mean(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
    theme_bw(base_size = 12) +
    theme(axis.title.y = element_text( colour = "#C55A11" ),
          axis.title.x = element_text( colour = "black" ),
          legend.position = "none")
}

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1),
       filename = "./output/residulas_propensity_matched_csf_vs_serum_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)


# isolate samples with outcome in validation cohort

# VALIDATION!!!!!!!!

#
#
####
####  correlations of residuals with different outcomes in the validation cohort
####
#
#
obs <- c("quart","quart")
obs_nice <- c("NFL Residual Quartiles","NFL Residual Quartiles")

pred <- c("bpfr","atrophy_severity_score")
pred_nice <- c("Brain Parenchymal Fraction","Total Brain Atrophy") 


cohorts <- c("val","val")
cohort_nice <- c("VALIDATION cohort","VALIDATION cohort")

# isolate samples with outcome in training cohort

for(i in 1:length(pred)){
  
  # i <-1
  data <- data_sub %>%
    dplyr::filter(cohort %in% cohorts[i])
  data <- data %>% 
    filter(.,!is.na(data[pred[i]]))%>% 
    filter(diagnosis %in% c("RR-MS","SP-MS","PP-MS"))
  
  # calculate quartiles
  quart <- (quantile(data$serum_csf_nfl_res))
  
  # identify first quartile values
  q1_ind <- which(data$serum_csf_nfl_res <= quart[2])
  
  #identify third quartile values 
  q3_ind <- which(data$serum_csf_nfl_res >= quart[4])
  
  #create a new column for quartiless
  data$quart <- NA
  data$quart[q1_ind] <- "q1"
  data$quart[q3_ind] <- "q3"
  
  data_clean <- data %>%
    filter(!is.na(quart))
  
  
  df <- data_clean
  
  p3 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(width = 0.07,height=0)) +
    # geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    stat_compare_means(method = "wilcox.test", paired = FALSE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'median =', 
                        format(round(median(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
    theme_bw(base_size = 12) +
    theme(legend.position = "none")
  
  ggsave(plot = ggarrange(p3,ncol = 1,nrow = 1),
         filename = paste("./output/",pred[i],"_final_plot_",cohorts[i],".png",sep=""), width =2.7,height = 2.5,units = "in",dpi = 300)
  
  
  df <- data_clean
  df$group <- df$quart
  df$group <- df$group=="q1"
  
  df <- df %>% 
    filter(!is.na(df[pred[i]]))
  
  set.seed(1234)
  match.it <- matchit(group~log_nfl_serum,data = df, method="full")
  a <- summary(match.it)
  plot(match.it)
  
  df.match <- match.data(match.it)
  
  df_q1 <- df.match %>%
    dplyr::group_by(subclass) %>%
    arrange(desc(abs(serum_csf_nfl_res))) %>%
    arrange(weights) %>%
    filter(quart=="q1") %>%
    filter(row_number()==1)
  
  df_q3 <- df.match %>%
    dplyr::group_by(subclass) %>%
    arrange(desc(abs(serum_csf_nfl_res))) %>%
    arrange(weights) %>%
    filter(quart=="q3") %>%
    filter(row_number()==1)
  
  df <- rbind(df_q1,df_q3)
  
  df <- df %>%
    dplyr::select(subclass,quart,log_nfl_serum,serum_csf_nfl_res,pred[i]) %>%
    dplyr::arrange(.,subclass)
  
  #plot serum NFL in two quartile groups
  
  
  p4 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(width = 0.07,height=0)) +
    geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    stat_compare_means(method = "wilcox.test", paired = TRUE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'median =', 
                        format(round(median(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
    theme_bw(base_size = 12) +
    theme(legend.position = "none")
  
  ggsave(plot = ggarrange(p4,ncol = 1,nrow = 1),
         filename = paste("./output/",pred[i],"_final_plot_propensity_matched_",cohorts[i],".png",sep=""), width =2.7,height = 2.5,units = "in",dpi = 300)
  
  
}


########################################################################################################################################################
########################################################################################################################################################
#
#            PROPENSITY score matching for PNS injury
#
########################################################################################################################################################
########################################################################################################################################################


#####################################
# isolate training cohort

data_sub_train <- data_sub %>%
  filter(cohort=="train")%>% 
  filter(diagnosis %in% c("RR-MS","SP-MS","PP-MS"))

# generate lm model in the training cohort 

mod_train <- lm(log_nfl_serum~log_nfl_csf,data_sub_train)

# get residuals in the whole cohort

data_sub$serum_csf_nfl_res <- data_sub$log_nfl_serum - predict(mod_train,newdata = data_sub)

# isolate training cohort again

data_sub_train <- data_sub %>%
  filter(cohort=="train")%>% 
  filter(diagnosis %in% c("RR-MS","SP-MS","PP-MS"))

# isolate validation cohort

data_sub_val <- data_sub %>%
  filter(cohort=="val")%>% 
  filter(diagnosis %in% c("RR-MS","SP-MS","PP-MS"))


######################
# get quartiles
######################

data <- data_sub_train

# calculate quartiles
quart <- (quantile(data$serum_csf_nfl_res))

# identify first quartile values
q1_ind <- which(data$serum_csf_nfl_res <= quart[2])

#identify third quartile values 
q3_ind <- which(data$serum_csf_nfl_res >= quart[4])

#create a new column for quartiless
data$quart <- NA
data$quart[q1_ind] <- "q1"
data$quart[q3_ind] <- "q3"

data$placeholder <- "A"

#plot Serum vs CSF NFL in the training and validation cohort

pred <- c("log_nfl_serum")
obs <- c("log_nfl_csf")

pred_nice <- c("Log10 sNFL")
obs_nice <- c("Log10 cNFL")

for(i in 1:length(obs)){
  # i <- 2
  
  df <- data_sub_train %>% 
    filter(.,!is.na(data[[pred[i]]]))
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[pred[i]])+.03*(max(df[pred[i]])-min(df[pred[i]])),
             size=4,parse = TRUE) +
    ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
           max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    stat_cor(aes(label=..r.label..), size=4,color="blue",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="blue",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="blue",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1) +
    theme_bw(base_size = 12) +
    theme(axis.title.y = element_text(colour = "#C00000"),
          axis.title.x = element_text(colour = "#0171C0"))
  
  assign(paste("p",i,sep = ""),p)
}


ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/propensity_serum_vs_csf_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)


# plot q1 and q3 nly

p <- data %>% 
  filter(!is.na(quart)) %>%
  ggplot(aes(y=log_nfl_serum, x= log_nfl_csf, color=quart)) +
  geom_point(alpha = 0.6, shape = 1, size = 2,stroke = 0.6) +
  ylab("Log10 sNFL ") +
  xlab("Log10 cNFL") +
  theme_bw(base_size = 12) +
  ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
         max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
  geom_smooth(method = "lm",mapping = aes(y=log_nfl_serum, x= log_nfl_csf),color="blue",se=TRUE,data = data_sub_train ) +
  theme(axis.title.x = element_text(colour = "#0171C0"),
        axis.title.y = element_text(colour = "#C00000"),
        legend.position = "none")

ggsave(plot = ggarrange(p,ncol = 1,nrow = 1),
       filename = "./output/geom_point_propensity_csf_vs_serum_PNS_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)

#WITH IQR POINTS

p <- data %>% 
  ggplot(aes(y=log_nfl_serum, x= log_nfl_csf,color=quart)) +
  geom_point(alpha = 0.4, shape = 1, size = 2,stroke = 0.6)+
  xlab("Log10 cNFL ") +
  ylab("Log10 sNFL") +
  ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
         max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
  annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
           x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
           y=min(df[pred[i]])+.03*(max(df[pred[i]])-min(df[pred[i]])),
           size=4,parse = TRUE) +
  theme_bw(base_size = 12) +
  stat_cor(aes(label=..r.label..), size=4,color="black",label.y.npc = 1)+
  stat_cor(aes(label=..rr.label..), size=4,color="black",label.y.npc = .90)+
  stat_cor(aes(label=..p.label..), size=4,color="black",label.y.npc = .77)+
  geom_smooth(method = "lm",mapping = aes(y=log_nfl_serum, x= log_nfl_csf),color="black",size = 0.5,se=FALSE,data = data ) +
  theme(axis.title.y = element_text(colour = "#C00000"),
        axis.title.x = element_text(colour = "#0171C0"),
        legend.position = "none")


ggsave(plot = ggarrange(p,ncol = 1,nrow = 1),
       filename = "./output/geom_point_wIQR_propensity_serum_vs_csf_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)


#BOXPLOT
p1 <- data %>% 
  ggplot(aes(x=factor(0),y=serum_csf_nfl_res)) +
  geom_jitter(alpha = 0.6, shape = 1, size = 1.5,stroke = 0.4,position=position_jitter(0.3),aes(color=quart))+
  geom_boxplot(outlier.colour = "transparent",fill="transparent") +
  theme_bw()+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        legend.position = "none",panel.border = element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), axis.line = element_blank())

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1),
       filename = "./output/boxplot_wIQR_propensity_serum_vs_csf_training.png", width =1.3,height = 1.3,units = "in",dpi = 300)


# boxplots of clean quartiles


data_clean <- data %>%
  filter(!is.na(quart))


df <- data_clean

obs <- c("quart")
obs_nice <- c("NFL Residual Quartiles")
pred <- c("log_nfl_csf")
pred_nice <- c("Log10 cNFL")

for(i in 1:length(obs)){
  
  p1 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(0.07)) +
    # geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    stat_compare_means(method = "t.test", paired = FALSE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'mean =', 
                        format(round(mean(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
    theme(axis.title.y = element_text( colour = "#0171C0" ),
          axis.title.x = element_text( colour = "black" ),
          legend.position = "none")
}

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1),
       filename = "./output/serum_propensity_csf_vs_serum_PNS_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)


# residuals
df <- data_clean

obs <- c("quart")
obs_nice <- c("NFL Residual Quartiles")
pred <- c("serum_csf_nfl_res")
pred_nice <- c("NFL residuals")

for(i in 1:length(obs)){
  
  p1 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(0.07)) +
    # geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    stat_compare_means(method = "t.test", paired = FALSE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'mean =', 
                        format(round(mean(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
    theme_bw(base_size = 12) +
    theme(axis.title.y = element_text( colour = "#C55A11" ),
          axis.title.x = element_text( colour = "black" ),
          legend.position = "none")
}

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1),
       filename = "./output/residulas_propensity_csf_vs_serum_PNS_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)


#
#
####
####  correlations of residuals with different outcomes in the training cohort
####
#
#
obs <- c("quart","quart","quart","quart")
obs_nice <- c("NFL Residual Quartiles","NFL Residual Quartiles","NFL Residual Quartiles","NFL Residual Quartiles")

pred <- c("medulla_ll_atrophy","panel_11_17","panel_11","panel_17")
pred_nice <- c("LL + Atrophy \nof Medulla & Upper CS",
               "NeurEx - \nMuscle Atrophy + BBSA",
               "NeurEx - \nMuscle Atrophy",
               "NeurEx - \nBBSA") 

cohorts <- c("train","train","train","train")
cohort_nice <- c("TRAINING cohort","TRAINING cohort","TRAINING cohort","TRAINING cohort")

# isolate samples with outcome in training cohort

for(i in 1:length(pred)){
  
  # i <-1
  data <- data_sub %>%
    dplyr::filter(cohort %in% cohorts[i])
  data <- data %>% 
    filter(.,!is.na(data[pred[i]]))%>% 
    filter(diagnosis %in% c("RR-MS","SP-MS","PP-MS"))
  
  # calculate quartiles
  quart <- (quantile(data$serum_csf_nfl_res))
  
  # identify first quartile values
  q1_ind <- which(data$serum_csf_nfl_res <= quart[2])
  
  #identify third quartile values 
  q3_ind <- which(data$serum_csf_nfl_res >= quart[4])
  
  #create a new column for quartiless
  data$quart <- NA
  data$quart[q1_ind] <- "q1"
  data$quart[q3_ind] <- "q3"
  
  data_clean <- data %>%
    filter(!is.na(quart))
  
  
  df <- data_clean
  
  p3 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(width = 0.07,height=0)) +
    # geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    stat_compare_means(method = "wilcox.test", paired = FALSE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'median =', 
                        format(round(median(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
    theme_bw(base_size = 12) +
    theme(legend.position = "none")+
    theme(axis.title.y = element_text( colour = "blue" ))
  
  ggsave(plot = ggarrange(p3,ncol = 1,nrow = 1),
         filename = paste("./output/",pred[i],"_final_plot_",cohorts[i],".png",sep=""), width =2.7,height = 2.5,units = "in",dpi = 300)
  
  
  df <- data_clean
  df$group <- df$quart
  df$group <- df$group=="q1"
  
  df <- df %>% 
    filter(!is.na(df[pred[i]]))
  
  set.seed(1234)
  match.it <- matchit(group~log_nfl_csf,data = df, method="full")
  a <- summary(match.it)
  plot(match.it)
  
  df.match <- match.data(match.it)
  
  df_q1 <- df.match %>%
    dplyr::group_by(subclass) %>%
    arrange(desc(abs(serum_csf_nfl_res))) %>%
    arrange(weights) %>%
    filter(quart=="q1") %>%
    filter(row_number()==1)
  
  df_q3 <- df.match %>%
    dplyr::group_by(subclass) %>%
    arrange(desc(abs(serum_csf_nfl_res))) %>%
    arrange(weights) %>%
    filter(quart=="q3") %>%
    filter(row_number()==1)
  
  df <- rbind(df_q1,df_q3)
  
  df <- df %>%
    dplyr::select(subclass,quart,log_nfl_csf,serum_csf_nfl_res,pred[i]) %>%
    dplyr::arrange(.,subclass)
  
  #plot serum NFL in two quartile groups
  
  
  p4 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(width = 0.07,height=0)) +
    geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    stat_compare_means(method = "wilcox.test", paired = TRUE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'median =', 
                        format(round(median(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
    theme_bw(base_size = 12) +
    theme(legend.position = "none")+
    theme(axis.title.y = element_text( colour = "blue" ))
  
  ggsave(plot = ggarrange(p4,ncol = 1,nrow = 1),
         filename = paste("./output/",pred[i],"_final_plot_propensity_matched_",cohorts[i],".png",sep=""), width =2.7,height = 2.5,units = "in",dpi = 300)
  
  
}


# MATCHED boxplots of clean quartiles


obs <- c("quart")
obs_nice <- c("NFL Residual Quartiles")
pred <- c("log_nfl_csf")
pred_nice <- c("Log10 cNFL")

for(i in 1:length(obs)){
  
  p1 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(0.07)) +
    # geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    stat_compare_means(method = "t.test", paired = TRUE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'mean =', 
                        format(round(mean(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
    theme(axis.title.y = element_text( colour = "#0171C0" ),
          axis.title.x = element_text( colour = "black" ),
          legend.position = "none")
}

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1),
       filename = "./output/serum_propensity_matched_csf_vs_serum_PNS_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)


# residuals

obs <- c("quart")
obs_nice <- c("NFL Residual Quartiles")
pred <- c("serum_csf_nfl_res")
pred_nice <- c("NFL residuals")

for(i in 1:length(obs)){
  
  p1 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(0.07)) +
    # geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    stat_compare_means(method = "t.test", paired = TRUE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'mean =', 
                        format(round(mean(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
    theme_bw(base_size = 12) +
    theme(axis.title.y = element_text( colour = "#C55A11" ),
          axis.title.x = element_text( colour = "black" ),
          legend.position = "none")
}

ggsave(plot = ggarrange(p1,ncol = 1,nrow = 1),
       filename = "./output/residulas_propensity_matched_csf_vs_serum_PNS_training.png", width =2.7,height = 2.5,units = "in",dpi = 300)


# isolate samples with outcome in validation cohort

# VALIDATION!!!!!!!!

#
#
####
####  correlations of residuals with different outcomes in the validation cohort
####
#
#
obs <- c("quart","quart","quart","quart")
obs_nice <- c("NFL Residual Quartiles","NFL Residual Quartiles","NFL Residual Quartiles","NFL Residual Quartiles")

pred <- c("medulla_ll_atrophy","panel_11_17","panel_11","panel_17")
pred_nice <- c("LL + Atrophy \nof Medulla & Upper CS",
               "NeurEx - \nMuscle Atrophy + BBSA",
               "NeurEx - \nMuscle Atrophy",
               "NeurEx - \nBBSA") 

cohorts <- c("val","val","val","val")
cohort_nice <- c("VALIDATION cohort","VALIDATION cohort","VALIDATION cohort","VALIDATION cohort")

# isolate samples with outcome in training cohort

for(i in 1:length(pred)){
  
  # i <-1
  data <- data_sub %>%
    dplyr::filter(cohort %in% cohorts[i])
  data <- data %>% 
    filter(.,!is.na(data[pred[i]]))%>% 
    filter(diagnosis %in% c("RR-MS","SP-MS","PP-MS"))
  
  # calculate quartiles
  quart <- (quantile(data$serum_csf_nfl_res))
  
  # identify first quartile values
  q1_ind <- which(data$serum_csf_nfl_res <= quart[2])
  
  #identify third quartile values 
  q3_ind <- which(data$serum_csf_nfl_res >= quart[4])
  
  #create a new column for quartiless
  data$quart <- NA
  data$quart[q1_ind] <- "q1"
  data$quart[q3_ind] <- "q3"
  
  data_clean <- data %>%
    filter(!is.na(quart))
  
  
  df <- data_clean
  
  p3 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(width = 0.07,height=0)) +
    # geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    stat_compare_means(method = "wilcox.test", paired = FALSE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'median =', 
                        format(round(median(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
    theme_bw(base_size = 12) +
    theme(legend.position = "none")
  
  ggsave(plot = ggarrange(p3,ncol = 1,nrow = 1),
         filename = paste("./output/",pred[i],"_final_plot_",cohorts[i],".png",sep=""), width =2.7,height = 2.5,units = "in",dpi = 300)
  
  
  df <- data_clean
  df$group <- df$quart
  df$group <- df$group=="q1"
  
  df <- df %>% 
    filter(!is.na(df[pred[i]]))
  
  set.seed(1234)
  match.it <- matchit(group~log_nfl_csf,data = df, method="full")
  a <- summary(match.it)
  plot(match.it)
  
  df.match <- match.data(match.it)
  
  df_q1 <- df.match %>%
    dplyr::group_by(subclass) %>%
    arrange(desc(abs(serum_csf_nfl_res))) %>%
    arrange(weights) %>%
    filter(quart=="q1") %>%
    filter(row_number()==1)
  
  df_q3 <- df.match %>%
    dplyr::group_by(subclass) %>%
    arrange(desc(abs(serum_csf_nfl_res))) %>%
    arrange(weights) %>%
    filter(quart=="q3") %>%
    filter(row_number()==1)
  
  df <- rbind(df_q1,df_q3)
  
  df <- df %>%
    dplyr::select(subclass,quart,log_nfl_serum,serum_csf_nfl_res,pred[i]) %>%
    dplyr::arrange(.,subclass)
  
  #plot serum NFL in two quartile groups
  
  
  p4 <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i], color = "quart")) +
    geom_boxplot(outlier.colour = "transparent") +
    geom_jitter(alpha = 0.6, shape = 1, size = 2,stroke = 0.6,position=position_jitter(width = 0.07,height=0)) +
    geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    ylim(c(min(df[pred[i]])-0.25*(max(df[pred[i]])-min(df[pred[i]])),max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    geom_line(aes(group = subclass),color="gray", alpha = 0.6,size=0.3) +
    scale_colour_discrete(labels = c("<Q1", ">Q3"))+
    scale_x_discrete(labels = c("<Q1", ">Q3")) +
    stat_compare_means(method = "wilcox.test", paired = TRUE,
                       label.x = "q1", 
                       label.y = max(df[pred[i]])+0.1*(max(df[pred[i]])-min(df[pred[i]])) )+
    theme_bw(base_size = 12) +
    stat_summary(fun.data = function(x) {
      return(
        data.frame(
          y = min(df[pred[i]])-0.05*(max(df[pred[i]])-min(df[pred[i]])),
          label = paste('n =',
                        format(length(x), big.mark = ",", decimal.mark = ".", scientific = FALSE), 
                        '\n',
                        'median =', 
                        format(round(median(x), 2), big.mark = ",", decimal.mark = ".", scientific = FALSE))
        )
      )
    },geom = "text", 
    hjust = 0.5,
    vjust = 1, size =3) +
    theme_bw(base_size = 12) +
    theme(legend.position = "none")
  
  ggsave(plot = ggarrange(p4,ncol = 1,nrow = 1),
         filename = paste("./output/",pred[i],"_final_plot_propensity_matched_",cohorts[i],".png",sep=""), width =2.7,height = 2.5,units = "in",dpi = 300)
  
  
}

#
#
#


########################################################################################################################################################
########################################################################################################################################################
#
#            check the if incorporation of PNS injury variable decreases variance explained by patientcode
#
########################################################################################################################################################
########################################################################################################################################################

#####################################
# isolate training cohort

vars_model <- c("log_nfl_csf","log_nfl_serum","age","bmi_weight",
                "serum_creatinine","serum_ap","serum_bun","medulla_ll_atrophy","panel_11_17")

confound <- c("age","bmi_weight",
              "serum_creatinine","serum_ap","serum_bun","medulla_ll_atrophy","panel_11_17")

data_sub_train <- data_sub %>%
  filter(cohort=="train")%>% 
  filter(diagnosis %in% c("RR-MS","SP-MS","PP-MS"))%>% 
  select(patientcode,lpdate,all_of(vars_model))
  
data_sub_train_clean <- data_sub_train[complete.cases(data_sub_train[confound]),]


# generate lm model in the training cohort 

mod_train <- lm(log_nfl_serum~log_nfl_csf,data_sub_train_clean)

summary(mod_train)

#MLM
#build the model:

fit <- lm(log_nfl_serum~log_nfl_csf + age + serum_creatinine + 
              bmi_weight + serum_ap + serum_bun, data_sub_train_clean)

#stepwise regression
step <- stepAIC(fit, direction="both")

step$anova # display results

best_fit <- lm(log_nfl_serum ~ log_nfl_csf + age + serum_creatinine + bmi_weight + 
                 serum_ap + serum_bun, data_sub_train_clean)

summary(best_fit)


#########################
##### add PNS variables
#########################


#MLM
#build the model:

fit_pns <- lm(log_nfl_serum~log_nfl_csf + age + serum_creatinine
              + bmi_weight + serum_ap + serum_bun + medulla_ll_atrophy + panel_11_17, data_sub_train_clean)

#stepwise regression
step <- stepAIC(fit_pns, direction="both")

step$anova # display results

best_fit_pns <- lm(log_nfl_serum ~ log_nfl_csf + age + serum_creatinine + bmi_weight + 
                     serum_ap + serum_bun + medulla_ll_atrophy + panel_11_17, data_sub_train_clean)

summary(best_fit_pns)


#####################################
# isolate validation cohort

data_sub_val <- data_sub %>%
  filter(cohort=="val")%>% 
  filter(diagnosis %in% c("RR-MS","SP-MS","PP-MS"))%>% 
  select(patientcode,lpdate,all_of(vars_model))

data_sub_val_clean <- data_sub_val[complete.cases(data_sub_val[confound]),]


################################################################################################
####  predict and plot
################################################################################################

#### TRAINING COHORT

#predict from simple LM
data_sub_train_clean$lm_pred_sNFL <- predict(mod_train,data_sub_train_clean)

#predict from MLM
data_sub_train_clean$mlm_pred_sNFL <- predict(best_fit,data_sub_train_clean)

#predict from PNS-MLM
data_sub_train_clean$mlm_pns_pred_sNFL <- predict(best_fit_pns,data_sub_train_clean)

# plot

pred <- c("lm_pred_sNFL","mlm_pred_sNFL","mlm_pns_pred_sNFL")
obs <- c("log_nfl_serum","log_nfl_serum","log_nfl_serum")

pred_nice <- c("cNFL-predicted sNFL","cNFL-predicted sNFL","cNFL-predicted sNFL")
obs_nice <- c("measured Log10 sNFL","measured Log10 sNFL","measured Log10 sNFL")


for(i in 1:length(pred)){
  # i <-2 
  df <- data_sub_train_clean
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[obs[i]])+.12*(max(df[obs[i]])-min(df[obs[i]])),
             size=4,parse = TRUE) +
    annotate("text",label = paste("CCC==",
                                  as.numeric(round(epi.ccc(as.numeric(df[[obs[i]]]),as.numeric(df[[pred[i]]]),
                                                           ci = "z-transform",conf.level = 0.95)$rho.c[1],
                                                   digits = 2)),sep=""),
             x=max(df[obs[i]])-.3*(max(df[obs[i]])-min(df[obs[i]])),
             y=max(df[obs[i]])-.01*(max(df[obs[i]])-min(df[obs[i]])),
             size=4,parse = TRUE,color="blue") +
    # ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
    #        max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    ylim(c(min(df[obs[i]]),max(df[obs[i]])))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    geom_abline(slope = 1,intercept = 0,color="blue",size=0.5) +
    stat_cor(aes(label=..r.label..), size=4,color="#008F00",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#008F00",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#008F00",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#008F00") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#C00000"),
          axis.title.y = element_text(colour = "#008F00"))
  
  assign(paste("p",i,sep = ""),p)
}

ggsave(plot = ggarrange(p1,p2,p3,ncol = 3,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/predicted_sNFL_wPNS_training.png", 
       width =9,height = 2.2,units = "in",dpi = 300)

###############################################################
#### VALIDATION COHORT
###############################################################

#predict from simple LM
data_sub_val_clean$lm_pred_sNFL <- predict(mod_train,data_sub_val_clean)

#predict from MLM
data_sub_val_clean$mlm_pred_sNFL <- predict(best_fit,data_sub_val_clean)

#predict from PNS-MLM
data_sub_val_clean$mlm_pns_pred_sNFL <- predict(best_fit_pns,data_sub_val_clean)

# plot

pred <- c("lm_pred_sNFL","mlm_pred_sNFL","mlm_pns_pred_sNFL")
obs <- c("log_nfl_serum","log_nfl_serum","log_nfl_serum")

pred_nice <- c("cNFL-predicted sNFL","cNFL-predicted sNFL","cNFL-predicted sNFL")
obs_nice <- c("measured Log10 sNFL","measured Log10 sNFL","measured Log10 sNFL")


for(i in 1:length(pred)){
  # i <-2 
  df <- data_sub_val_clean
  
  p <- df %>% 
    ggplot(aes_string(x=obs[i], y=pred[i])) +
    geom_point(color="#001321",alpha = 0.4, shape = 1, size = 2,stroke = 0.6) +
    annotate("text",label = paste0('atop(n[s] ==', nrow(df[obs[i]]), ', n[p] ==', length(unique(df$patientcode)), ')'),
             x=max(df[obs[i]])-.1*(max(df[obs[i]])-min(df[obs[i]])),
             y=min(df[obs[i]])+.12*(max(df[obs[i]])-min(df[obs[i]])),
             size=4,parse = TRUE) +
    annotate("text",label = paste("CCC==",
                                  as.numeric(round(epi.ccc(as.numeric(df[[obs[i]]]),as.numeric(df[[pred[i]]]),
                                                           ci = "z-transform",conf.level = 0.95)$rho.c[1],
                                                   digits = 2)),sep=""),
             x=max(df[obs[i]])-.3*(max(df[obs[i]])-min(df[obs[i]])),
             y=max(df[obs[i]])-.01*(max(df[obs[i]])-min(df[obs[i]])),
             size=4,parse = TRUE,color="blue") +
    # ylim(c(min(df[pred[i]]-0.1*(max(df[pred[i]])-min(df[pred[i]]))),
    #        max(df[pred[i]])+0.2*(max(df[pred[i]])-min(df[pred[i]]))))+
    ylim(c(min(df[obs[i]]),max(df[obs[i]])))+
    xlab(obs_nice[i]) +
    ylab(pred_nice[i]) +
    geom_abline(slope = 1,intercept = 0,color="blue",size=0.5) +
    stat_cor(aes(label=..r.label..), size=4,color="#008F00",label.y.npc = 1)+
    stat_cor(aes(label=..rr.label..), size=4,color="#008F00",label.y.npc = .90)+
    stat_cor(aes(label=..p.label..), size=4,color="#008F00",label.y.npc = .77)+
    geom_smooth(method="lm",se=TRUE, size=1,color="#008F00") +
    theme_bw(base_size = 12) +
    theme(axis.title.x = element_text(colour = "#C00000"),
          axis.title.y = element_text(colour = "#008F00"))
  
  assign(paste("p",i,sep = ""),p)
}

ggsave(plot = ggarrange(p1,p2,p3,ncol = 3,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/predicted_sNFL_wPNS_validation.png", 
       width =9,height = 2.2,units = "in",dpi = 300)

summary(mod_train)
summary(best_fit)
summary(best_fit_pns)


#plot lolipop plots for t-values variables 

# define dataset:
data <- tibble::enframe(confound,name = NULL,value = "term")
data$term_nice <- c("Age","Weight","Creatinine","AP","BUN","MRI SC Atrophy","Muscle Atrophy+BBSA")

# get Tstat from the MLM model
tidy_fit <- broom::tidy(best_fit)
tidy_fit <- tidy_fit[-c(1:2),]
tidy_fit$statistic_mlm <- tidy_fit$statistic
tidy_fit <- tidy_fit %>% 
  select(term,statistic_mlm)
#merge with data
data <- merge(data,tidy_fit,by="term",all.x = TRUE)


# get Tstat from the MLM_PNS model
tidy_fit <- broom::tidy(best_fit_pns)
tidy_fit <- tidy_fit[-c(1:2),]
tidy_fit$statistic_mlm_pns <- tidy_fit$statistic
tidy_fit <- tidy_fit %>% 
  select(term,statistic_mlm_pns)
#merge with data
data <- merge(data,tidy_fit,by="term",all.x = TRUE)

data$statistic <- NA

# lolipop plotplot(# Horizontal version
z <- ggplot(data, aes(y=statistic_mlm, x=term_nice)) +
  geom_segment( aes(y=0, yend=statistic_mlm,x=term_nice, xend=term_nice), color="skyblue") +
  geom_point( color="blue", size=4, alpha=0.6) +
  theme_light() +
  scale_x_discrete(limits = c("Weight","Muscle Atrophy+BBSA","MRI SC Atrophy","Creatinine","BUN","AP","Age")) +
  geom_hline(yintercept = 0,color = "black", linetype="dashed",size=.25) +
  coord_flip() +
  ylab("t-statistics") +
  xlab("")+
  ylim(c(-7.5,9.5))+
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
    
  )

ggsave(plot = ggarrange(z,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/vars_lolipop_MLM.png", 
       width =3.5,height = 2.2,units = "in",dpi = 300)

#lolipop for PNS version
q <- ggplot(data, aes(y=statistic_mlm_pns, x=term_nice)) +
  geom_segment( aes(y=0, yend=statistic_mlm_pns,x=term_nice, xend=term_nice), color="pink") +
  geom_point( color="red", size=4, alpha=0.6) +
  theme_light() +
  scale_x_discrete(limits = c("Weight","Muscle Atrophy+BBSA","MRI SC Atrophy","Creatinine","BUN","AP","Age")) +
  geom_hline(yintercept = 0,color = "black", linetype="dashed",size=.25) +
  coord_flip() +
  ylab("t-statistics") +
  xlab("")+
  ylim(c(-7.5,9.5))+
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
    
  )

ggsave(plot = ggarrange(q,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/vars_lolipop_MLM_PNS.png", 
       width =3.5,height = 2.2,units = "in",dpi = 300)

# lolipop for LM version
#lolipop for PNS version
t <- ggplot(data, aes(y=statistic_mlm_pns, x=term_nice)) +
  geom_segment( aes(y=0, yend=statistic_mlm_pns,x=term_nice, xend=term_nice), color="transparent") +
  geom_point( color="transparent", size=4, alpha=0.6) +
  theme_light() +
  scale_x_discrete(limits = c("Weight","Muscle Atrophy+BBSA","MRI SC Atrophy","Creatinine","BUN","AP","Age")) +
  geom_hline(yintercept = 0,color = "black", linetype="dashed",size=.25) +
  coord_flip() +
  ylab("t-statistics") +
  xlab("")+
  ylim(c(-7.5,9.5))+
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
    
  )

ggsave(plot = ggarrange(t,ncol = 1,nrow = 1, common.legend=TRUE, legend="bottom"),
       filename = "./output/vars_lolipop_LM.png", 
       width =3.5,height = 2.2,units = "in",dpi = 300)
summary(mod_train)