# A computational model of telomere attrition - R script S2 v3
# Revised 6th September 2016 to address referees' comments
# Written by Melissa Bateson
# This script uses R script S1 in order to generate the results and figures in the published paper.

setwd("D:/Dropbox/Telomere paradox/Telomere lengthening paper")

library(psych)
library(ggplot2)
library(scales)
library(tidyr)
library(plyr)
library(grid)
library(gridExtra)

#### Load the telomere data simulation function ####
source('R script S1.R')

###### Plot Steenstrup et al.'s original results: Figure 1A and B ##############

s1 = read.csv("Table S1.csv", header = TRUE) # read in the data from Steenstrup Table 1

# Plot the Steenstrup data from Table S1
# Plot the negative relationship between FU and % gainers
fig1a = ggplot(data = s1, aes(x = FU, y = ObsGainers)) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  geom_point(size = 3) +
  stat_smooth(method = lm, se = FALSE, colour = "black") +
  xlab("Follow-up period (years)") +
  ylab("Observed % TL gainers") +
  annotate("text", x = 0, y = 2, label = "A", size = 6)
fig1a

# Plot the Steenstrup predictions against the observed % gainers in Table S1
s = s1[is.na(s1$PredGainersSteenstrup) == FALSE, ] # remove studies from the table that don't have predictions

# Model the data to generate the regression line
m1 = lm(s$ObsGainers ~ 0 + s$PredGainersSteenstrup)
summary(m1)$r.squared

fig1b = ggplot(data = s, aes(y = ObsGainers, x = PredGainersSteenstrup)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
  geom_point(size = 3) +
  geom_abline(slope = m1$coefficients, intercept = 0, size = 1) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  ylab("Observed % TL gainers") +
  xlab("Predicted % TL gainers") +
  annotate("text", x = 25, y = 50, parse = TRUE, label = "Parameter~values:~sigma[a]==0~~~r==0", size = 4) +
  annotate("text", x = 35, y = 10, label = paste0("R-squared = ", round(summary(m1)$r.squared, digits = 2), ", slope = ", round(m1$coefficients, digits = 2)), size = 4) +
  annotate("text", x = 0, y = 50, label = "B", size = 6)
fig1b

###### Simulating TL data for various values of sd, r and CV using the computational model: Figures 2, 3, S1A and S1B ####

# Set the variables that will remain constant in the first round of simulations:
n = 10000
max.years = 15

for (i in seq(0, 100, 50)) {    # values of stress.attrition.sd
  print(i)
  for (j in c(0, 0.5, 1.0)) {   # values of r
    print(j)
    for (z in c(0, 2, 4, 8)) {  # values of CV
      d = generate.TL.data(n = n, max.years = max.years, CV = z, stress.attrition.sd = i, r = j)
      d$obsGainers[d$obsPCloss < 0] = 1
      d$trueGainers[d$trueTLloss < 0] = 1
      sum = ddply(d, .(year, stress.attrition.sd, r, CV), summarise,
                  obsGainers = sum(obsGainers, na.rm = TRUE), trueGainers = sum(trueGainers, na.rm = TRUE))
      sum$propObsGainers = sum$obsGainers/n
      sum$propTrueGainers = sum$trueGainers/n
      if (i==0 & j==0 & z==0) summary.data = sum else summary.data = rbind(summary.data, sum) # summary data frame
      if (i==0 & j==0 & z==0) full.data = d else full.data = rbind(full.data, d)              # long thin data frame
    }
  }
}

summary.data = summary.data[summary.data$year > 0, ] # remove year 0 from the summary data set on attrition

# Save the data generated by the simulation
write.csv(full.data, "Long_thin_simulation_data.CSV")
write.csv(summary.data, "Summary_simulation_data.CSV")

# Read in the saved data from the simulation (if required)
full.data = read.csv("Long_thin_simulation_data.CSV", header = TRUE)
summary.data = read.csv("Summary_simulation_data.CSV", header = TRUE)
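
# Optional sanity check (added; not part of the original analysis). The facet labels
# created below with gl() rely on a fixed row order and on these exact row counts,
# assuming generate.TL.data() returns one row per subject per year (years 0 to max.years):
# 3 sd values x 3 r values x 4 CV values = 36 parameter combinations.
stopifnot(nrow(full.data) == n * (max.years + 1) * 3 * 3 * 4) # 10000 subjects x 16 years x 36 combinations
stopifnot(nrow(summary.data) == max.years * 3 * 3 * 4)        # years 1-15 x 36 combinations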

# Plot graphs showing TL change over time for a range of parameter values
plot.data = full.data[full.data$CV == 0, ] # CV = 0 corresponds to true TL (no measurement error)
sub.plot.data = plot.data[plot.data$subject > 2 & plot.data$subject < 17, ] # reduce data to a manageable number (i.e. 14) of subjects for plotting

# Create facet labels for graphs
col.variable.labels = c(expression(paste(sigma[a]~"="~0)), expression(paste(sigma[a]~"="~50)), expression(paste(sigma[a]~"="~100)))
sub.plot.data$sd.label = gl(3, 672, labels = col.variable.labels)
col.variable.labels = c(expression(paste("r = 0")), expression(paste("r = 0.5")), expression(paste("r = 1")))
sub.plot.data$r.label = rep(gl(3, 224, labels = col.variable.labels), 3)

fig2 = ggplot(sub.plot.data, aes(x = year, y = measuredTL/1000, colour = as.factor(subject))) +
  geom_point() +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  geom_line(size = 0.5) +
  guides(colour = FALSE) +
  facet_grid(r.label ~ sd.label, labeller = "label_parsed") +
  xlab("Year") +
  ylab("Telomere Length (kb)")
fig2

# Plot how the mean and sd of TL change over time
stats = ddply(full.data, .(year, r, stress.attrition.sd, CV), summarise,
              mean.TLlength = mean(measuredTL, na.rm = TRUE), sd.TLlength = sd(measuredTL, na.rm = TRUE))
sub.stats = stats[stats$CV == 0, ]

# Create facet labels for graphs
col.variable.labels = c(expression(paste(sigma[a]~"="~0)), expression(paste(sigma[a]~"="~50)), expression(paste(sigma[a]~"="~100)))
sub.stats$sd.label = rep(gl(3, 1, labels = col.variable.labels), 48)
col.variable.labels = c(expression(paste("r = 0")), expression(paste("r = 0.5")), expression(paste("r = 1")))
sub.stats$r.label = rep(gl(3, 3, labels = col.variable.labels), 16)

# Plot mean TL by year
figS1a = ggplot(sub.stats, aes(x = year, y = mean.TLlength/1000)) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  geom_point() +
  geom_line(size = 0.5) +
  facet_grid(r.label ~ sd.label, labeller = "label_parsed") +
  xlab("Year") +
  ylab("Mean telomere length (kb)")
figS1a

# Plot sd in TL by year
figS1b = ggplot(sub.stats, aes(x = year, y = sd.TLlength/1000)) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  geom_point() +
  geom_line(size = 0.5) +
  facet_grid(r.label ~ sd.label, labeller = "label_parsed") +
  xlab("Year") +
  ylab("Standard deviation of telomere length (kb)")
figS1b

# Now plot how the proportion of true and observed gainers changes with years
# Do this for different levels of measurement error (CV)
summary.data$percentObsGainers = summary.data$propObsGainers*100 # convert proportion of gainers to a percentage

# Create facet labels for graphs
col.variable.labels = c(expression(paste(sigma[a]~"="~0)), expression(paste(sigma[a]~"="~50)), expression(paste(sigma[a]~"="~100)))
summary.data$sd.label = gl(3, 180, labels = col.variable.labels)
col.variable.labels = c(expression(paste("r = 0")), expression(paste("r = 0.5")), expression(paste("r = 1")))
summary.data$r.label = rep(gl(3, 60, labels = col.variable.labels), 3)

fig3 = ggplot(data = summary.data, aes(x = year, y = percentObsGainers, colour = as.factor(CV))) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  geom_point() +
  geom_line(size = 0.5) +
  facet_grid(r.label ~ sd.label, labeller = "label_parsed") +
  xlab("Year") +
  ylab("% TL gainers from baseline") +
  theme(legend.justification = c(1,1), legend.position = c(1,.23)) +
  theme(legend.background = element_rect(fill = "white", colour = "black")) +
  scale_colour_manual(guide = guide_legend(title = NULL),
                      labels = c("True", "Measured, CV = 2%", "Measured, CV = 4%", "Measured, CV = 8%"),
                      values = c("black", "grey40", "grey60", "grey80"))
fig3
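
# Optional consistency check (added; not part of the original analysis): the gl()
# block sizes used above assume a particular row order, so verify that each facet
# label agrees with the parameter value it is meant to describe.
stopifnot(all(as.integer(summary.data$sd.label) == match(summary.data$stress.attrition.sd, c(0, 50, 100))))
stopifnot(all(as.integer(summary.data$r.label) == match(summary.data$r, c(0, 0.5, 1))))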

############# Replicate Steenstrup predictions using the analytical model: Fig 4A #########################

# set constant values in simulations
bl.mean = 7000 # mean of baseline TL distribution for studies where this is not available
bl.sd = 700    # sd of baseline TL distribution for studies where this is not available
change = 30    # attrition per year for studies where this is not available in bp
no.studies = length(s$study)

# make some arrays to store the output in
sim.pred = array(, dim = c(no.studies)) # make an empty array for data on predicted gainers
obs.gainers = as.array(s$ObsGainers)

for (study in 1:no.studies) {
  # Set the variables for the current study
  n = 10000 # use this to produce a closer simulation of the analytical model
  FU = s$FU[study]
  FU.int = as.integer(FU)
  rem = FU - FU.int
  if (FU == FU.int) {max.years = FU} else {max.years = FU.int + 1}
  if (is.na(s$TLb[study])) {baselineTL.mean = bl.mean} else {baselineTL.mean = s$TLb[study] * 1000}
  if (is.na(s$TLb.sd[study])) {baselineTL.sd = bl.sd} else {baselineTL.sd = s$TLb.sd[study]}
  if (is.na(s$change[study])) {stress.attrition.mean = change} else {stress.attrition.mean = s$change[study]}
  CV = s$CV[study]
  if (s$rep[study] == 2) {CV = CV/sqrt(2)} # reduce the CV of measurement error in studies with 2 replicates
  # run the simulation for sd = 0, r = 0:
  d = generate.TL.data(n = n, max.years = max.years, baselineTL.mean = baselineTL.mean, baselineTL.sd = baselineTL.sd,
                       stress.attrition.mean = stress.attrition.mean, CV = CV, stress.attrition.sd = 0, r = 0)
  d$obsGainers[d$obsPCloss < 0] = 1
  d$trueGainers[d$trueTLloss < 0] = 1
  sum = ddply(d, .(year, stress.attrition.sd, r, CV), summarise,
              obsGainers = sum(obsGainers, na.rm = TRUE), trueGainers = sum(trueGainers, na.rm = TRUE))
  sum$obsGainers[sum$year == 0] = n/2 # this deals with the fact that if you measure people at baseline 50% will appear to have gained on average
  sum$propObsGainers = sum$obsGainers/n
  sum$propTrueGainers = sum$trueGainers/n
  # write the prediction into the output file
  if (FU.int == FU) {
    sim.pred[study] = sum$propObsGainers[sum$year == max.years]*100
  } else {
    sim.pred[study] = (((1-rem)*sum$propObsGainers[sum$year == (max.years-1)]) + (rem*sum$propObsGainers[sum$year == max.years]))*100
  }
}

s$simulated.gainers = sim.pred
cor.test(s$simulated.gainers, s$PredGainersSteenstrup) # calculate the correlation for the paper

# plot the correlation
fig4a = ggplot(data = s, aes(x = PredGainersSteenstrup, y = simulated.gainers)) +
  geom_point(size = 2) +
  geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  ylab('Predicted % TL gainers (computational model)') +
  xlab("Predicted % TL gainers (Steenstrup et al.)") +
  annotate("text", x = 0, y = 50, label = "A", size = 6)
fig4a
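
# The fractional follow-up calculation above interpolates linearly between the two
# whole simulation years that bracket FU. A small helper making that explicit
# (a sketch added for illustration; hypothetical and not called by the original script):
interpolate.FU = function(values, years, FU) {
  lower = floor(FU)
  if (FU == lower) return(values[years == FU])
  frac = FU - lower
  (1 - frac) * values[years == lower] + frac * values[years == lower + 1]
}
# e.g. within the study loop, interpolate.FU(sum$propObsGainers, sum$year, FU) * 100
# would reproduce sim.pred[study].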

######### Search the parameter space to find the best fitting scenario: Fig 4B ##########

# set constant values in simulations
total.replicates = 100 # the number of times to repeat the simulation (NB this needs to run overnight)
bl.mean = 7000 # mean of baseline TL distribution for studies where this is not available
bl.sd = 700    # sd of baseline TL distribution for studies where this is not available
change = 30    # attrition per year for studies where this is not available in bp
no.studies = length(s$study)
max.i = 100 # max attrition.sd value
max.j = 1   # max r value
step.i = max.i/10
step.j = max.j/2
max.rows = total.replicates*no.studies*((max.i/step.i)+1)*((max.j/step.j)+1)
data = array(, dim = c(max.rows, 6))
current.row = 1

for (replicate in 1:total.replicates) {
  print(paste0("Replicate: ", replicate))
  for (i in seq(0, max.i, step.i)) {   # values of stress.attrition.sd
    for (j in seq(0, max.j, step.j)) { # values of r
      for (study in 1:no.studies) {
        # Set the variables for the current study
        n = s$N[study]
        FU = s$FU[study]
        FU.int = as.integer(FU)
        rem = FU - FU.int
        if (FU == FU.int) {max.years = FU} else {max.years = FU.int + 1}
        if (is.na(s$TLb[study])) {baselineTL.mean = bl.mean} else {baselineTL.mean = s$TLb[study] * 1000}
        if (is.na(s$TLb.sd[study])) {baselineTL.sd = bl.sd} else {baselineTL.sd = s$TLb.sd[study]}
        if (is.na(s$change[study])) {stress.attrition.mean = change} else {stress.attrition.mean = s$change[study]}
        CV = s$CV[study]
        if (s$rep[study] == 2) {CV = CV/sqrt(2)} # reduce the CV of measurement error in studies with 2 replicates
        # run the simulation
        d = generate.TL.data(n = n, max.years = max.years, baselineTL.mean = baselineTL.mean, baselineTL.sd = baselineTL.sd,
                             stress.attrition.mean = stress.attrition.mean, CV = CV, stress.attrition.sd = i, r = j)
        # calculate the true and observed TL gainers predicted
        d$obsGainers[d$obsPCloss < 0] = 1
        d$trueGainers[d$trueTLloss < 0] = 1
        sum = ddply(d, .(year, stress.attrition.sd, r, CV), summarise,
                    obsGainers = sum(obsGainers, na.rm = TRUE), trueGainers = sum(trueGainers, na.rm = TRUE))
        sum$obsGainers[sum$year == 0] = n/2 # this deals with the fact that if you measure people at baseline 50% will appear to have gained on average
        sum$propObsGainers = sum$obsGainers/n
        sum$propTrueGainers = sum$trueGainers/n
        # write the prediction into the output file
        if (FU.int == FU) {
          prediction = sum$propObsGainers[sum$year == max.years]
        } else {
          prediction = (((1-rem)*sum$propObsGainers[sum$year == (max.years-1)]) + (rem*sum$propObsGainers[sum$year == max.years]))
        }
        if (FU.int == FU) {
          trueGainers = sum$propTrueGainers[sum$year == max.years]
        } else {
          trueGainers = (((1-rem)*sum$propTrueGainers[sum$year == (max.years-1)]) + (rem*sum$propTrueGainers[sum$year == max.years]))
        }
        # dev = (s$ObsGainers[study]-(prediction*100))^2 # to assess goodness of fit to observed data
        case = c(replicate, study, i, j, trueGainers*100, prediction*100)
        data[current.row,] = case # accumulate summary array
        current.row = current.row + 1
      }
    }
  }
}

# create the saved data frame
dimnames(data) <- list(NULL, c("replicate","study","attrition.sd","r","trueGainers","prediction"))
data = as.data.frame(data, row.names = NULL)

# save the data generated by the simulation
write.csv(data, "Steenstrup_sim_data.csv")

# Read in the saved data from the simulation (if required)
data = read.csv("Steenstrup_sim_data.csv", header = TRUE)

# now calculate the r-squared values for each simulation
summary = array(, dim = c(max.rows/no.studies, 4))
current.row = 1
for (replicate in 1:total.replicates) {
  for (i in seq(0, max.i, step.i)) {   # values of stress.attrition.sd
    # print(paste0("Value of i: ", i))
    for (j in seq(0, max.j, step.j)) { # values of r
      sub = data[data$replicate==replicate & data$attrition.sd==i & data$r==j,]
      m1 = lm(s$ObsGainers ~ 0 + sub$prediction)
      summary[current.row,] = c(replicate, i, j, summary(m1)$r.squared)
      current.row = current.row + 1
    }
  }
}
dimnames(summary) <- list(NULL, c("replicate","attrition.sd","r","r.squared"))
summary = as.data.frame(summary, row.names = NULL)
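
# Optional sanity check (added; not part of the original analysis): confirm that the
# nested loops above filled every preallocated row.
stopifnot(!anyNA(data$prediction))          # all replicate x parameter x study combinations simulated
stopifnot(current.row - 1 == nrow(summary)) # one r-squared value per replicate x sd x r combination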

summary2 = ddply(summary, .(attrition.sd, r), summarise,
                 mean.r.squared = mean(r.squared), sd.r.squared = sd(r.squared))
summary2$CI = qnorm(0.975)*summary2$sd.r.squared/sqrt(total.replicates)

# plot the results
fig4b = ggplot(summary2, aes(x = attrition.sd, y = mean.r.squared, colour = as.factor(r))) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = mean.r.squared-CI, ymax = mean.r.squared+CI), width = .5) +
  geom_line(size = 1) +
  geom_abline(slope = 0, intercept = 0.862, linetype = "dotted") + # the Steenstrup fit
  annotate("segment", x = 95, xend = 95, y = (0.862-0.015), yend = (0.862+0.015), arrow = arrow(ends = "both", length = unit(0.25, "cm"))) +
  annotate("text", x = 95, y = (0.862+.022), label = "Better fit") +
  annotate("text", x = 95, y = (0.862-.02), label = "Worse fit") +
  annotate("text", x = 50, y = 0.885, label = "*", size = 8) + # the best model
  annotate("segment", x = 14.3, xend = 53.2, y = 0.9, yend = 0.9, arrow = arrow(ends = "both", angle = 90, length = unit(.1,"cm"))) + # range of measured values of the sd of annual attrition
  annotate("text", x = 33.5, y = 0.92, label = "Measured sd annual telomere attrition") +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  theme(legend.justification = c(0.95,0.98), legend.position = c(0.4,.3)) +
  theme(legend.background = element_rect(fill = "white", colour = "black")) +
  xlab(bquote('Standard deviation of annual telomere attrition ('*sigma[a]*')')) +
  ylab("R-squared") +
  labs(colour = "Autocorrelation (r)") +
  scale_colour_manual(values = c("black","grey40","grey60")) +
  annotate("text", x = 0, y = .94, label = "B", size = 6)
fig4b

# Tabulate when the computational model is better and worse than Steenstrup
summary2$Steenstrup.r.squared = 0.8619852
summary2$better.diff = summary2$mean.r.squared - summary2$CI - summary2$Steenstrup.r.squared
summary2$worse.diff = summary2$Steenstrup.r.squared - (summary2$mean.r.squared + summary2$CI)
summary2$diff = NA # initialise the column so the conditional assignments below always fill all rows
summary2$diff[summary2$better.diff > 0] = "Better"
summary2$diff[summary2$worse.diff > 0] = "Worse"
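
# Optional (added; not part of the original analysis): inspect the classification
# produced above, i.e. how many parameter combinations fit the observed data better
# or worse than the Steenstrup et al. analytical model, and which combinations fit best.
print(table(summary2$diff, useNA = "ifany"))
print(head(summary2[order(-summary2$mean.r.squared), c("attrition.sd", "r", "mean.r.squared", "diff")]))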

###### Now plot the results for the "best" model (attrition sd = 50, r = 0): Fig 4C ################
best = data[data$attrition.sd==50 & data$r==0,]
best.sum = ddply(best, .(study), summarise, mean.true = mean(trueGainers), sd.true = sd(trueGainers),
                 mean.prediction = mean(prediction), sd.prediction = sd(prediction))
s$best.model.mean = best.sum$mean.prediction
m2 = lm(s$ObsGainers ~ 0 + s$best.model.mean)
summary(m2)

fig4c = ggplot(data = s, aes(y = ObsGainers, x = best.model.mean)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
  geom_point(size = 3) +
  geom_abline(slope = m2$coefficients, intercept = 0, size = 1) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) + # switch off major gridlines
  theme(panel.grid.minor = element_blank()) +
  ylab("Observed % TL gainers") +
  xlab("Predicted % TL gainers") +
  annotate("text", x = 25, y = 50, parse = TRUE, label = "Parameter~values:~sigma[a]==50~~~r==0", size = 4) +
  annotate("text", x = 35, y = 10, size = 4, label = paste0("R-squared = ", round(summary(m2)$r.squared, digits = 2), ", slope = ", round(m2$coefficients, digits = 2))) +
  annotate("text", x = 0, y = 50, label = "C", size = 6)
fig4c

###### Now calculate how many true gainers there are with the best model: Fig 4D ###########
s$best.model.true.mean = best.sum$mean.true
mean(s$best.model.mean)
sd(s$best.model.mean)
mean(s$best.model.true.mean)
sd(s$best.model.true.mean)

s2 = s %>% gather(TL, propGain, c(best.model.mean, best.model.true.mean))
s2$sds = c(best.sum$sd.prediction, best.sum$sd.true)
s2$CI = qnorm(0.975)*s2$sds/sqrt(total.replicates)
dodge = position_dodge(width = 0.8)

fig4d = ggplot(s2, aes(x = study, y = propGain, fill = TL)) +
  geom_bar(stat = "identity", position = dodge, width = 0.8) +
  geom_errorbar(aes(ymax = propGain+CI, ymin = propGain-CI), position = dodge, width = 0.2) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  ylab("% TL gainers") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = c(0.25,.75)) +
  theme(legend.background = element_rect(fill = "white", colour = "black")) +
  theme(axis.title.x = element_blank()) +
  scale_fill_manual(guide = guide_legend(title = NULL), labels = c("Measured % TL gainers","True % TL gainers"), values = c("grey80","grey50")) +
  labs(fill = "Measure") +
  annotate("text", x = 5, y = 50, parse = TRUE, label = "Parameter~values:~sigma[a]==50~~~r==0", size = 4) +
  annotate("text", x = 1, y = 50, label = "D", size = 6)
fig4d

############### Generate the figures for the paper ##############

# Figure 1ab
pdf("Figure1ab.pdf", width = 2800/300, height = 1400/300)
grid.arrange(fig1a, fig1b, ncol = 2)
dev.off()

# Figure 2
pdf("Figure2.pdf", width = 1800/300, height = 1800/300)
print(fig2)
dev.off()

# Figure S1ab
pdf("FigureS1ab.pdf", width = 2800/300, height = 1400/300)
grid.arrange(figS1a, figS1b, ncol = 2)
dev.off()

# Figure 3
pdf("Figure3.pdf", width = 2000/300, height = 2000/300)
print(fig3)
dev.off()

# Figure 4abcd
pdf("Figure4abcd.pdf", width = 2800/300, height = 2800/300)
grid.arrange(fig4a, fig4b, fig4c, fig4d, ncol = 2)
dev.off()
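
# Optional (added; not part of the original analysis): record the R and package
# versions used when the script was run, for reproducibility.
sessionInfo()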