# A computational model of telomere attrition - R script S2 v3
# Revised 6th September 2016 to address referees' comments
# Written by Melissa Bateson
# This script uses R script S1 in order to generate the results and figures in the published paper.

setwd("D:/Dropbox/Telomere paradox/Telomere lengthening paper")

library(psych)
library(ggplot2)
library(scales)
library(tidyr)
library(plyr)
library(grid)
library(gridExtra)

#### Load the telomere data simulation function ####
source('R script S1.R')

###### Plot Steenstrup et al.'s original results: Figure 1A and B ##############

s1 = read.csv("Table S1.csv", header = TRUE) # read in the data from Steenstrup Table 1

# Plot the Steenstrup data from Table S1
# Plot the negative relationship between FU and % gainers
fig1a = ggplot(data = s1, aes(x = FU, y = ObsGainers)) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  geom_point(size = 3) +
  stat_smooth(method = lm, se = FALSE, colour = "black") +
  xlab("Follow-up period (years)") +
  ylab("Observed % TL gainers") +
  annotate("text", x = 0, y = 2, label = "A", size = 6)
fig1a

# Plot the Steenstrup predictions against the observed % gainers in Table S1
s = s1[is.na(s1$PredGainersSteenstrup) == FALSE, ] # remove studies from the table that don't have predictions

# Model the data to generate the regression line
m1 = lm(s$ObsGainers ~ 0 + s$PredGainersSteenstrup)
summary(m1)$r.squared

fig1b = ggplot(data = s, aes(y = ObsGainers, x = PredGainersSteenstrup)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
  geom_point(size = 3) +
  geom_abline(slope = m1$coefficients, intercept = 0, size = 1) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  ylab("Observed % TL gainers") +
  xlab("Predicted % TL gainers") +
  annotate("text", x = 25, y = 50, parse = TRUE, label = "Parameter~values:~sigma[a]==0~~~r==0", size = 4) +
  annotate("text", x = 35, y = 10, label = paste0("R-squared = ", round(summary(m1)$r.squared, digits = 2), ", slope = ", round(m1$coefficients, digits = 2)), size = 4) +
  annotate("text", x = 0, y = 50, label = "B", size = 6)
fig1b

###### Simulating TL data for various values of sd, r and CV using the computational model: Figures 2, 3, S1A and S1B ####

# Set the variables that will remain constant in the first round of simulations:
n = 10000
max.years = 15

for (i in seq(0, 100, 50)) {    # values of stress.attrition.sd
  print(i)
  for (j in c(0, 0.5, 1.0)) {   # values of r
    print(j)
    for (z in c(0, 2, 4, 8)) {  # values of CV
      d = generate.TL.data(n = n, max.years = max.years, CV = z, stress.attrition.sd = i, r = j)
      d$obsGainers[d$obsPCloss < 0] = 1
      d$trueGainers[d$trueTLloss < 0] = 1
      sum = ddply(d, .(year, stress.attrition.sd, r, CV), summarise,
                  obsGainers = sum(obsGainers, na.rm = TRUE), trueGainers = sum(trueGainers, na.rm = TRUE))
      sum$propObsGainers = sum$obsGainers/n
      sum$propTrueGainers = sum$trueGainers/n
      if (i==0 & j==0 & z==0) summary.data = sum else summary.data = rbind(summary.data, sum) # summary data frame
      if (i==0 & j==0 & z==0) full.data = d else full.data = rbind(full.data, d)              # long thin data frame
    }
  }
}

summary.data = summary.data[summary.data$year > 0, ] # remove year 0 from the summary data set on attrition

# Save the data generated by the simulation
write.csv(full.data, "Long_thin_simulation_data.CSV")
write.csv(summary.data, "Summary_simulation_data.CSV")

# Read in the saved data from the simulation (if required)
full.data = read.csv("Long_thin_simulation_data.CSV", header = TRUE)
summary.data = read.csv("Summary_simulation_data.CSV", header = TRUE)
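
# Optional sanity check (added; not part of the original analysis). The facet labels
# created below with gl() rely on a fixed row order and on these exact row counts,
# assuming generate.TL.data() returns one row per subject per year (years 0 to max.years):
# 3 sd values x 3 r values x 4 CV values = 36 parameter combinations.
stopifnot(nrow(full.data) == n * (max.years + 1) * 3 * 3 * 4) # 10000 subjects x 16 years x 36 combinations
stopifnot(nrow(summary.data) == max.years * 3 * 3 * 4)        # years 1-15 x 36 combinations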

# Plot graphs showing TL change over time for a range of parameter values
plot.data = full.data[full.data$CV == 0, ] # CV = 0 corresponds to true TL (no measurement error)
sub.plot.data = plot.data[plot.data$subject > 2 & plot.data$subject < 17, ] # reduce data to a manageable number (i.e. 14) of subjects for plotting

# Create facet labels for graphs
col.variable.labels = c(expression(paste(sigma[a]~"="~0)), expression(paste(sigma[a]~"="~50)), expression(paste(sigma[a]~"="~100)))
sub.plot.data$sd.label = gl(3, 672, labels = col.variable.labels)
col.variable.labels = c(expression(paste("r = 0")), expression(paste("r = 0.5")), expression(paste("r = 1")))
sub.plot.data$r.label = rep(gl(3, 224, labels = col.variable.labels), 3)

fig2 = ggplot(sub.plot.data, aes(x = year, y = measuredTL/1000, colour = as.factor(subject))) +
  geom_point() +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  geom_line(size = 0.5) +
  guides(colour = FALSE) +
  facet_grid(r.label ~ sd.label, labeller = "label_parsed") +
  xlab("Year") +
  ylab("Telomere Length (kb)")
fig2

# Plot how the mean and sd of TL change over time
stats = ddply(full.data, .(year, r, stress.attrition.sd, CV), summarise,
              mean.TLlength = mean(measuredTL, na.rm = TRUE), sd.TLlength = sd(measuredTL, na.rm = TRUE))
sub.stats = stats[stats$CV == 0, ]

# Create facet labels for graphs
col.variable.labels = c(expression(paste(sigma[a]~"="~0)), expression(paste(sigma[a]~"="~50)), expression(paste(sigma[a]~"="~100)))
sub.stats$sd.label = rep(gl(3, 1, labels = col.variable.labels), 48)
col.variable.labels = c(expression(paste("r = 0")), expression(paste("r = 0.5")), expression(paste("r = 1")))
sub.stats$r.label = rep(gl(3, 3, labels = col.variable.labels), 16)

# Plot mean TL by year
figS1a = ggplot(sub.stats, aes(x = year, y = mean.TLlength/1000)) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  geom_point() +
  geom_line(size = 0.5) +
  facet_grid(r.label ~ sd.label, labeller = "label_parsed") +
  xlab("Year") +
  ylab("Mean telomere length (kb)")
figS1a

# Plot sd in TL by year
figS1b = ggplot(sub.stats, aes(x = year, y = sd.TLlength/1000)) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  geom_point() +
  geom_line(size = 0.5) +
  facet_grid(r.label ~ sd.label, labeller = "label_parsed") +
  xlab("Year") +
  ylab("Standard deviation of telomere length (kb)")
figS1b

# Now plot how the proportion of true and observed gainers changes with years
# Do this for different levels of measurement error (CV)
summary.data$percentObsGainers = summary.data$propObsGainers*100 # convert proportion of gainers to a percentage

# Create facet labels for graphs
col.variable.labels = c(expression(paste(sigma[a]~"="~0)), expression(paste(sigma[a]~"="~50)), expression(paste(sigma[a]~"="~100)))
summary.data$sd.label = gl(3, 180, labels = col.variable.labels)
col.variable.labels = c(expression(paste("r = 0")), expression(paste("r = 0.5")), expression(paste("r = 1")))
summary.data$r.label = rep(gl(3, 60, labels = col.variable.labels), 3)

fig3 = ggplot(data = summary.data, aes(x = year, y = percentObsGainers, colour = as.factor(CV))) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  geom_point() +
  geom_line(size = 0.5) +
  facet_grid(r.label ~ sd.label, labeller = "label_parsed") +
  xlab("Year") +
  ylab("% TL gainers from baseline") +
  theme(legend.justification = c(1,1), legend.position = c(1,.23)) +
  theme(legend.background = element_rect(fill = "white", colour = "black")) +
  scale_colour_manual(guide = guide_legend(title = NULL),
                      labels = c("True", "Measured, CV = 2%", "Measured, CV = 4%", "Measured, CV = 8%"),
                      values = c("black", "grey40", "grey60", "grey80"))
fig3
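
# Optional consistency check (added; not part of the original analysis): the gl()
# block sizes used above assume a particular row order, so verify that each facet
# label agrees with the parameter value it is meant to describe.
stopifnot(all(as.integer(summary.data$sd.label) == match(summary.data$stress.attrition.sd, c(0, 50, 100))))
stopifnot(all(as.integer(summary.data$r.label) == match(summary.data$r, c(0, 0.5, 1))))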

############# Replicate Steenstrup predictions using the analytical model: Fig 4A #########################

# set constant values in simulations
bl.mean = 7000 # mean of baseline TL distribution for studies where this is not available
bl.sd = 700    # sd of baseline TL distribution for studies where this is not available
change = 30    # attrition per year for studies where this is not available in bp
no.studies = length(s$study)

# make some arrays to store the output in
sim.pred = array(, dim = c(no.studies)) # make an empty array for data on predicted gainers
obs.gainers = as.array(s$ObsGainers)

for (study in 1:no.studies) {
  # Set the variables for the current study
  n = 10000 # use this to produce a closer simulation of the analytical model
  FU = s$FU[study]
  FU.int = as.integer(FU)
  rem = FU - FU.int
  if (FU == FU.int) {max.years = FU} else {max.years = FU.int + 1}
  if (is.na(s$TLb[study])) {baselineTL.mean = bl.mean} else {baselineTL.mean = s$TLb[study] * 1000}
  if (is.na(s$TLb.sd[study])) {baselineTL.sd = bl.sd} else {baselineTL.sd = s$TLb.sd[study]}
  if (is.na(s$change[study])) {stress.attrition.mean = change} else {stress.attrition.mean = s$change[study]}
  CV = s$CV[study]
  if (s$rep[study] == 2) {CV = CV/sqrt(2)} # reduce the CV of measurement error in studies with 2 replicates
  # run the simulation for sd = 0, r = 0:
  d = generate.TL.data(n = n, max.years = max.years, baselineTL.mean = baselineTL.mean, baselineTL.sd = baselineTL.sd,
                       stress.attrition.mean = stress.attrition.mean, CV = CV, stress.attrition.sd = 0, r = 0)
  d$obsGainers[d$obsPCloss < 0] = 1
  d$trueGainers[d$trueTLloss < 0] = 1
  sum = ddply(d, .(year, stress.attrition.sd, r, CV), summarise,
              obsGainers = sum(obsGainers, na.rm = TRUE), trueGainers = sum(trueGainers, na.rm = TRUE))
  sum$obsGainers[sum$year == 0] = n/2 # this deals with the fact that if you measure people at baseline 50% will appear to have gained on average
  sum$propObsGainers = sum$obsGainers/n
  sum$propTrueGainers = sum$trueGainers/n
  # write the prediction into the output file
  if (FU.int == FU) {
    sim.pred[study] = sum$propObsGainers[sum$year == max.years]*100
  } else {
    sim.pred[study] = (((1-rem)*sum$propObsGainers[sum$year == (max.years-1)]) + (rem*sum$propObsGainers[sum$year == max.years]))*100
  }
}

s$simulated.gainers = sim.pred
cor.test(s$simulated.gainers, s$PredGainersSteenstrup) # calculate the correlation for the paper

# plot the correlation
fig4a = ggplot(data = s, aes(x = PredGainersSteenstrup, y = simulated.gainers)) +
  geom_point(size = 2) +
  geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  ylab('Predicted % TL gainers (computational model)') +
  xlab("Predicted % TL gainers (Steenstrup et al.)") +
  annotate("text", x = 0, y = 50, label = "A", size = 6)
fig4a
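
# The fractional follow-up calculation above interpolates linearly between the two
# whole simulation years that bracket FU. A small helper making that explicit
# (a sketch added for illustration; hypothetical and not called by the original script):
interpolate.FU = function(values, years, FU) {
  lower = floor(FU)
  if (FU == lower) return(values[years == FU])
  frac = FU - lower
  (1 - frac) * values[years == lower] + frac * values[years == lower + 1]
}
# e.g. within the study loop, interpolate.FU(sum$propObsGainers, sum$year, FU) * 100
# would reproduce sim.pred[study].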

######### Search the parameter space to find the best fitting scenario: Fig 4B ##########

# set constant values in simulations
total.replicates = 100 # the number of times to repeat the simulation (NB this needs to run overnight)
bl.mean = 7000 # mean of baseline TL distribution for studies where this is not available
bl.sd = 700    # sd of baseline TL distribution for studies where this is not available
change = 30    # attrition per year for studies where this is not available in bp
no.studies = length(s$study)
max.i = 100 # max attrition.sd value
max.j = 1   # max r value
step.i = max.i/10
step.j = max.j/2
max.rows = total.replicates*no.studies*((max.i/step.i)+1)*((max.j/step.j)+1)
data = array(, dim = c(max.rows, 6))
current.row = 1

for (replicate in 1:total.replicates) {
  print(paste0("Replicate: ", replicate))
  for (i in seq(0, max.i, step.i)) {   # values of stress.attrition.sd
    for (j in seq(0, max.j, step.j)) { # values of r
      for (study in 1:no.studies) {
        # Set the variables for the current study
        n = s$N[study]
        FU = s$FU[study]
        FU.int = as.integer(FU)
        rem = FU - FU.int
        if (FU == FU.int) {max.years = FU} else {max.years = FU.int + 1}
        if (is.na(s$TLb[study])) {baselineTL.mean = bl.mean} else {baselineTL.mean = s$TLb[study] * 1000}
        if (is.na(s$TLb.sd[study])) {baselineTL.sd = bl.sd} else {baselineTL.sd = s$TLb.sd[study]}
        if (is.na(s$change[study])) {stress.attrition.mean = change} else {stress.attrition.mean = s$change[study]}
        CV = s$CV[study]
        if (s$rep[study] == 2) {CV = CV/sqrt(2)} # reduce the CV of measurement error in studies with 2 replicates
        # run the simulation
        d = generate.TL.data(n = n, max.years = max.years, baselineTL.mean = baselineTL.mean, baselineTL.sd = baselineTL.sd,
                             stress.attrition.mean = stress.attrition.mean, CV = CV, stress.attrition.sd = i, r = j)
        # calculate the true and observed TL gainers predicted
        d$obsGainers[d$obsPCloss < 0] = 1
        d$trueGainers[d$trueTLloss < 0] = 1
        sum = ddply(d, .(year, stress.attrition.sd, r, CV), summarise,
                    obsGainers = sum(obsGainers, na.rm = TRUE), trueGainers = sum(trueGainers, na.rm = TRUE))
        sum$obsGainers[sum$year == 0] = n/2 # this deals with the fact that if you measure people at baseline 50% will appear to have gained on average
        sum$propObsGainers = sum$obsGainers/n
        sum$propTrueGainers = sum$trueGainers/n
        # write the prediction into the output file
        if (FU.int == FU) {
          prediction = sum$propObsGainers[sum$year == max.years]
        } else {
          prediction = (((1-rem)*sum$propObsGainers[sum$year == (max.years-1)]) + (rem*sum$propObsGainers[sum$year == max.years]))
        }
        if (FU.int == FU) {
          trueGainers = sum$propTrueGainers[sum$year == max.years]
        } else {
          trueGainers = (((1-rem)*sum$propTrueGainers[sum$year == (max.years-1)]) + (rem*sum$propTrueGainers[sum$year == max.years]))
        }
        # dev = (s$ObsGainers[study]-(prediction*100))^2 # to assess goodness of fit to observed data
        case = c(replicate, study, i, j, trueGainers*100, prediction*100)
        data[current.row,] = case # accumulate summary array
        current.row = current.row + 1
      }
    }
  }
}

# create the saved data frame
dimnames(data) <- list(NULL, c("replicate","study","attrition.sd","r","trueGainers","prediction"))
data = as.data.frame(data, row.names = NULL)

# save the data generated by the simulation
write.csv(data, "Steenstrup_sim_data.csv")

# Read in the saved data from the simulation (if required)
data = read.csv("Steenstrup_sim_data.csv", header = TRUE)

# now calculate the r-squared values for each simulation
summary = array(, dim = c(max.rows/no.studies, 4))
current.row = 1
for (replicate in 1:total.replicates) {
  for (i in seq(0, max.i, step.i)) {   # values of stress.attrition.sd
    # print(paste0("Value of i: ", i))
    for (j in seq(0, max.j, step.j)) { # values of r
      sub = data[data$replicate==replicate & data$attrition.sd==i & data$r==j,]
      m1 = lm(s$ObsGainers ~ 0 + sub$prediction)
      summary[current.row,] = c(replicate, i, j, summary(m1)$r.squared)
      current.row = current.row + 1
    }
  }
}
dimnames(summary) <- list(NULL, c("replicate","attrition.sd","r","r.squared"))
summary = as.data.frame(summary, row.names = NULL)
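
# Optional sanity check (added; not part of the original analysis): confirm that the
# nested loops above filled every preallocated row.
stopifnot(!anyNA(data$prediction))          # all replicate x parameter x study combinations simulated
stopifnot(current.row - 1 == nrow(summary)) # one r-squared value per replicate x sd x r combination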

summary2 = ddply(summary, .(attrition.sd, r), summarise,
                 mean.r.squared = mean(r.squared), sd.r.squared = sd(r.squared))
summary2$CI = qnorm(0.975)*summary2$sd.r.squared/sqrt(total.replicates)

# plot the results
fig4b = ggplot(summary2, aes(x = attrition.sd, y = mean.r.squared, colour = as.factor(r))) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = mean.r.squared-CI, ymax = mean.r.squared+CI), width = .5) +
  geom_line(size = 1) +
  geom_abline(slope = 0, intercept = 0.862, linetype = "dotted") + # the Steenstrup fit
  annotate("segment", x = 95, xend = 95, y = (0.862-0.015), yend = (0.862+0.015), arrow = arrow(ends = "both", length = unit(0.25, "cm"))) +
  annotate("text", x = 95, y = (0.862+.022), label = "Better fit") +
  annotate("text", x = 95, y = (0.862-.02), label = "Worse fit") +
  annotate("text", x = 50, y = 0.885, label = "*", size = 8) + # the best model
  annotate("segment", x = 14.3, xend = 53.2, y = 0.9, yend = 0.9, arrow = arrow(ends = "both", angle = 90, length = unit(.1,"cm"))) + # range of measured values of the sd of annual attrition
  annotate("text", x = 33.5, y = 0.92, label = "Measured sd annual telomere attrition") +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  theme(legend.justification = c(0.95,0.98), legend.position = c(0.4,.3)) +
  theme(legend.background = element_rect(fill = "white", colour = "black")) +
  xlab(bquote('Standard deviation of annual telomere attrition ('*sigma[a]*')')) +
  ylab("R-squared") +
  labs(colour = "Autocorrelation (r)") +
  scale_colour_manual(values = c("black","grey40","grey60")) +
  annotate("text", x = 0, y = .94, label = "B", size = 6)
fig4b

# Tabulate when the computational model is better and worse than Steenstrup
summary2$Steenstrup.r.squared = 0.8619852
summary2$better.diff = summary2$mean.r.squared - summary2$CI - summary2$Steenstrup.r.squared
summary2$worse.diff = summary2$Steenstrup.r.squared - (summary2$mean.r.squared + summary2$CI)
summary2$diff = NA # initialise the column so the conditional assignments below always fill all rows
summary2$diff[summary2$better.diff > 0] = "Better"
summary2$diff[summary2$worse.diff > 0] = "Worse"
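
# Optional (added; not part of the original analysis): inspect the classification
# produced above, i.e. how many parameter combinations fit the observed data better
# or worse than the Steenstrup et al. analytical model, and which combinations fit best.
print(table(summary2$diff, useNA = "ifany"))
print(head(summary2[order(-summary2$mean.r.squared), c("attrition.sd", "r", "mean.r.squared", "diff")]))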

###### Now plot the results for the "best" model (attrition sd = 50, r = 0): Fig 4C ################
best = data[data$attrition.sd==50 & data$r==0,]
best.sum = ddply(best, .(study), summarise, mean.true = mean(trueGainers), sd.true = sd(trueGainers),
                 mean.prediction = mean(prediction), sd.prediction = sd(prediction))
s$best.model.mean = best.sum$mean.prediction
m2 = lm(s$ObsGainers ~ 0 + s$best.model.mean)
summary(m2)

fig4c = ggplot(data = s, aes(y = ObsGainers, x = best.model.mean)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
  geom_point(size = 3) +
  geom_abline(slope = m2$coefficients, intercept = 0, size = 1) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) + # switch off major gridlines
  theme(panel.grid.minor = element_blank()) +
  ylab("Observed % TL gainers") +
  xlab("Predicted % TL gainers") +
  annotate("text", x = 25, y = 50, parse = TRUE, label = "Parameter~values:~sigma[a]==50~~~r==0", size = 4) +
  annotate("text", x = 35, y = 10, size = 4, label = paste0("R-squared = ", round(summary(m2)$r.squared, digits = 2), ", slope = ", round(m2$coefficients, digits = 2))) +
  annotate("text", x = 0, y = 50, label = "C", size = 6)
fig4c

###### Now calculate how many true gainers there are with the best model: Fig 4D ###########
s$best.model.true.mean = best.sum$mean.true
mean(s$best.model.mean)
sd(s$best.model.mean)
mean(s$best.model.true.mean)
sd(s$best.model.true.mean)

s2 = s %>% gather(TL, propGain, c(best.model.mean, best.model.true.mean))
s2$sds = c(best.sum$sd.prediction, best.sum$sd.true)
s2$CI = qnorm(0.975)*s2$sds/sqrt(total.replicates)
dodge = position_dodge(width = 0.8)

fig4d = ggplot(s2, aes(x = study, y = propGain, fill = TL)) +
  geom_bar(stat = "identity", position = dodge, width = 0.8) +
  geom_errorbar(aes(ymax = propGain+CI, ymin = propGain-CI), position = dodge, width = 0.2) +
  theme_bw() +
  theme(panel.grid.major = element_blank()) +
  theme(panel.grid.minor = element_blank()) +
  ylab("% TL gainers") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = c(0.25,.75)) +
  theme(legend.background = element_rect(fill = "white", colour = "black")) +
  theme(axis.title.x = element_blank()) +
  scale_fill_manual(guide = guide_legend(title = NULL), labels = c("Measured % TL gainers","True % TL gainers"), values = c("grey80","grey50")) +
  labs(fill = "Measure") +
  annotate("text", x = 5, y = 50, parse = TRUE, label = "Parameter~values:~sigma[a]==50~~~r==0", size = 4) +
  annotate("text", x = 1, y = 50, label = "D", size = 6)
fig4d

############### Generate the figures for the paper ##############

# Figure 1ab
pdf("Figure1ab.pdf", width = 2800/300, height = 1400/300)
grid.arrange(fig1a, fig1b, ncol = 2)
dev.off()

# Figure 2
pdf("Figure2.pdf", width = 1800/300, height = 1800/300)
print(fig2)
dev.off()

# Figure S1ab
pdf("FigureS1ab.pdf", width = 2800/300, height = 1400/300)
grid.arrange(figS1a, figS1b, ncol = 2)
dev.off()

# Figure 3
pdf("Figure3.pdf", width = 2000/300, height = 2000/300)
print(fig3)
dev.off()

# Figure 4abcd
pdf("Figure4abcd.pdf", width = 2800/300, height = 2800/300)
grid.arrange(fig4a, fig4b, fig4c, fig4d, ncol = 2)
dev.off()
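
# Optional (added; not part of the original analysis): record the R and package
# versions used when the script was run, for reproducibility.
sessionInfo()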