##################################################################################
# S3 Data.  R code to reproduce statistical analysis in: 
# Tanentzap et al. Microplastics and anthropogenic fibre concentrations in lakes reflect surrounding land use.  PLOS Biology. 
# No guarantees whatsoever are given with this code, but please cite if re-used.
# File prepared by Andrew Tanentzap (ajt65@cam.ac.uk), July 2021
# File contains the following sections:
# 1) Prepare workspace
# 2) Model associated with FIGURE 1
# 3) Model associated with FIGURE 2    
# 4) Simulation study associated with S1 FIGURE
# 5) Model associated with S2 FIGURE
######################################################################################
## 1) Prepare workspace
######################################################################################
library(rstan)
library(brms)  # ensure >= v.2.15.0
library(faux)

# Read the three tab-delimited supplementary data files. In each file the first
# 10 lines are skipped before the header row is read.
# For the trawl data, quoting is disabled, short rows are padded (fill) and
# character columns are kept as character (as.is).
all_trawls <- read.table(
  "TanentzapS1Data.txt",
  header = TRUE,
  skip = 10,
  sep = "\t",
  fill = TRUE,
  quote = "",
  as.is = TRUE
)
EUlakedata <- read.table("TanentzapS2Data.txt", header = TRUE, skip = 10, sep = "\t")
EUwastegdp <- read.table("TanentzapS4Data.txt", header = TRUE, skip = 10, sep = "\t")


######################################################################################
## 2) Model associated with FIGURE 1
######################################################################################
# Pool the 'MARINE' habitat label into 'ESTUARINE/MARINE'.
all_trawls[all_trawls$Habitat == 'MARINE','Habitat'] <- 'ESTUARINE/MARINE'

# Because of all the zeros, fit a censored model in line with the discussion at
# https://discourse.datamethods.org/t/modelling-outcome-data-with-a-limit-of-detection-is-a-hurdle-model-appropriate/1427/8
# Code modified from https://onlinelibrary.wiley.com/doi/full/10.1002/sim.5515
# and https://setac.onlinelibrary.wiley.com/doi/full/10.1002/etc.4046
#
# Stan program (kept as a string and passed to rstan::stan()):
#  * Suffix '_us' marks rows with Habitat == 'THIS_STUDY' and '_db' all other
#    rows; '_cen'/'_cens' marks rows with zero concentration, which are treated
#    as left-censored at the limits L_us / L_db.
#  * Non-zero concentrations are lognormal with log-scale mean mu and sd sigma[2].
#  * mu = habitat intercepts (alpha) + mesh-size slope (beta[1]) with
#    habitat-specific slope deviations (sigma[3] * z2) + study effects
#    (sigma[1] * z). 'THIS_STUDY' rows use alpha[1] + alpha[2] and z2[2].
#  * The loops index the observed (and censored) covariate arrays with the
#    '_db' rows first and the '_us' rows after them.
#  * Censored rows contribute P(Y < L) under the same lognormal distribution.
#    The limits are on the raw concentration scale, so lognormal_lcdf (not
#    normal_lcdf) is the matching CDF for a log-scale mu.
cens_mod <- "
data {
  int<lower=0> N_obs;                  // rows with non-zero concentration
  int<lower=0> N_cen;                  // rows with zero (censored) concentration
  int<lower=0> N_obs_us;
  int<lower=0> N_obs_db;
  int<lower=0> N_cens_us;
  int<lower=0> N_cens_db;
  int<lower=0> N_studies;
  real y_obs_us[N_obs_us];
  real y_obs_db[N_obs_db];
  real log_mesh_size[N_obs];
  real log_mesh_size_cen[N_cen];
  matrix[N_obs,3] habitat;             // design-matrix rows multiplied by alpha
  matrix[N_cen,3] habitat_cen;
  int habitat_id[N_obs];               // index into z2
  int habitat_id_cen[N_cen];
  int study_id[N_obs];                 // index into z
  int study_id_cen[N_cen];
  vector[N_cens_us] L_us;              // censoring limits, raw concentration scale
  vector[N_cens_db] L_db;
}
parameters {
  vector[1] beta;                      // slope for log mesh size
  vector[3] alpha;                     // habitat coefficients
  vector<lower=0>[3] sigma;            // [1] study sd, [2] residual sd, [3] slope-deviation sd
  vector[N_studies] z;                 // standardised study effects
  vector[3] z2;                        // standardised habitat slope deviations
}
transformed parameters{
  vector[N_obs_us] mu_us;
  vector[N_obs_db] mu_db;
  vector[N_cens_db] mu_cen_db;
  vector[N_cens_us] mu_cen_us;
  for (i in 1:N_obs_db)
   mu_db[i] = habitat[i]*alpha + (beta[1]+sigma[3]*z2[habitat_id[i]])*log_mesh_size[i] + sigma[1]*z[study_id[i]];
  for (i in (N_obs_db+1):N_obs)
   mu_us[(i-N_obs_db)] = alpha[1] + alpha[2] + (beta[1]+sigma[3]*z2[2])*log_mesh_size[i] + sigma[1]*z[study_id[i]];
  for (i in 1:N_cens_db)
   mu_cen_db[i] = habitat_cen[i]*alpha + (beta[1]+sigma[3]*z2[habitat_id_cen[i]])*log_mesh_size_cen[i] + sigma[1]*z[study_id_cen[i]];
  for (i in (N_cens_db+1):N_cen)
   mu_cen_us[(i-N_cens_db)] = alpha[1] + alpha[2] + (beta[1]+sigma[3]*z2[2])*log_mesh_size_cen[i] + sigma[1]*z[study_id_cen[i]];
}
model {
  z ~ std_normal();
  z2 ~ std_normal();
  alpha ~ normal(0,5);
  beta ~ std_normal();
  sigma ~ std_normal();                // half-normal because sigma is bounded below at 0
  y_obs_us ~ lognormal(mu_us, sigma[2]);
  y_obs_db ~ lognormal(mu_db, sigma[2]);
  // left-censored rows: log P(Y < L) under the same lognormal distribution
  target += lognormal_lcdf(L_us | mu_cen_us, sigma[2]) + lognormal_lcdf(L_db | mu_cen_db, sigma[2]);
}
generated quantities {
  // all linear predictors stacked: observed db, observed us, censored db, censored us
  vector[N_obs_us+N_obs_db+N_cens_db+N_cens_us] concat = append_row(append_row(append_row(mu_db,mu_us),mu_cen_db),mu_cen_us);
  real mean_concat = mean(concat);
  // variance of the linear predictor over (that variance + residual variance)
  real r2_gel = variance(concat) / (variance(concat) + pow(sigma[2],2));
}"

groups_cen <- as.factor(all_trawls$Habitat)

# Assemble the data list for the censored Stan model. Intermediate index
# vectors live inside local() so they do not leak into the workspace.
# NOTE(review): the Stan program treats the first N_obs_db (N_cens_db) entries
# of the observed (censored) covariates as non-'THIS_STUDY' rows and the rest
# as 'THIS_STUDY' rows, so this presumably relies on 'THIS_STUDY' rows coming
# last in the data file - confirm against the data.
cens_mod_dat <- local({
  conc <- all_trawls$concentration_L
  ours <- all_trawls$Habitat == 'THIS_STUDY'

  # which() drops any row where the comparison evaluates to NA
  obs <- which(conc != 0)
  cen <- which(conc == 0)
  obs_us <- which(conc != 0 & ours)
  obs_db <- which(conc != 0 & !ours)
  cen_us <- which(conc == 0 & ours)
  cen_db <- which(conc == 0 & !ours)

  # standardised log mesh size and integer study codes over all rows
  mesh_z <- as.numeric(scale(log(all_trawls$Filter_microns)))
  study_code <- as.numeric(as.factor(all_trawls$DOI))

  # first three columns of the four-column habitat design matrix
  design3 <- function(rows) {
    matrix(model.matrix(~groups_cen[rows]), ncol = 4)[, 1:3]
  }

  list(
    N_obs = length(conc[obs]),
    N_cen = length(conc[cen]),
    N_obs_us = length(obs_us),
    N_obs_db = length(obs_db),
    N_cens_us = length(cen_us),
    N_cens_db = length(cen_db),
    N_studies = length(unique(all_trawls$DOI)),
    y_obs_db = conc[obs_db],
    y_obs_us = conc[obs_us],
    log_mesh_size = mesh_z[obs],
    log_mesh_size_cen = mesh_z[cen],
    habitat = design3(obs),
    habitat_cen = design3(cen),
    habitat_id = as.numeric(groups_cen[obs]),
    habitat_id_cen = as.numeric(groups_cen)[cen],
    study_id = study_code[obs],
    study_id_cen = study_code[cen],
    # censoring limits: half of 1/(pi*0.15*0.15*100*1000) for this study's
    # rows, half of the smallest non-zero concentration (rounded to 7 decimal
    # places) for the remaining rows
    L_us = rep(1 / (pi * 0.15 * 0.15 * 100 * 1000) * .5, length(cen_us)),
    L_db = rep(round(min(conc[obs]), 7) * .5, length(cen_db))
  )
})

# one-iteration run whose fitted object is reused below via `fit =`
fit.stan <- stan(
  model_code = cens_mod,
  data = cens_mod_dat,
  iter = 1,
  chains = 1,
  verbose = FALSE
)

# sample model
cens_mod_m1 <- stan(
  fit = fit.stan,
  data = cens_mod_dat,
  iter = 6750,
  warmup = 3000,
  chains = 4,
  refresh = 100,
  thin = 15,
  cores = 4,
  open_progress = TRUE
)


######################################################################################
## 3) Model associated with FIGURE 2
######################################################################################
# Fit a hurdle Poisson model to the count response totalMP. All predictors are
# centred and scaled with scale() after the transformations shown in the
# formula; the hurdle component (hu) is intercept-only.
# save_pars = save_pars(all = TRUE) replaces the deprecated save_all_pars
# argument and matches the call used in the simulation study below.
m3 <- brm(bf(totalMP ~ scale(mwi^(1/3)) + scale(forest_cov) + scale(total_resp^(1/3)) + scale(human_cov) + scale(log(slope_short)) + scale(generated_load^(1/3)), hu ~ 1),
         #fit = m3,
         family = hurdle_poisson(),
         data = EUlakedata,
         chains = 4, cores = 4,
         iter = 5000, warmup = 2000, thin = 12,
         save_pars = save_pars(all = TRUE))
         
         
######################################################################################
## 4) Simulation study associated with S1 FIGURE
######################################################################################     
######### simulation study testing for variation in sampling volumes
# For every combination of
#   sd_frac  : sd of the simulated sample volume as a fraction (0.2, 0.4) of its mean of 7100
#   r_target : correlation (0 to 0.8 in steps of 0.1) requested from rnorm_pre()
#              between the simulated volume and totalMP
# the Figure 2 model is refitted 100 times with scale(samplevol) as an extra
# predictor. The result is a nested list [[sd_frac]][[r_target]][[replicate]]
# holding the posterior draws of each refit.
simresults <- lapply(seq(.2, .4, by = .2), function(sd_frac) {
  lapply(seq(0, .8, by = .1), function(r_target) {
    # Assigning to EUlakedata here only changes this function's local copy.
    EUlakedata$samplevol <- rnorm_pre(EUlakedata$totalMP, mu = 7100, sd = 7100 * sd_frac, r = r_target)

    # Template fit with a minimal number of iterations; it is only used as the
    # starting object for update() below. It must be fitted to EUlakedata,
    # which carries the samplevol column referenced by the formula.
    m3_c <- brm(bf(totalMP ~ scale(mwi^(1/3)) + scale(forest_cov) + scale(total_resp^(1/3)) + scale(human_cov) + scale(log(slope_short)) + scale(generated_load^(1/3)) + scale(samplevol), hu ~ 1),
                family = hurdle_poisson(),
                data = EUlakedata,
                chains = 4, iter = 2, warmup = 1, thin = 1)

    lapply(seq_len(100), function(replicate_id) {
      # draw a fresh simulated sample volume for this replicate
      EUlakedata$samplevol <- rnorm_pre(EUlakedata$totalMP, mu = 7100, sd = 7100 * sd_frac, r = r_target)
      m3_s1 <- update(m3_c, newdata = EUlakedata, iter = 5000, warmup = 2000, thin = 12,
                      save_pars = save_pars(all = TRUE), refresh = 0)
      posterior_samples(m3_s1)
    })
  })
})

# Probability of finding a significant (positive) effect of sample volume, and
# probability that the effect of each other predictor is no longer significant
# (95% interval spans zero), per simulation scenario.

# Proportion of replicate fits whose central 95% interval (2.5% and 97.5%
# quantiles of the posterior draws) for coefficient `par` satisfies `ci_test`.
#   results : nested list [[sd scenario]][[correlation scenario]][[replicate]]
#             of posterior-draw data frames (the structure of `simresults`)
#   par     : column name of the coefficient in the draws
#   ci_test : function(lower, upper) returning a single TRUE/FALSE
# Returns a matrix with one row per correlation scenario and one column per sd
# scenario. Stops if `par` is not a column of the draws, because `$` on a
# missing column would otherwise return NULL and silently produce NA.
ci_proportion <- function(results, par, ci_test) {
  sapply(results, function(by_corr) {
    vapply(by_corr, function(sims) {
      hits <- vapply(sims, function(draws) {
        if (!par %in% names(draws)) {
          stop("coefficient '", par, "' not found in posterior draws", call. = FALSE)
        }
        ci <- quantile(draws[[par]], c(.025, .975), names = FALSE)
        ci_test(ci[1], ci[2])
      }, logical(1))
      sum(hits) / length(hits)
    }, numeric(1))
  })
}

# interval entirely above zero
ci_above_zero <- function(lower, upper) lower > 0 & upper > 0
# interval spanning zero
ci_spans_zero <- function(lower, upper) lower < 0 & upper > 0

# Coefficient names follow the terms of the model formula:
# scale(mwi^(1/3)) -> b_scalemwiE1D3, scale(forest_cov) -> b_scaleforest_cov, etc.
prop_pos_sig_sampvol <- ci_proportion(simresults, "b_scalesamplevol", ci_above_zero)
prop_pos_sig_mwiout <- ci_proportion(simresults, "b_scalemwiE1D3", ci_spans_zero)
prop_pos_sig_for <- ci_proportion(simresults, "b_scaleforest_cov", ci_spans_zero)
prop_pos_sig_WWtW <- ci_proportion(simresults, "b_scalegenerated_loadE1D3", ci_spans_zero)
prop_pos_sig_resp <- ci_proportion(simresults, "b_scaletotal_respE1D3", ci_spans_zero)


######################################################################################
## 5) Model associated with S2 FIGURE
######################################################################################       
# Gaussian mixed model of log per-capita waste on log regional per-capita GDP
# with a varying intercept for each country.
lmm1 <- brm(
  formula = log(percapita_waste) ~ log(regional_GDP_percapita) + (1 | country),
  data = EUwastegdp,
  family = gaussian,
  chains = 4,
  cores = 4,
  iter = 2000,
  warmup = 1000
)
    
          
#end of file