1 Introduction

This code accompanies the manuscript, Pham et al., “Variability in in vivo Toxicity Studies - Defining the upper limit of predictivity for models of systemic effect levels.” The corresponding author can be reached at paul-friedman.katie@epa.gov for any additional questions.

2 Background code

The code sections below provide the R setup, custom functions, database retrieved, and dataset compilation. The code is hidden by default but can be expanded for viewing globally by clicking “Show All Code” or by section by clicking the “code” boxes.

2.1 R Session information

See the R sessionInfo() used below.

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 16299)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] moments_0.14      broom_0.5.2       purrr_0.3.2      
##  [4] kableExtra_1.1.0  gridExtra_2.3     ggpubr_0.2.1     
##  [7] cowplot_1.0.0     rmarkdown_1.15    openxlsx_4.1.0.1 
## [10] olsrr_0.5.2       MASS_7.3-51.4     ggalt_0.4.0      
## [13] ggthemes_4.2.0    DT_0.8            magrittr_1.5     
## [16] ggplot2_3.2.0     reshape2_1.4.3    dplyr_0.8.3      
## [19] data.table_1.12.6 RMySQL_0.10.17    DBI_1.0.0        
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-140       lubridate_1.7.4    webshot_0.5.1     
##  [4] ash_1.0-15         RColorBrewer_1.1-2 httr_1.4.0        
##  [7] gh_1.0.1           tools_3.6.1        backports_1.1.5   
## [10] R6_2.4.0           rpart_4.1-15       KernSmooth_2.23-15
## [13] nortest_1.0-4      lazyeval_0.2.2     colorspace_1.4-1  
## [16] nnet_7.3-12        withr_2.1.2        tidyselect_0.2.5  
## [19] curl_4.0           compiler_3.6.1     extrafontdb_1.0   
## [22] rvest_0.3.4        xml2_1.2.2         scales_1.0.0      
## [25] checkmate_1.9.4    proj4_1.0-8        readr_1.3.1       
## [28] goftest_1.1-1      stringr_1.4.0      digest_0.6.21     
## [31] foreign_0.8-71     rio_0.5.16         pkgconfig_2.0.3   
## [34] htmltools_0.3.6    extrafont_0.17     maps_3.3.0        
## [37] htmlwidgets_1.3    rlang_0.4.0        readxl_1.3.1      
## [40] rstudioapi_0.10    shiny_1.3.2        generics_0.0.2    
## [43] jsonlite_1.6       zip_2.0.3          car_3.0-3         
## [46] Matrix_1.2-17      Rcpp_1.0.2         munsell_0.5.0     
## [49] abind_1.4-5        stringi_1.4.3      yaml_2.2.0        
## [52] carData_3.0-2      plyr_1.8.4         recipes_0.1.6     
## [55] promises_1.0.1     forcats_0.4.0      crayon_1.3.4      
## [58] lattice_0.20-38    haven_2.1.1        splines_3.6.1     
## [61] hms_0.5.1          zeallot_0.1.0      knitr_1.24        
## [64] pillar_1.4.2       ggsignif_0.5.0     glue_1.3.1        
## [67] evaluate_0.14      vctrs_0.2.0        httpuv_1.5.1      
## [70] Rttf2pt1_1.3.7     cellranger_1.1.0   tidyr_0.8.3       
## [73] gtable_0.3.0       assertthat_0.2.1   xfun_0.9          
## [76] gower_0.2.1        mime_0.7           prodlim_2018.04.18
## [79] xtable_1.8-4       later_0.8.0        viridisLite_0.3.0 
## [82] class_7.3-15       survival_2.44-1.1  timeDate_3043.102 
## [85] tibble_2.1.3       lava_1.6.6         ipred_0.9-9

2.2 Custom functions for this work

# functions

# organize sex from studies

trsexConcat <- function(tr){
  dat_tr <- copy(tr)
  dat_sex <- unique(tr[ , list(chem, study_id, sex)])
  dat_sex[ , all_sex := .(concat = paste0(sex, collapse = ",")), by = study_id]
  
  dat_sex2 <- unique(dat_sex[ , list(chem, study_id, all_sex)])
  
  setkey(dat_sex2, "study_id")
  setkey(dat_tr, "study_id")
  dat_tr <- dat_tr[dat_sex2]
  
  ## all sex M, F, MF
  dat_tr[all_sex == "F,M", all_sex := "MF"]
  dat_tr[all_sex == "MF,F", all_sex := "MF"]
  dat_tr[all_sex == "MF,M", all_sex := "MF"]
  dat_tr[all_sex == "F,M,MF", all_sex := "MF"]
  dat_tr[all_sex == "MF,F,M", all_sex := "MF"]
  dat_tr[all_sex == "M,MF,F", all_sex := "MF"]
  dat_tr[all_sex == "F,MF,M", all_sex := "MF"]
  dat_tr[all_sex == "M,F", all_sex := "MF"]
  dat_tr[all_sex == "F,MF", all_sex := "MF"]
  dat_tr[all_sex == "M,F,MF", all_sex := "MF"]
  
  dat <- unique(dat_tr)
  return(dat)
  
}

# Leave-one-out function for the MLR

LinMeanSqCompar <- function(dat_lm){
  
  ta = dat_lm ## Whole Model
  
  Full_Model <- anova(ta) %>%
    .$`Mean Sq` %>%
    tail(dat_lm, n = 1) ## Extracting the Residual Sum Sq.
  
  tb = update(ta, ~ . - factor(chem)) ## Build model but without chem as a variable
  No_Chemical <- anova(tb) %>%
    .$`Mean Sq` %>%
    tail(dat_lm, n = 1)
  
  tc = update(ta, ~ . - factor(strain_group))
  No_Strain_group <- anova(tc) %>%
    .$`Mean Sq` %>%
    tail(dat_lm, n = 1)
  
  td = update(ta, ~ . - factor(study_type))
  No_Study_type <- anova(td) %>%
    .$`Mean Sq` %>%
    tail(dat_lm, n = 1)
  
  te = update(ta, ~ . - factor(admin_method))
  No_Admin_method <- anova(te) %>%
    .$`Mean Sq` %>%
    tail(dat_lm, n = 1)
  
  tf = update(ta, ~ . - dose_spacing_center)
  No_Dose_spacing <- anova(tf) %>%
    .$`Mean Sq` %>%
    tail(dat_lm, n = 1)
  
  tg = update(ta, ~ . - dose_no)
  No_Dose_no <- anova(tg) %>%
    .$`Mean Sq` %>%
    tail(dat_lm, n = 1)
  
  th = update(ta, ~ . - study_year_center)
  No_Study_year <- anova(th) %>%
    .$`Mean Sq` %>%
    tail(dat_lm, n = 1)
  
  ti = update(ta, ~ . - sub_purity_center)
  No_Substance_purity <- anova(ti) %>%
    .$`Mean Sq` %>%
    tail(dat_lm, n = 1)
  
  tj = update(ta, ~ . - factor(study_source))
  No_Study_source <- anova(tj) %>%
    .$`Mean Sq` %>%
    tail(dat_lm, n = 1)
  
  tk = update(ta, ~ . - factor(all_sex))
  No_sex <- anova(tk) %>%
    .$`Mean Sq` %>%
    tail(dat_lm, n = 1)
  
  tbta <- anova(tb, ta) %>% .$`Pr(>F)`
  tcta <- anova(tc, ta) %>% .$`Pr(>F)`
  tdta <- anova(td, ta) %>% .$`Pr(>F)`
  teta <- anova(te, ta) %>% .$`Pr(>F)`
  tfta <- anova(tf, ta) %>% .$`Pr(>F)`
  tgta <- anova(tg, ta) %>% .$`Pr(>F)`
  thta <- anova(th, ta) %>% .$`Pr(>F)`
  tita <- anova(ti, ta) %>% .$`Pr(>F)`
  tjta <- anova(tj, ta) %>% .$`Pr(>F)`
  tkta <- anova(tk, ta) %>% .$`Pr(>F)`
  
  lmst_MS <- rbind(Full_Model,
                   No_Chemical,
                   No_Strain_group,
                   No_Study_type,
                   No_Admin_method,
                   No_Dose_spacing,
                   No_Dose_no,
                   No_Study_year,
                   No_Substance_purity,
                   No_Study_source,
                   No_sex )
  
  lmst_P <- rbind(0, tbta, tcta, tdta, teta, tfta, tgta, thta, tita, tjta, tkta)
  
  lmst <- cbind(lmst_MS, lmst_P) %>% .[ , -2] %>% signif(3)
  
  colnames(lmst) <- c("MSE", "p-val")
  
  return(lmst)
  
}

2.3 Database used: ToxRefDB version 2.0

The DOI for this database is: https://doi.org/10.23645/epacomptox.6062545.v3 A reference publication describing the construction of ToxRefDB version 2.0 can be found here: https://doi.org/10.1016/j.reprotox.2019.07.012

2.3.1 Retrieving the data from ToxRefDB version 2.0

toxrefdb_2_0 <- dbGetQuery(con, "SELECT 
                           chemical.chemical_id,
                           chemical.dsstox_substance_id,
                           chemical.casrn,
                           chemical.preferred_name,
                           study.study_id,
                           study.chemical_id,
                           study.processed,
                           study.study_type,
                           study.study_year,
                           study.study_source,
                           study.species,
                           study.strain_group,
                           study.admin_route,
                           study.admin_method,
                           study.substance_purity,
                           study.dose_start,
                           study.dose_start_unit,
                           study.dose_end,
                           study.dose_end_unit,
                           endpoint.endpoint_category,
                           endpoint.endpoint_type,
                           endpoint.endpoint_target,
                           endpoint.endpoint_id,
                           tg_effect.life_stage,
                           tg_effect.tg_effect_id,
                           effect.effect_id,
                           effect.effect_desc,
                           tg.sex,
                           tg.generation,
                           dose.dose_level,
                           dtg.dose_adjusted,
                           dtg.dose_adjusted_unit,
                           dtg_effect.treatment_related,
                           dtg_effect.critical_effect,
                           tested_status,
                           reported_status
                           FROM 
                           (((((((((prod_toxrefdb_2_0.chemical INNER JOIN prod_toxrefdb_2_0.study ON chemical.chemical_id=study.chemical_id)
                           LEFT JOIN prod_toxrefdb_2_0.dose ON dose.study_id=study.study_id)
                           LEFT JOIN prod_toxrefdb_2_0.tg ON tg.study_id=study.study_id)
                           LEFT JOIN prod_toxrefdb_2_0.dtg ON tg.tg_id=dtg.tg_id AND dose.dose_id=dtg.dose_id)
                           LEFT JOIN prod_toxrefdb_2_0.tg_effect ON tg.tg_id=tg_effect.tg_id)
                           LEFT JOIN prod_toxrefdb_2_0.dtg_effect ON tg_effect.tg_effect_id=dtg_effect.tg_effect_id AND         dtg.dtg_id=dtg_effect.dtg_id)
                           LEFT JOIN prod_toxrefdb_2_0.effect ON effect.effect_id=tg_effect.effect_id)
                           LEFT JOIN prod_toxrefdb_2_0.endpoint ON endpoint.endpoint_id=effect.endpoint_id)
                           LEFT JOIN prod_toxrefdb_2_0.obs ON obs.study_id=study.study_id AND obs.endpoint_id=endpoint.endpoint_id)") %>% 
  data.table() 

#save(toxrefdb_2_0, file='RData/toxrefdb_2_0.RData')

2.3.2 Preparing the dataset for use

#length(unique(toxrefdb_2_0$dsstox_substance_id)) #1142
#length(unique(toxrefdb_2_0$study_id)) #5960

## Prepare the toxrefdb_2_0 dataset for modeling

toxrefdb_2_0[!is.na(dose_level), dose_no := max(dose_level, na.rm = TRUE) , by = study_id]
toxrefdb_2_0[dose_adjusted > 0, log_dose_adjusted := log10(dose_adjusted)]
toxrefdb_2_0[!is.na(log_dose_adjusted), ldt := min(log_dose_adjusted), by = study_id]
toxrefdb_2_0[!is.na(log_dose_adjusted), hdt := max(log_dose_adjusted), by = study_id]
toxrefdb_2_0[ , dose_spacing := (hdt-ldt)/(dose_no-1)]

ar <- c("Oral", "Oral/Gavage")
sp <- c("mouse", "rat", "dog", "rabbit", "Tif: RAI f (SPF)") 
st <- c("SUB","CHR", "DEV", "MGR", "SAC"  )
ec <- c('systemic')
ls <- c('adult')
gn <- c('F0')

tr2 <- data.table(toxrefdb_2_0) %>% .[processed==1 &
                                        dose_adjusted > 0 &
                                        dose_adjusted_unit == 'mg/kg/day' &
                                        dose_level > 0 &
                                        admin_route %in% ar &
                                        study_type %in% st &
                                        species %in% sp &
                                        endpoint_category %in% ec &
                                        life_stage %in% ls &
                                        generation %in% gn &
                                        treatment_related == 1 &
                                        dose_no > 1,] %>%
  .[ , chem := paste(casrn, preferred_name, sep = "||")] 


## Standarized data notations

tr2[ species=="Tif: RAI f (SPF)", species := "rat"]

tr2[ , strain_group := tolower(strain_group) ]
tr2[ strain_group %in% c("sprague dawley","sprague-dawley"), strain_group:="sprague_dawley" ]
tr2[ strain_group %in% c("other", NA), strain_group := species ]

tr2[ study_source %in% c("NTP Technical Report", "NTP Report"), study_source:="ntp"]

tr2[ , admin_method:=tolower(admin_method) ]
tr2[ admin_method =="[Not Specified]", admin_method := admin_route ] ## All admin_method[Not Specified] ==> default to admin method

tr2[ substance_purity==">95", substance_purity:="95" ]
tr2[ substance_purity==">99", substance_purity:="99" ]

tr2[ substance_purity=="99.75% gamma isomer of hexachlorocyclohexane", substance_purity:="99.75"]
tr2[ substance_purity=="72 +/- 3" , substance_purity:="72"]
tr2[ substance_purity=="96.8 percent" , substance_purity:="96.8"]

tr2[, substance_purity:=as.numeric(substance_purity)]
tr2[ substance_purity>100, substance_purity:=100]

## Calculating LEL/LOAEL

tr2[ , log_dose_adjusted := log10(dose_adjusted) ]
tr2[ , study_summary := max(treatment_related, na.rm = TRUE), by = study_id ]
tr2[study_summary == 1, lel := min(log_dose_adjusted[treatment_related == 1], na.rm = TRUE), by = list(study_id) ]

tr2[, ce := max(critical_effect, na.rm = TRUE), by = study_id ]
tr2[ce == 1, loael := min(log_dose_adjusted[critical_effect == 1], na.rm = TRUE), by = list(study_id) ]
tr2[is.na(loael), loael := lel]

## Obtain only the variables that are relevant for analysis

dat.tr <- unique(tr2[ , list(chem, 
                             chemical_id, 
                             study_id,
                             processed,
                             study_year,
                             study_source,
                             study_type, 
                             species, 
                             strain_group, 
                             admin_route,
                             admin_method, 
                             substance_purity, 
                             dose_no, 
                             dose_start,
                             dose_start_unit,
                             dose_end,
                             dose_end_unit,
                             sex, 
                             ldt, 
                             hdt, 
                             dose_spacing, 
                             lel,
                             loael)])

## Concatenate all of the sexes for each study 

dat.tr <- trsexConcat(dat.tr) %>%
  .[ , sex:=NULL] %>%
  unique()

## All "NA" values in study year and substance purity are given mean value

dat.tr[is.na(study_year), study_year := 1991]
dat.tr[is.na(substance_purity), substance_purity := 100 ]

dat.tr[ , sub_purity_center := substance_purity-mean(substance_purity)]
dat.tr[ , study_year_center := study_year-mean(study_year)]
dat.tr[ , dose_spacing_center := dose_spacing-mean(dose_spacing)]

#save(dat.tr, file="dat_toxref_2_0_4model.RData")

3 Preparation for multilinear regression (MLR) modeling

Herein the dataset is prepared for MLR.

3.1 Constructing the full MLR dataset

See code for constructing the full MLR dataset.

load('dat_toxref_2_0_4model.RData')

dat.mlr.full <- unique(dat.tr) %>%
  .[ , studies_count :=.N, by=chem] %>% 
  .[ studies_count > 1 ] %>% # need to ensure greater than one study per chem
  .[ , sub_purity_center := substance_purity - mean(substance_purity)] %>%
  .[ , study_year_center := study_year - mean(study_year)] %>% 
  .[ , dose_spacing_center := dose_spacing - mean(dose_spacing)] %>%
  .[ , lelvsloael := lel==loael]


#nrow(dat.mlr.full[lelvsloael=='TRUE']) #1830 out of 2724
#1830/2724 #67% of the time

3.2 Plot of proportion of study type by species in the full MLR dataset.

See a visual representation of the study type by species included in the full MLR dataset.

3.3 Calculation of the size and variance for the full MLR dataset.

See a table below of the size and variance for the full MLR dataset.

Full MLR dataset N
Total Number of Chemicals 563
Total Number of Studies 2724
Variance of the LEL 0.92
Variance of the LOAEL 0.79
length(unique(dat.mlr.full$chem)) #563 substances
length(unique(dat.mlr.full$study_id)) #2724 study records
round(var(dat.mlr.full$lel),2) #0.92 is the total variance for the LEL dataset
round(var(dat.mlr.full$loael),2) # 0.79 is the total variance for the LOAEL dataset

4 Full LEL dataset RLR and MLR

In the manuscript, please see Table 2: Variance estimation results for full datasets which describes the outcome of the regression modeling approaches for the full and full cell datasets.

4.1 RLR full LEL dataset

ANOVA for the model is shown.

rr.lel <- rlm(lel ~ 0 
                + factor(chem)  
                + factor(strain_group)
                + factor(study_type)
                + factor(admin_method)
                + dose_spacing_center
                + dose_no
                + study_year_center
                + sub_purity_center
                + factor(study_source)
                + factor(all_sex), data = dat.mlr.full, method="M")
## ANOVA for RLR full LEL dataset
anova(rr.lel) 

4.2 MLR full LEL dataset

ANOVA for the model is shown.

mlr.lel <- lm(lel ~ 0 
              + factor(chem)  
              + factor(strain_group) 
              + factor(study_type) 
              + factor(admin_method) 
              + dose_spacing_center  
              + dose_no
              + study_year_center 
              + sub_purity_center 
              + factor(study_source) 
              + factor(all_sex), data = dat.mlr.full) 
anova(mlr.lel) # ANOVA for MLR full LEL dataset

4.3 Identify potentially influential points

Identify influential data points to understand the stability of the MSE estimate on the MLR for the full LEL dataset using standard plots. This is included as Figure 3 in the manuscript.

# First we examine the standard regression performance plots

par(mfrow= c(3,2))
par(mfrow= c(3,2))
plot(mlr.lel,which=c(1,2,3,4,5), cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.id = 1.5, caption = list("A: Residuals vs Fitted", 
                                                                                                       "B: Normal Q-Q",
                                                                                    "C: Scale-Location",
                                                                                    "D: Cook's distance",
                                                                                    "E:Residuals vs Leverage"))
## Warning: not plotting observations with leverage one:
##   184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 2084, 2094

## Warning: not plotting observations with leverage one:
##   184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 2084, 2094
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Second, we examine an alternate way of identifying high Cook’s distance points. Cook’s distance using a common threshold (Cook’s distance > 4/(n-P-1)), where P is the number of covariates in the model. This threshold-based method for identifying high Cook’s distance points suggests ~5% of dataset may be influential.

See below, the figure on the left shows the Cook’s distances for each residual ordered by size, with the dotted line denoting the threshold. The figure on the right shows the influence plot for the standardized residuals.

cooksd <- rev(sort(cooks.distance(mlr.lel)))
#plot(cooksd, pch='*', cex=2, main='Influential Obs by Cooks Distance')
#abline(h=(4/(2724-(length(mlr.lel$coefficients))-1)), col='red')

std_resid <- rstandard(mlr.lel)
cooks_D <- cooks.distance(mlr.lel)
hat_values <- hatvalues(mlr.lel)

# another way to visualize this is an influence plot
# plot diagnostic influence plots together

par(mfrow= c(1,2))
plot(cooksd, pch='*', cex=2, main="Cook's Distance Index",ylab="Cook's Distance")
abline(h=(4/(2724-(length(mlr.lel$coefficients))-1)), lty=2)
plot(hat_values, std_resid, cex=10*sqrt(cooks_D), main = "Influence Plot", xlab="Hat values", ylab="Standardized residuals")
abline(h=c(-2.5,2.5), lty=2)

4.4 MLR full LEL dataset high leverage points removed

ANOVA for the model is shown.

# now try recomputing model by removing high leverage points as identified via standard plots from Figure 3

#warning message indicates that: not plotting observations with leverage one:
#184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 2084, 2094
#961, 1916, 1918 are also high leverage points on the plots

# try again removing high leverage and high residual points

dat.mlr.lel.leverage <- dat.mlr.full[ -c(184,500,720,721,736,961,971,1063,1228,1229,1454,1916, 1918,2084,2094), ]%>%
  .[ , st.count:=uniqueN(study_id), by=chem]%>% .[ st.count > 1 ]

mlr.lel.leverage <- lm(lel ~ 0 
                       + factor(chem)  
                       + factor(strain_group) 
                       + factor(study_type) 
                       + factor(admin_method) 
                       + dose_spacing_center  
                       + dose_no
                       + study_year_center 
                       + sub_purity_center 
                       + factor(study_source) 
                       + factor(all_sex), data = dat.mlr.lel.leverage) 

anova(mlr.lel.leverage) # ANOVA for MLR full LEL dataset with high leverage points removed

4.5 MLR full LEL dataset high Cook’s distance plot points removed

ANOVA for the model is shown.

#try removing points with high Cook's distance, simply based on the diagnostic plots and not the threshold

dat.mlr.lel.cooks.plot <- dat.mlr.full[ -c(961,1916,1918), ]%>%
  .[ , st.count:=uniqueN(study_id), by=chem]%>% .[ st.count > 1 ]

mlr.lel.cooks.plot <- lm(lel ~ 0 
                    + factor(chem)  
                    + factor(strain_group) 
                    + factor(study_type) 
                    + factor(admin_method) 
                    + dose_spacing_center  
                    + dose_no
                    + study_year_center 
                    + sub_purity_center 
                    + factor(study_source) 
                    + factor(all_sex), data = dat.mlr.lel.cooks.plot) 

anova(mlr.lel.cooks.plot) # ANOVA for MLR full LEL dataset with high Cook's distance plot points removed

4.6 MLR full LEL dataset high Cook’s distance points removed

ANOVA for the model is shown.

influential <- as.numeric(names(cooksd)[(cooksd > 4/(2724-(length(mlr.lel$coefficients)-1)))])

dat.mlr.lel.cooksd <- dat.mlr.full[-c(influential),]%>% .[ , st.count:=uniqueN(study_id), by=chem]%>% .[ st.count > 1 ]

mlr.lel.cooksd <- lm(lel ~ 0 
                       + factor(chem)  
                       + factor(strain_group) 
                       + factor(study_type) 
                       + factor(admin_method) 
                       + dose_spacing_center  
                       + dose_no
                       + study_year_center 
                       + sub_purity_center 
                       + factor(study_source) 
                       + factor(all_sex), data = dat.mlr.lel.cooksd) 

anova(mlr.lel.cooksd) # Summary for MLR full LEL dataset with high Cook's distance points removed

4.7 MLR full LEL dataset with all potential outliers removed

ANOVA for the model is shown.

dat.mlr.lel.both <- dat.mlr.full[ -c(184,500,720,721,736,961,971,1063,1228,1229,1454,1916, 1918,2084,2094,influential), ]%>%
  .[ , st.count:=uniqueN(study_id), by=chem]%>% .[ st.count > 1 ]

mlr.lel.both <- lm(lel ~ 0 
                   + factor(chem)  
                   + factor(strain_group) 
                   + factor(study_type) 
                   + factor(admin_method) 
                   + dose_spacing_center  
                   + dose_no
                   + study_year_center 
                   + sub_purity_center 
                   + factor(study_source) 
                   + factor(all_sex), data = dat.mlr.lel.both) 

anova(mlr.lel.both) # ANOVA for MLR full LEL dataset with all potential outliers removed

5 Full LOAEL dataset RLR and MLR

5.1 RLR full LOAEL dataset

ANOVA for the model is shown.

rr.loael <- rlm(loael ~ 0 
                + factor(chem)  
                + factor(strain_group)
                + factor(study_type)
                + factor(admin_method)
                + dose_spacing_center
                + dose_no
                + study_year_center
                + sub_purity_center
                + factor(study_source)
                + factor(all_sex), data = dat.mlr.full, method="M")

anova(rr.loael) # ANOVA for RLR full LOAEL dataset

5.2 MLR for full LOAEL dataset

ANOVA for the model is shown.

mlr.loael <- lm(loael ~ 0 
                + factor(chem)  
                + factor(strain_group) 
                + factor(study_type) 
                + factor(admin_method) 
                + dose_spacing_center  
                + dose_no
                + study_year_center 
                + sub_purity_center 
                + factor(study_source) 
                + factor(all_sex), data = dat.mlr.full) 

anova(mlr.loael) # ANOVA for MLR full LOAEL dataset

5.3 Identify potentially influential points

Identify influential data points to understand the stability of the MSE estimate on the MLR for the full LEL dataset using standard plots.

par(mfrow= c(3,2))
plot(mlr.loael)
## Warning: not plotting observations with leverage one:
##   184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 2084, 2094

## Warning: not plotting observations with leverage one:
##   184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 2084, 2094
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
plot(mlr.loael, which=c(4))

5.4 MLR full LOAEL dataset high leverage points removed

ANOVA for the model is shown.

dat.mlr.loael.leverage <- dat.mlr.full[ -c(56, 60, 184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 1510, 2084, 2094), ]%>%
  .[ , st.count:=uniqueN(study_id), by=chem] %>% .[ st.count > 1 ]

mlr.loael.leverage<- lm(loael ~ 0 
                        + factor(chem)  
                        + factor(strain_group) 
                        + factor(study_type) 
                        + factor(admin_method) 
                        + dose_spacing_center  
                        + dose_no
                        + study_year_center 
                        + sub_purity_center 
                        + factor(study_source) 
                        + factor(all_sex), data = dat.mlr.loael.leverage) 

anova(mlr.loael.leverage) # ANOVA for MLR full LOAEL dataset high leverage points removed

5.5 MLR full LOAEL dataset high Cook’s distance plot points removed

ANOVA for the model is shown.

dat.mlr.loael.cooks.plot <- dat.mlr.full[ -c(644,861,959), ]%>%
  .[ , st.count:=uniqueN(study_id), by=chem] %>% .[ st.count > 1 ]

mlr.loael.cooks.plot <- lm(loael ~ 0 
                        + factor(chem)  
                        + factor(strain_group) 
                        + factor(study_type) 
                        + factor(admin_method) 
                        + dose_spacing_center  
                        + dose_no
                        + study_year_center 
                        + sub_purity_center 
                        + factor(study_source) 
                        + factor(all_sex), data = dat.mlr.loael.cooks.plot) 

anova(mlr.loael.cooks.plot) # ANOVA for MLR full LOAEL dataset high Cook's distance plot points removed

We examined an alternate way of identifying high Cook’s distance points. Cook’s distance using a common threshold (Cook’s distance > 4/(n-P-1)), where P is the number of covariates in the model. This threshold-based method for identifying high Cook’s distance points suggests ~5% of dataset may be influential.

See below, the figure on the left shows the Cook’s distances for each residual ordered by size, with the dotted line denoting the threshold. The figure on the right shows the influence plot for the standardized residuals.

cooksd.loael <- rev(sort(cooks.distance(mlr.loael)))
#plot(cooksd.loael, pch='*', cex=2, main='Influential Obs by Cooks Distance')
#abline(h=(4/(2112-(length(mlr.loael$coefficients))-1)), col='red')
# another way to visualize this is an influence plot

std_resid_loael <- rstandard(mlr.loael)
cooks_D_loael <- cooks.distance(mlr.loael)
hat_values_loael <- hatvalues(mlr.loael)

# plot diagnostic influence plots together

par(mfrow= c(1,2))
plot(cooksd, pch='*', cex=2, main="Cook's Distance Index",ylab="Cook's Distance")
abline(h=(4/(2112-(length(mlr.loael$coefficients))-1)), lty=2)
plot(hat_values_loael, std_resid_loael, cex=10*sqrt(cooks_D_loael), main = "Influence Plot", xlab="Hat values", ylab="Standardized residuals")
abline(h=c(-2.5,2.5), lty=2)

5.6 MLR full LOAEL dataset high Cook’s distance points removed

ANOVA for the model is shown.

influential.loael <- as.numeric(names(cooksd.loael)[(cooksd.loael > 4/(2112-(length(mlr.loael$coefficients)-1)))])

dat.mlr.loael.cooksd <- dat.mlr.full[-c(influential.loael),]%>% .[ , st.count:=uniqueN(study_id), by=chem]%>% .[ st.count > 1 ]

mlr.loael.cooksd <- lm(loael ~ 0 
                     + factor(chem)  
                     + factor(strain_group) 
                     + factor(study_type) 
                     + factor(admin_method) 
                     + dose_spacing_center  
                     + dose_no
                     + study_year_center 
                     + sub_purity_center 
                     + factor(study_source) 
                     + factor(all_sex), data = dat.mlr.loael.cooksd) 

anova(mlr.loael.cooksd) # ANOVA for MLR full LOAEL dataset high Cook's distance points removed

5.7 MLR full LOAEL dataset with all potential outliers removed

ANOVA for the model is shown

dat.mlr.loael.both <- dat.mlr.full[ -c(56, 60, 184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 1510, 2084, 2094, influential.loael), ]%>%
  .[ , st.count:=uniqueN(study_id), by=chem] %>% .[ st.count > 1 ]

mlr.loael.both <- lm(loael ~ 0 
                     + factor(chem)  
                     + factor(strain_group) 
                     + factor(study_type) 
                     + factor(admin_method) 
                     + dose_spacing_center  
                     + dose_no
                     + study_year_center 
                     + sub_purity_center 
                     + factor(study_source) 
                     + factor(all_sex), data = dat.mlr.loael.both) 

anova(mlr.loael.both) # ANOVA for MLR full LOAEL dataset high Cook's distance points removed

6 Leave One Out (LOO) analysis for full datasets

Following the development of the full model, nested models were developed in a leave-one-out (LOO) approach to evaluate the relative impact of each study parameter on the total variance. An analysis of variance (ANOVA) was used to compare between full and nested models to evaluate the significance of each study descriptor.

6.1 MLR LOO for the full LEL dataset

See the manuscript where this is included in Table 3. The prefix “No” indicates that the covariate was removed from the model. % exp is the % of the total variance explained.

mlr.lel.LOO <- LinMeanSqCompar(mlr.lel)
mlr.lel.LOO <- as.data.table(mlr.lel.LOO, keep.rownames = TRUE)
total.var <- var(dat.mlr.full$lel)
mlr.lel.LOO[,`%exp` := round((((total.var - MSE)/total.var)*100),1)]
setnames(mlr.lel.LOO, c('rn'), c('Model Parameter Removed'))
kable(mlr.lel.LOO) %>% kable_styling()
Model Parameter Removed MSE p-val %exp
Full_Model 0.347 0.00e+00 62.1
No_Chemical 0.766 0.00e+00 16.4
No_Strain_group 0.401 0.00e+00 56.2
No_Study_type 0.371 0.00e+00 59.5
No_Admin_method 0.350 1.25e-03 61.8
No_Dose_spacing 0.350 2.27e-05 61.8
No_Dose_no 0.351 1.50e-06 61.7
No_Study_year 0.347 7.69e-01 62.1
No_Substance_purity 0.347 1.98e-01 62.1
No_Study_source 0.348 3.62e-02 62.0
No_sex 0.349 5.71e-04 61.9

6.2 MLR LOO for the full LOAEL dataset

See the manuscript where this is included in Table 3. The prefix “No” indicates that the covariate was removed from the model. % exp is the % of the total variance explained.

mlr.loael.LOO <- LinMeanSqCompar(mlr.loael)
mlr.loael.LOO <- as.data.table(mlr.loael.LOO, keep.rownames = TRUE)
total.var.loael <- var(dat.mlr.full$loael)
mlr.loael.LOO[,`%exp` := round((((total.var.loael - MSE)/total.var.loael)*100),1)]
setnames(mlr.loael.LOO, c('rn'), c('Model Parameter Removed'))
kable(mlr.loael.LOO) %>% kable_styling()
Model Parameter Removed MSE p-val %exp
Full_Model 0.258 0.00000 67.4
No_Chemical 0.683 0.00000 13.8
No_Strain_group 0.301 0.00000 62.0
No_Study_type 0.272 0.00000 65.7
No_Admin_method 0.258 0.05900 67.4
No_Dose_spacing 0.259 0.00311 67.3
No_Dose_no 0.258 0.66500 67.4
No_Study_year 0.258 0.50800 67.4
No_Substance_purity 0.258 0.49400 67.4
No_Study_source 0.259 0.00806 67.3
No_sex 0.258 0.37900 67.4

7 Preparation for augmented cell means (ACM) modeling

Herein the dataset is prepared for ACM.

7.1 Constructing the ACM dataset

Semi-automated and manual review of study records included in the “cells” of the ACM resulted in what is referred to as the “full cell dataset.”

dat.acm.full <- data.table(dat.mlr.full)
dat.acm.full[ , chemfact := paste(chem,species,admin_method,study_type,all_sex, sep="||")]
dat.acm.full[ , chemfact_ct:=.N, by=chemfact]

dat.acm <- dat.acm.full[ chemfact_ct > 1] %>%
  .[ , st_ct:=.N , by=study_type] %>%
  .[ , lel_min:= min(lel), by=chemfact] %>%
  .[ , lel_max:= max(lel), by=chemfact] %>%
  .[ , lel_mean:=mean(lel), by=chemfact] %>%     
  .[ , lel_range:=abs(lel_max-lel_min)] %>%
  .[ , ds_low:= min(dose_spacing), by=c("chem", "study_type")] %>%
  .[ , ds_hight:= max(dose_spacing), by=c("chem", "study_type")] %>%
  .[ , ds_range:=abs(ds_low-ds_hight)] %>%
  .[ , test_range:= abs(hdt-ldt)]


dat.cell <- as.data.table(dat.acm)
dat.cell[,chemfact_cell := .N, by=chemfact]

# Based on semi-automated and a manual review of the cells, some failed and are removed for this analysis
drop_study_id <- c(5156,5163,5151,5161,3981,3986,526,528,1234,5091,5532,5090,5531,1397,4100,728,6094,6093,6536,2334,2336,338,5919,5783,5784,951,952,3777,
                  2136,1910,1917,1841,1844,3821,3822,3816,3817,2066,2067,2069,2070,3969,3970,463,464,1440,4032,4865,4892,2764,2766,4452,4453,4446,4461,
                  4364,4366,4024,4075,4079,4725,4728,2108,2120,2115,2117,2124,2126,1580,1582,321,588,325,326,1206,1208,1205,1207,6071,6412,3528,3530,5811,
                  5812,4015,4038,2295,4050,2987,2989,2663,2664,3440,3441,3276,3277,6100,6099,6689,6690,2144,2145,1040,2164,7463,5109,5724,5723,2421,2422,
                  2423,2425,1678,1679,5155,5157,668,4274,6490,6492,2036,7493,3036,4267,433,2817,2819,4474,4478,3531,4276,4278,3947,3951,5613,6740,5598,
                  6743,5599,6738,5597,6742,214,215,217,216,218,5722,5727,1392,7470,5765,6097,6176,6177,1362,1363,2575,2577,1641,1647,898,5374,1052,4146,
                  1043,4263,3579,5843,3903,6802,3902,6800,1954,5452, 3900, 6753)

# amend the drop_study_id following additional manual curation
cell.eval.done <- fread('acm_cell_evaluation_1april2019_reviewed.csv')

drop_study_id2 <- c(drop_study_id,
                   cell.eval.done[study_keep==0, study_id]) # drop_study_id = 177, drop_study_id2 = 213
                   
dat.cell.use <- dat.cell[!study_id %in% drop_study_id2]

7.2 Plot of proportion of study type by species in the full cell ACM dataset

See a visual representation of the study type by species included in the full cell dataset for ACM.

7.3 Calculation of the size and variance for the full cell dataset.

See a table below of the size and variance for the full cell dataset.

Full cell dataset N
Total Number of Chemicals 96
Total Number of Studies 278
Variance of the LEL 0.86
Variance of the LOAEL 0.75
length(unique(dat.cell.use$chem)) #96
length(unique(dat.cell.use$study_id)) #278
round(var( dat.cell.use$lel),2) #0.86
round(var( dat.cell.use$loael),2) #0.75

8 Modeling for full cell dataset

ACM and MLR modeling were performed on the full cell dataset for comparison.

8.1 ACM for full cell LEL dataset

ANOVA for the model is shown.

acm.lel <- lm(lel ~ 0 
              + factor(chemfact)  
              + dose_spacing_center  
              + dose_no
              + study_year_center 
              + sub_purity_center,
              data = dat.cell.use )

anova(acm.lel)  # ANOVA for ACM full cell LEL dataset

8.2 ACM for full cell LOAEL dataset

ANOVA for the model is shown.

acm.loael <- lm(loael ~ 0 
                + factor(chemfact)  
                + dose_spacing_center  
                + dose_no
                + study_year_center 
                + sub_purity_center,
                data = dat.cell.use) 

anova(acm.loael) # ANOVA for ACM full cell LOAEL dataset

8.3 ACM regression assumptions for the full cell LEL dataset

par(mfrow= c(2,2))
plot(acm.lel)
## Warning: not plotting observations with leverage one:
##   107, 201, 237

## Warning: not plotting observations with leverage one:
##   107, 201, 237

8.4 ACM regression assumptions for the full cell LOAEL dataset

par(mfrow= c(2,2))
plot(acm.loael)
## Warning: not plotting observations with leverage one:
##   107, 201, 237

## Warning: not plotting observations with leverage one:
##   107, 201, 237

8.5 MLR for the full cell LEL dataset

An MLR model was developed for the full cell dataset to compare to the ACM approach for the same dataset. ANOVA for the model is shown.

mlr.acm.lel <- lm(lel ~ 0 
              + factor(chem)  
              + factor(strain_group) 
              + factor(study_type) 
              + factor(admin_method) 
              + dose_spacing_center  
              + dose_no
              + study_year_center 
              + sub_purity_center 
              + factor(study_source) 
              + factor(all_sex), data = dat.cell.use) 

anova(mlr.acm.lel) # ANOVA for MLR full cell LEL dataset

8.6 MLR for the full cell LOAEL dataset

An MLR model was developed for the full cell dataset to compare to the ACM approach for the same dataset. ANOVA for the model is shown.

mlr.acm.loael <- lm(loael ~ 0 
                  + factor(chem)  
                  + factor(strain_group) 
                  + factor(study_type) 
                  + factor(admin_method) 
                  + dose_spacing_center  
                  + dose_no
                  + study_year_center 
                  + sub_purity_center 
                  + factor(study_source) 
                  + factor(all_sex), data = dat.cell.use) 

anova(mlr.acm.loael) # ANOVA for MLR full cell LOAEL dataset

9 Full and full cell dataset results with RLR, MLR, and ACM

See this summary as it is included in the manuscript in Table 2.

9.1 LEL Model Results

Multiple regression types (RLR = robust linear regression; MLR = multilinear regression; ACM = augmented cell means) with different full datasets were used to build models of the available data for variance estimation. Total variance and MSE are unitless, whereas RMSE is in log10(mg/kg/day) units just like the dataset. % exp = percent total variance explained. N = number of study records in the dataset.

# data.tables used for input datasets for RLR, MLR, and ACM of LEL values
data_list <- list(dat.mlr.full,
                  dat.mlr.full,
                  dat.mlr.lel.leverage,
                  dat.mlr.lel.cooks.plot,
                  dat.mlr.lel.cooksd,
                  dat.mlr.lel.both,
                  dat.cell.use,
                  dat.cell.use)

# columns used
cols_list <- quos(lel)

# functions applied
round_var <- function(x) round(var(x$lel),3) # this function calculates the variance in lel for each dataset
my_n <- function(dataset) unlist(count(dataset)) # this function calculates the n

dat.res <- data_list %>% {
  tibble(
    var = map_dbl(., round_var),
    n = map_dbl(., my_n)
  )
}

# MSE of LEL models 

model_list <- list(rr.lel, 
                   mlr.lel,
                   mlr.lel.leverage,
                   mlr.lel.cooks.plot,
                   mlr.lel.cooksd,
                   mlr.lel.both,
                   acm.lel,
                   mlr.acm.lel)

# function to calculate MSE (sum(residuals^2)/error degrees of freedom)
# robust regression (rr) residual calc defaults to mlr.lel model for df.residual because df.residual is NA for robust regression

mse_func <- function(model) {
  if(is.na(model$df.residual)){model$df.residual <- mlr.lel$df.residual} 
  round(glance(model)$deviance / model$df.residual, 3)
}

# apply mse and rmse functions to the model_list

mod.res <- model_list %>% {
  tibble(mse = map_dbl(., mse_func)) } %>%
  mutate(rmse = round(sqrt(mse), 3))

lel.model.results <- bind_cols(dat.res, mod.res)

# make a table of the values to summarize the models

data.lel <- c('full dataset',
              'full dataset',
              'high leverage points removed',
              'high Cooks distance plot points removed',
              'high Cooks distance points removed',
              'all potential outliers removed',
              'full cell dataset',
              'full cell dataset')

regression.type <-c('RR',
                    'MLR',
                    'MLR',
                    'MLR',
                    'MLR',
                    'MLR',
                    'ACM',
                    'MLR')

lel.table <- data.table(regression.type,
                        data.lel,
                        lel.model.results)
setnames(lel.table,
         c('regression.type','data.lel', 'var', 'mse', 'rmse', 'n'),
         c('Regression Type', 'Data', 'Total Variance', 'MSE','RMSE', 'N'))
setcolorder(lel.table, c('Regression Type', 'Data', 'Total Variance', 'MSE', 'RMSE', 'N'))
lel.table[,`% exp.` := round(((`Total Variance` - MSE)/`Total Variance`)*100,1)]

kable(lel.table) %>% kable_styling()
Regression Type Data Total Variance MSE RMSE N % exp.
RR full dataset 0.916 0.359 0.599 2724 60.8
MLR full dataset 0.916 0.347 0.589 2724 62.1
MLR high leverage points removed 0.908 0.339 0.582 2708 62.7
MLR high Cooks distance plot points removed 0.911 0.339 0.582 2720 62.8
MLR high Cooks distance points removed 0.842 0.261 0.511 2592 69.0
MLR all potential outliers removed 0.838 0.261 0.511 2580 68.9
ACM full cell dataset 0.858 0.320 0.566 278 62.7
MLR full cell dataset 0.858 0.387 0.622 278 54.9

9.2 LOAEL Model Results

Multiple regression types (RLR = robust linear regression; MLR = multilinear regression; ACM = augmented cell means) with different full datasets were used to build models of the available data for variance estimation. Total variance and MSE are unitless, whereas RMSE is in log10(mg/kg/day) units just like the dataset. % exp = percent total variance explained. N = number of study records in the dataset.

# data.tables used for input datasets for RLR, MLR, and ACM of LEL values
data_list <- list(dat.mlr.full,
                  dat.mlr.full,
                  dat.mlr.loael.leverage,
                  dat.mlr.loael.cooks.plot,
                  dat.mlr.loael.cooksd,
                  dat.mlr.loael.both,
                  dat.cell.use,
                  dat.cell.use)

# columns used
cols_list <- quos(loael)

# functions applied
round_var_loael <- function(x) round(var(x$loael),3) # this function calculates the variance in lel for each dataset
my_n_loael <- function(dataset) unlist(count(dataset)) # this function calculates the n

dat.res.loael <- data_list %>% {
  tibble(
    var = map_dbl(., round_var_loael),
    n = map_dbl(., my_n_loael)
  )
}

# MSE of LOAEL models 

model_list_loael <- list(rr.loael, 
                   mlr.loael,
                   mlr.loael.leverage,
                   mlr.loael.cooks.plot,
                   mlr.loael.cooksd,
                   mlr.loael.both,
                   acm.loael,
                   mlr.acm.loael)

# function to calculate MSE (sum(residuals^2)/error degrees of freedom)
# robust regression (rr) residual calc defaults to mlr.loael model for df.residual because df.residual is NA for robust regression

mse_func_loael <- function(model) {
  if(is.na(model$df.residual)){model$df.residual <- mlr.loael$df.residual} 
  round(glance(model)$deviance / model$df.residual, 3)
}

# apply mse and rmse functions to the model_list

mod.res.loael <- model_list_loael %>% {
  tibble(mse = map_dbl(., mse_func_loael)) } %>%
  mutate(rmse = round(sqrt(mse), 3))

loael.model.results <- bind_cols(dat.res.loael, mod.res.loael)

# make a table of the values to summarize the models

data.loael <- c('full dataset',
              'full dataset',
              'high leverage points removed',
              'high Cooks distance plot points removed',
              'high Cooks distance points removed',
              'all potential outliers removed',
              'full cell dataset',
              'full cell dataset')

regression.type <-c('RR',
                    'MLR',
                    'MLR',
                    'MLR',
                    'MLR',
                    'MLR',
                    'ACM',
                    'MLR')

loael.table <- data.table(regression.type,
                        data.loael,
                        loael.model.results)
setnames(loael.table,
         c('regression.type','data.loael', 'var', 'mse', 'rmse', 'n'),
         c('Regression Type', 'Data', 'Total Variance', 'MSE','RMSE', 'N'))
setcolorder(loael.table, c('Regression Type', 'Data', 'Total Variance', 'MSE', 'RMSE', 'N'))
loael.table[,`% exp.` := round(((`Total Variance` - MSE)/`Total Variance`)*100,1)]

kable(loael.table) %>% kable_styling()
Regression Type Data Total Variance MSE RMSE N % exp.
RR full dataset 0.792 0.266 0.516 2724 66.4
MLR full dataset 0.792 0.258 0.508 2724 67.4
MLR high leverage points removed 0.781 0.250 0.500 2709 68.0
MLR high Cooks distance plot points removed 0.788 0.254 0.504 2721 67.8
MLR high Cooks distance points removed 0.748 0.200 0.447 2614 73.3
MLR all potential outliers removed 0.744 0.200 0.447 2603 73.1
ACM full cell dataset 0.750 0.254 0.504 278 66.1
MLR full cell dataset 0.750 0.312 0.559 278 58.4

10 MLR models by study type

MLR was used to build models using data subset by the study type (SUB = subchronic; CHR = chronic; DEV = developmental) for variance estimation.

10.1 Construct the MLR data subsets by study type

The MLR models for subchronic (SUB), chronic (CHR), and developmental (DEV) studies require that the full dataset be partitioned by study type, as shown in the code here.

#load('dat_toxref_2_0_4model.RData')
load('toxrefdb_data_4models_acm_mlr.RData')

mlr.dev <- dat.mlr.full[ study_type=="DEV"] %>%
  .[ , st_ct:=.N, by=chemical_id] %>%
  .[ st_ct>1 ]

mlr.chr <- dat.mlr.full[ study_type=="CHR"] %>%
  .[ , st_ct:=.N, by=chemical_id] %>%
  .[ st_ct > 1 ]

mlr.sub <- dat.mlr.full[ study_type=="SUB"] %>%
  .[ , st_ct:=.N, by=chemical_id] %>%
  .[ st_ct > 1 ]

10.2 Dataset size for the MLR data subsets by study type

See table below of the size of the MLR data subsets.

MLR data subset Parameter N
SUB Total Number of Chemicals 281
SUB Total Number of Studies 705
CHR Total Number of Chemicals 429
CHR Total Number of Studies 1149
DEV Total Number of Chemicals 121
DEV Total Number of Studies 275
length(unique(mlr.sub$chem))
length(unique(mlr.sub$study_id))
length(unique(mlr.chr$chem))
length(unique(mlr.chr$study_id))
length(unique(mlr.dev$chem))
length(unique(mlr.dev$study_id))

10.3 MLR DEV LEL model

ANOVA for the model is shown.

# investigating the covariates available for the MLR DEV LEL model
#table(mlr.dev$study_source) # the study source is highly skewed (only 4/275 records come from second level)
#table(mlr.dev$admin_method) # admin method is fairly uninformative with the overwhelming majority from gavage
#table(mlr.dev$strain_group)
#table(mlr.dev$dose_spacing_center)
#table(mlr.dev$dose_no)
#table(mlr.dev$all_sex) # all female

# have to remove study type because there are no longer 2 levels
# have to remove sex because there are no longer 2 levels (all female adults)

mlr.dev.lel <- lm(lel ~ 0 
              + factor(chem)  
              + factor(strain_group) 
              + factor(admin_method) 
              + dose_spacing_center  
              + dose_no
              + study_year_center 
              + sub_purity_center, 
              #+ factor(study_source), #only 2, ntp (4 records) and opp_der (271 records)
              #+ factor(all_sex), 
              data = mlr.dev) 

anova(mlr.dev.lel)

10.4 MLR DEV LOAEL model

ANOVA for the model is shown.

mlr.dev.loael <- lm(loael ~ 0 
                 + factor(chem)  
                 + factor(strain_group) 
                 + factor(admin_method) 
                 + dose_spacing_center  
                 + dose_no
                 + study_year_center 
                 + sub_purity_center, 
                 #+ factor(study_source), #only 2, ntp and opp_der
                 #+ factor(all_sex), 
                 data = mlr.dev) 

anova(mlr.dev.loael)

10.5 MLR CHR LEL model

ANOVA for the model is shown.

#table(mlr.chr$study_source) # very few open_lit and then many opp_der and ntp
#table(mlr.chr$admin_method)
#table(mlr.chr$strain_group)
#table(mlr.chr$dose_spacing_center)
#table(mlr.chr$dose_no)
#table(mlr.chr$all_sex) # sex seems to be uninformative here as well

# have to remove study type because there are no longer 2 levels
# have to remove sex because there are no longer 2 levels (all female adults)

mlr.chr.lel <- lm(lel ~ 0 
                  + factor(chem)  
                  + factor(strain_group) 
                  + factor(admin_method) 
                  + dose_spacing_center  
                  + dose_no
                  + study_year_center 
                  + sub_purity_center 
                  + factor(study_source), #only 2, ntp and opp_der
                  #+ factor(all_sex), # sex is not really informative because so many are MF
                  data = mlr.chr) 

anova(mlr.chr.lel)

10.6 MLR CHR LOAEL model

ANOVA for the model is shown.

mlr.chr.loael <- lm(loael ~ 0 
                    + factor(chem)  
                    + factor(strain_group) 
                    + factor(admin_method) 
                    + dose_spacing_center  
                    + dose_no
                    + study_year_center 
                    + sub_purity_center 
                    + factor(study_source), #only 2, ntp and opp_der
                    #+ factor(all_sex), 
                    data = mlr.chr) 

anova(mlr.chr.loael)

10.7 MLR SUB LEL model

ANOVA for the model is shown.

#table(mlr.sub$study_source) # though there are three options, very few open_lit and then many opp_der and ntp
#table(mlr.sub$admin_method)
#table(mlr.sub$strain_group)
#table(mlr.sub$dose_spacing_center)
#table(mlr.sub$dose_no)
#table(mlr.sub$all_sex) # so many MF that this is not informative


mlr.sub.lel <- lm(lel ~ 0 
                  + factor(chem)  
                  + factor(strain_group) 
                  + factor(admin_method) 
                  + dose_spacing_center  
                  + dose_no
                  + study_year_center 
                  + sub_purity_center 
                  + factor(study_source),
                  #+ factor(all_sex), # sex is not really informative because so many are MF
                  data = mlr.sub) 


anova(mlr.sub.lel)

10.8 MLR SUB LOAEL model

ANOVA for the model is shown.

mlr.sub.loael <- lm(loael ~ 0 
                    + factor(chem)  
                    + factor(strain_group) 
                    + factor(admin_method) 
                    + dose_spacing_center  
                    + dose_no
                    + study_year_center 
                    + sub_purity_center 
                    + factor(study_source), #only 2, ntp and opp_der
                    #+ factor(all_sex), 
                    data = mlr.sub) 

anova(mlr.sub.loael)

11 ACM models by study type

ACM was used to build models using data subset by the study type (SUB = subchronic; CHR = chronic; DEV = developmental) for variance estimation.

11.1 Construct the MLR data subsets by study type

The ACM models for subchronic (SUB), chronic (CHR), and developmental (DEV) studies require that the full cell dataset be partitioned by study type, as shown in the code here. This limits the size of the datasets considerably for ACM modeling.

#length(unique(dat.cell.use[study_type=='DEV']$chemfact)) #28
#length(unique(dat.cell.use[study_type=='CHR']$chemfact)) #56
#length(unique(dat.cell.use[study_type=='SUB']$chemfact)) #43
#length(unique(dat.cell.use$chemfact)) #133

acm.dev <- dat.cell.use[ study_type=="DEV"] %>%
  .[ , st_ct:=.N, by=chemical_id] %>%
  .[ st_ct>1 ]

acm.chr <- dat.cell.use[ study_type=="CHR"] %>%
  .[ , st_ct:=.N, by=chemical_id] %>%
  .[ st_ct > 1 ]

acm.sub <- dat.cell.use[ study_type=="SUB"] %>%
  .[ , st_ct:=.N, by=chemical_id] %>%
  .[ st_ct > 1 ]

11.2 Dataset size for the ACM data subsets by study type

See table below of the size of the ACM data subsets.

ACM data subset Parameter N
SUB Number of cells 42
SUB Number of chemicals 40
SUB Number of study records 92
CHR Number of cells 56
CHR Number of chemicals 45
CHR Number of study records 117
DEV Number of cells 27
DEV Number of chemicals 24
DEV Number of study records 54
length(unique(acm.sub$chemfact))
length(unique(acm.sub$chemical_id))
length(unique(acm.sub$study_id))
length(unique(acm.chr$chemfact))
length(unique(acm.chr$chemical_id))
length(unique(acm.chr$study_id))
length(unique(acm.dev$chemfact))
length(unique(acm.dev$chemical_id))
length(unique(acm.dev$study_id))

11.3 ACM DEV LEL model

ANOVA for the model is shown.

acm.dev.lel <- lm(lel ~ 0 
              + factor(chemfact)  
              + dose_spacing_center  
              + dose_no
              + study_year_center 
              + sub_purity_center,
              data = acm.dev)

anova(acm.dev.lel)

11.4 ACM DEV LOAEL model

ANOVA for the model is shown.

acm.dev.loael <- lm(loael ~ 0 
                + factor(chemfact)  
                + dose_spacing_center  
                + dose_no
                + study_year_center 
                + sub_purity_center,
                data = acm.dev) 

anova(acm.dev.loael)

11.5 ACM CHR LEL model

ANOVA for the model is shown.

acm.chr.lel <- lm(lel ~ 0 
                  + factor(chemfact)  
                  + dose_spacing_center  
                  + dose_no
                  + study_year_center 
                  + sub_purity_center,
                  data = acm.chr)

anova(acm.chr.lel)

11.6 ACM CHR LOAEL model

ANOVA for the model is shown.

acm.chr.loael <- lm(loael ~ 0 
                  + factor(chemfact)  
                  + dose_spacing_center  
                  + dose_no
                  + study_year_center 
                  + sub_purity_center,
                  data = acm.chr)

anova(acm.chr.loael)

11.7 ACM SUB LEL model

ANOVA for the model is shown.

acm.sub.lel <- lm(lel ~ 0 
                  + factor(chemfact)  
                  + dose_spacing_center  
                  + dose_no
                  + study_year_center 
                  + sub_purity_center,
                  data = acm.sub)

anova(acm.sub.lel)

11.8 ACM SUB LOAEL model

ANOVA for the model is shown.

acm.sub.loael <- lm(loael ~ 0 
                    + factor(chemfact)  
                    + dose_spacing_center  
                    + dose_no
                    + study_year_center 
                    + sub_purity_center,
                    data = acm.sub)

anova(acm.sub.loael)

12 Study type subset model results

See this summary as it is included in the manuscript as Table 4.

12.1 LEL model results by study type and regression type

Two regression types (MLR = multilinear regression, ACM = augmented cell means) were used to build models using data subset by the study type (SUB = subchronic; CHR = chronic; DEV = developmental) for variance estimation. Total variance and MSE are unitless, whereas RMSE is in log10(mg/kg/day) units just like the dataset. % exp = percent total variance explained. N = number of study records in the dataset.

# data.tables used for input datasets for MLR and ACM study type models
sub_list <- list(mlr.sub,
                  acm.sub,
                  mlr.chr,
                  acm.chr,
                  mlr.dev,
                  acm.dev)

# columns used
cols_list <- quos(lel)

# functions applied
round_var_lel <- function(x) round(var(x$lel),3) # this function calculates the variance in loael for each dataset
my_n_lel <- function(dataset) unlist(count(dataset)) # this function calculates the n

sub.res.lel <- sub_list %>% {
  tibble(
    var = map_dbl(., round_var_lel),
    n = map_dbl(., my_n_lel)
  )
}

# MSE of LOAEL models 

sub_model_list_lel <- list(mlr.sub.lel,
                             acm.sub.lel,
                             mlr.chr.lel,
                             acm.chr.lel,
                             mlr.dev.lel,
                             acm.dev.lel)

# function to calculate MSE (sum(residuals^2)/error degrees of freedom)

sub_mse_func_lel <- function(model) {
  round(glance(model)$deviance / model$df.residual, 3)
}

# apply mse and rmse functions to the model_list

sub.mod.res.lel <- sub_model_list_lel %>% {
  tibble(mse = map_dbl(., sub_mse_func_lel)) } %>%
  mutate(rmse = round(sqrt(mse), 3))

sub.lel.model.results <- bind_cols(sub.res.lel, sub.mod.res.lel)

datasets <- c('SUB','SUB','CHR','CHR','DEV','DEV')
regression.type <-c('MLR','ACM','MLR','ACM','MLR','ACM')

sub.lel.table <- data.table(regression.type,
                              datasets,
                              sub.lel.model.results)

setnames(sub.lel.table,
         c('regression.type','datasets', 'var', 'mse', 'rmse', 'n'),
         c('Regression Type', 'Data', 'Total Variance', 'MSE','RMSE', 'N'))
setcolorder(sub.lel.table, c('Regression Type', 'Data', 'Total Variance', 'MSE', 'RMSE', 'N'))
sub.lel.table[,`% exp.` := round(((`Total Variance` - MSE)/`Total Variance`)*100,1)]

kable(sub.lel.table) %>% kable_styling()
Regression Type Data Total Variance MSE RMSE N % exp.
MLR SUB 0.879 0.350 0.592 705 60.2
ACM SUB 1.013 0.301 0.549 92 70.3
MLR CHR 0.952 0.352 0.593 1149 63.0
ACM CHR 0.887 0.395 0.628 117 55.5
MLR DEV 0.604 0.246 0.496 275 59.3
ACM DEV 0.410 0.328 0.573 54 20.0

12.2 LOAEL model results by study type and regression type

Two regression types (MLR = multilinear regression, ACM = augmented cell means) were used to build models using data subset by the study type (SUB = subchronic; CHR = chronic; DEV = developmental) for variance estimation. Total variance and MSE are unitless, whereas RMSE is in log10(mg/kg/day) units just like the dataset. % exp = percent total variance explained. N = number of study records in the dataset.

# data.tables used for input datasets for MLR and ACM study type models
sub_list <- list(mlr.sub,
                  acm.sub,
                  mlr.chr,
                  acm.chr,
                  mlr.dev,
                  acm.dev)

# columns used
cols_list <- quos(loael)

# functions applied
round_var_loael <- function(x) round(var(x$loael),3) # this function calculates the variance in loael for each dataset
my_n_loael <- function(dataset) unlist(count(dataset)) # this function calculates the n

sub.res.loael <- sub_list %>% {
  tibble(
    var = map_dbl(., round_var_loael),
    n = map_dbl(., my_n_loael)
  )
}

# MSE of LOAEL models 

sub_model_list_loael <- list(mlr.sub.loael,
                             acm.sub.loael,
                             mlr.chr.loael,
                             acm.chr.loael,
                             mlr.dev.loael,
                             acm.dev.loael)

# function to calculate MSE (sum(residuals^2)/error degrees of freedom)

sub_mse_func_loael <- function(model) {
  round(glance(model)$deviance / model$df.residual, 3)
}

# apply mse and rmse functions to the model_list

sub.mod.res.loael <- sub_model_list_loael %>% {
  tibble(mse = map_dbl(., sub_mse_func_loael)) } %>%
  mutate(rmse = round(sqrt(mse), 3))

sub.loael.model.results <- bind_cols(sub.res.loael, sub.mod.res.loael)

datasets <- c('SUB','SUB','CHR','CHR','DEV','DEV')
regression.type <-c('MLR','ACM','MLR','ACM','MLR','ACM')

sub.loael.table <- data.table(regression.type,
                              datasets,
                              sub.loael.model.results)

setnames(sub.loael.table,
         c('regression.type','datasets', 'var', 'mse', 'rmse', 'n'),
         c('Regression Type', 'Data', 'Total Variance', 'MSE','RMSE', 'N'))
setcolorder(sub.loael.table, c('Regression Type', 'Data', 'Total Variance', 'MSE', 'RMSE', 'N'))
sub.loael.table[,`% exp.` := round(((`Total Variance` - MSE)/`Total Variance`)*100,1)]

kable(sub.loael.table) %>% kable_styling()
Regression Type Data Total Variance MSE RMSE N % exp.
MLR SUB 0.782 0.277 0.526 705 64.6
ACM SUB 0.904 0.250 0.500 92 72.3
MLR CHR 0.795 0.252 0.502 1149 68.3
ACM CHR 0.825 0.265 0.515 117 67.9
MLR DEV 0.594 0.217 0.466 275 63.5
ACM DEV 0.398 0.316 0.562 54 20.6

13 Empirical cumulative distribution of residuals from the MLR model of the full dataset

See the manuscript where this is included as Figure 7. The minimum prediction interval for this dataset cannot be smaller than the 2.5 to 97.5 percentiles of the residuals on the model of these data.

The 2.5 and 97.5 percentiles are shown:

# get the cumulative distribution for the main model residuals

mlr.ecd <- ecdf(residuals(mlr.lel))
quantile(mlr.ecd, probs=c(0.025, 0.975))
##      2.5%     97.5% 
## -1.098148  1.002591
#range(residuals(mlr.lel)) #-2.462775  2.419666
#sd(residuals(mlr.lel)) #0.519
#mean(residuals(mlr.lel))
#2*(sd(residuals(mlr.lel))) #1.0384 #thus we find our residuals are slightly long-tailed

In this code section, the cumulative distribution of residuals for a normal data set is calculated.

# get the cumulative distribution for a normal data set
set.seed(10000)
normal.numbers = rnorm(2724, mean=0, sd=0.519)
#range(normal.numbers)
normal.ecdf = ecdf(normal.numbers)

Then, a plot is used to compare the ECDF of the MLR LEL model residuals and a normal data set. See Figure 6 in the manuscript.

model.residuals <- as.data.table(residuals(mlr.lel))
model.residuals <- model.residuals[order(V1)]

norm.residuals <- as.data.table(normal.numbers)
norm.residuals <- norm.residuals[order(normal.numbers)]

par(mfrow=c(1,2))
plot(mlr.ecd,
     main=NULL,
     xlab="Residual", ylab = "Cumulative Proportion", cex.axis=1.25, cex.lab=1.25, cex.main=2.0,
     xlim=c(-3,3), xaxs='i', 
     do.points=FALSE, 
     verticals=TRUE)
plot(normal.ecdf,
     do.points=FALSE, 
     add=TRUE, 
     col='blue', verticals=TRUE)
abline(h = 0.025, lty = 3)
abline(h = 0.975, lty = 3)
abline(v = -1.098148, lty =3 )
abline(v =  1.002591, lty=3 )
title("A", adj=0)

plot(norm.residuals$normal.numbers ~ model.residuals$V1,
     xlab='Ordered MLR LEL Residuals',
     ylab='Ordered Normal Distribution Sample',
     xlim=c(-3,3),
     cex.axis=1.25, cex.lab=1.25, cex.main=2.0 )
abline(a=0,b=1)
title("B", adj=0)

We also examined the results of a Kolmogorov-Smirnov Test, skewness, and kurtosis to understand how much the MLR LEL model residuals might deviate from a normal distribution.

ks.test(residuals(mlr.lel), normal.numbers) #kolmogorov smirnov test
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  residuals(mlr.lel) and normal.numbers
## D = 0.037078, p-value = 0.04728
## alternative hypothesis: two-sided

Skewness and Kurtosis of the MLR LEL model residual distribution Given that the skewness is between -0.5 and 0.5, this suggests the model residual distribution is approximately symmetric. Given that the excess kurtosis is positive but less than 3, the MLR LEL model residual distribution appears leptokurtic (slightly heavy-tailed).

skew.mlr.lel.residuals <- skewness(model.residuals$V1) #-0.253 #If skewness is between -0.5 and 0.5, the distribution is approximately symmetric.
kurt.mlr.lel.residuals <- kurtosis(model.residuals$V1) -3 #1.31; leptokurtic (high peak, heavy tails)
Metric <- c('Skewness', 'Excess Kurtosis')
Value <- c(skew.mlr.lel.residuals, kurt.mlr.lel.residuals)
norm.testing <- as.data.table(cbind(Metric,Value))
kable(norm.testing) %>% kable_styling()
Metric Value
Skewness -0.253000573059661
Excess Kurtosis 1.30902771445109

14 Range of prediction intervals for LEL and LOAEL prediction

The table below is included in the manuscript as Table 5. Different estimates of prediction interval are calculated. The prediction interval width is the upper bound of the interval divided by the lower bound of the interval in fold-mg/kg/day units. For context, the lower and upper bounds of a prediction interval for 1, 10, and 100 mg/kg/day “true” POD values are shown.

UB= upper bound; LB = lower bound; mkd = mg/kg/day

# create table of values for prediction intervals

# first calculate based on ecdf of the MLR regression model residuals
ecdf.dt <- data.table(true = c(1,10,100), pred.lb = c(0,0,0), pred.ub = c(0,0,0))
ecdf.dt[,pred.lb := round(true*(10^(quantile(mlr.ecd,prob=0.025))),2)]
ecdf.dt[,pred.ub := round(true*(10^(quantile(mlr.ecd,prob=0.975))),2)]


ecdf.row <- data.table('ECDF of MLR model residuals', 'LEL', 'full dataset', 'NA', 
                       ecdf.dt[true==1,pred.lb], 
                       ecdf.dt[true==1,pred.ub],
                       ecdf.dt[true==10,pred.lb],
                       ecdf.dt[true==10,pred.ub],
                       ecdf.dt[true==100,pred.lb],
                       ecdf.dt[true==100,pred.ub])
setnames(ecdf.row, c('V1', 'V2', 'V3','V4','V5','V6','V7','V8','V9','V10'),
c('Regression Type','POD Type','Data','RMSE','1 mkd LB','1 mkd UB','10 mkd LB','10 mkd UB','100 mkd LB', '100 mkd UB'))

lel.table[,"POD Type":= 'LEL']
loael.table[,"POD Type":= 'LOAEL']
sub.lel.table[,"POD Type":= 'LEL']
sub.loael.table[,"POD Type":= 'LOAEL']
total <- rbind(lel.table, loael.table, sub.lel.table, sub.loael.table)
total <- total[order(-RMSE),c('Regression Type', 'POD Type', 'Data', 'RMSE')]
total[,"1 mkd LB" := round(10^(qnorm(0.025)*RMSE),2)]
total[,"1 mkd UB" := round(10^(qnorm(0.975)*RMSE),2)]
total[,"10 mkd LB" := round(10*(10^(qnorm(0.025)*RMSE)),2)]
total[,"10 mkd UB" := round(10*(10^(qnorm(0.975)*RMSE)),2)]
total[,"100 mkd LB" := round(100*(10^(qnorm(0.025)*RMSE)),2)]
total[,"100 mkd UB" := round(100*(10^(qnorm(0.975)*RMSE)),2)]
total2 <- rbind(ecdf.row, total)
total2[,'Minimum Prediction Interval Width' := round(`1 mkd UB`/`1 mkd LB`,0)]
setcolorder(total2, c(1:4,11,5:10))
kable(total2) %>% kable_styling()
Regression Type POD Type Data RMSE Minimum Prediction Interval Width 1 mkd LB 1 mkd UB 10 mkd LB 10 mkd UB 100 mkd LB 100 mkd UB
ECDF of MLR model residuals LEL full dataset NA 126 0.08 10.06 0.80 100.60 7.98 1005.98
ACM LEL CHR 0.628 284 0.06 17.02 0.59 170.16 5.88 1701.60
MLR LEL full cell dataset 0.622 276 0.06 16.56 0.60 165.61 6.04 1656.14
RR LEL full dataset 0.599 213 0.07 14.93 0.67 149.29 6.70 1492.86
MLR LEL CHR 0.593 208 0.07 14.53 0.69 145.30 6.88 1452.98
MLR LEL SUB 0.592 207 0.07 14.46 0.69 144.64 6.91 1446.43
MLR LEL full dataset 0.589 204 0.07 14.27 0.70 142.70 7.01 1426.98
MLR LEL high leverage points removed 0.582 198 0.07 13.83 0.72 138.26 7.23 1382.61
MLR LEL high Cooks distance plot points removed 0.582 198 0.07 13.83 0.72 138.26 7.23 1382.61
ACM LEL DEV 0.573 166 0.08 13.28 0.75 132.76 7.53 1327.58
ACM LEL full cell dataset 0.566 161 0.08 12.86 0.78 128.63 7.77 1286.29
ACM LOAEL DEV 0.562 158 0.08 12.63 0.79 126.33 7.92 1263.28
MLR LOAEL full cell dataset 0.559 156 0.08 12.46 0.80 124.63 8.02 1246.29
ACM LEL SUB 0.549 149 0.08 11.91 0.84 119.13 8.39 1191.30
MLR LOAEL SUB 0.526 119 0.09 10.74 0.93 107.38 9.31 1073.84
RR LOAEL full dataset 0.516 103 0.10 10.26 0.97 102.65 9.74 1026.46
ACM LOAEL CHR 0.515 102 0.10 10.22 0.98 102.18 9.79 1021.84
MLR LEL high Cooks distance points removed 0.511 100 0.10 10.04 1.00 100.36 9.96 1003.56
MLR LEL all potential outliers removed 0.511 100 0.10 10.04 1.00 100.36 9.96 1003.56
MLR LOAEL full dataset 0.508 99 0.10 9.90 1.01 99.01 10.10 990.06
MLR LOAEL high Cooks distance plot points removed 0.504 97 0.10 9.72 1.03 97.23 10.28 972.35
ACM LOAEL full cell dataset 0.504 97 0.10 9.72 1.03 97.23 10.28 972.35
MLR LOAEL CHR 0.502 96 0.10 9.64 1.04 96.36 10.38 963.61
MLR LOAEL high leverage points removed 0.5 96 0.10 9.55 1.05 95.50 10.47 954.95
ACM LOAEL SUB 0.5 96 0.10 9.55 1.05 95.50 10.47 954.95
MLR LEL DEV 0.496 85 0.11 9.38 1.07 93.79 10.66 937.87
MLR LOAEL DEV 0.466 68 0.12 8.19 1.22 81.91 12.21 819.11
MLR LOAEL high Cooks distance points removed 0.447 58 0.13 7.52 1.33 75.18 13.30 751.80
MLR LOAEL all potential outliers removed 0.447 58 0.13 7.52 1.33 75.18 13.30 751.80

14.1 Visualization of the minimum prediction intervals

total2.long <- melt.data.table(data=total2,
                               id.vars=c('Regression Type','POD Type','Data','RMSE','Minimum Prediction Interval Width'),
                               measure.vars = c('1 mkd LB', '1 mkd UB','10 mkd LB','10 mkd UB','100 mkd LB', '100 mkd UB'),
                               variable.name = 'Bounds',
                               value.name = 'Prediction')

total2.long[,model := paste0(`Regression Type`,'_',`POD Type`,'_',Data)]
total2.long[`Bounds` %in% c('1 mkd LB','1 mkd UB'), truth := 1]
total2.long[`Bounds` %in% c('10 mkd LB','10 mkd UB'), truth := 10]
total2.long[`Bounds` %in% c('100 mkd LB','100 mkd UB'), truth := 100]
total2.long[`Bounds` %in% c('1 mkd LB', '10 mkd LB', '100 mkd LB'), interval := 'lower']
total2.long[`Bounds` %in% c('1 mkd UB', '10 mkd UB', '100 mkd UB'), interval := 'upper']

total3 <- dcast.data.table(data=total2.long, `Regression Type`+ `POD Type`+ `Data` + RMSE + model + truth ~ interval,
                           value.var = 'Prediction')

total4 <- total3[order(upper)]

ggplot(total4[truth==10]) + 
  geom_errorbar(aes(x=reorder(model,upper),ymin=lower, ymax=upper, color=Data, lty = `POD Type`), width=0.3) +
  labs(y="Prediction bounds, mg/kg/day", 
       x="Model",
       title="Truth = 10 mg/kg/day"
       ) +
  #facet_wrap( ~ truth, scales='free_y', shrink=TRUE) +
  scale_x_discrete(label= function(x) strtrim(x,12))+
  #scale_y_continuous(breaks = seq(0, 20, 2.5)) +
  scale_y_log10()+
  annotation_logticks(side="l", base=10)+
  scale_color_manual(breaks=c("CHR","DEV","SUB","full cell dataset","full dataset","all potential outliers removed",
                              "high Cooks distance plot points removed","high Cooks distance points removed","high leverage points removed"), values=c("#999999", "#E69F00", "#56B4E9", "#009E73",
          "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000"), 
                     labels=c("CHR","DEV","SUB","full cell dataset","full dataset","all potential outliers removed",
                              "high Cooks distance plot points removed","high Cooks distance points removed","high leverage points removed"))+
  theme_classic() +
  theme(plot.title=element_text(hjust=0.5),
        panel.border=element_rect(colour="black",size=1, fill=NA),
        panel.grid.minor.y = element_line(size = (0.2), colour="grey"),
        panel.grid.major.y = element_line(size = (0.2), colour="grey"),
        axis.ticks.x=element_blank(),
        legend.position="left",
        legend.title = element_blank(),
        #legend.key.size =  unit(0.5, "in"),
        legend.text = element_text(size=7),
        #panel.border=element_blank(),
        axis.text.x=element_text(angle = 90, size=9, vjust =1, hjust=1),
        axis.text.y=element_text(size=12),
        axis.title.y=element_text(size=13))

15 Range of LEL values per cell in the ACM models, by study type

The LEL values are presented by study type in the ACM subsets (SUB, CHR, and DEV) are presented. The dot represents the LEL mean and the line shows the range from minimum to maximum for each cell of the respective ACM models. Species is denoted by color, with gray = dog, green = mouse, black = rabbit, and blue = rat. One line represents one cell, but there may be more than one cell for each chemical. These are ordered by the mean LEL and the order of cells is not the same across the SUB, CHR, and DEV sections.

dat.acm.comparison <- dat.cell.use[ study_type %in% c("CHR", "DEV", "SUB")] %>%
  .[ , list(chem, chemfact, study_id, study_type, species, lel_min, lel_max, lel_mean)] %>%
  unique() %>%
  .[ , chem_ct:=.N, by=chem] %>%
  .[ , mean_studytype:=mean(lel_mean), by=study_type]

dat.acm.comparison <- unique(dat.acm.comparison[,c('chemfact', 'study_type', 'species', 'lel_min', 
                                                   'lel_max', 'lel_mean')])

dat.acm.comparison <- dat.acm.comparison[order(study_type, lel_mean), ] #sort
dat.acm.comparison$chemfact <- factor(dat.acm.comparison$chemfact, 
                                      levels = dat.acm.comparison$chemfact[order(dat.acm.comparison$lel_mean)])

dat.acm.comparison$study_type <- factor(dat.acm.comparison$study_type,
                                        levels=c('SUB','CHR','DEV'))

ggplot(dat.acm.comparison) + 
  geom_point(aes(y=lel_mean, x=chemfact, color = species)) +
  geom_errorbar(data=dat.acm.comparison , aes(x=chemfact,ymin=lel_min, ymax=lel_max, color=species), width=0.3) +
  labs(y="LEL log10(mg/kg/day)", 
       x="Cell"
       #title="HED LEL Range by Study Type and Cell"
       ) +
  facet_grid( ~ study_type, scales='free_x') +
  scale_y_continuous(breaks = seq(-4, 4, .5)) +
  scale_color_manual(breaks=c('dog', 'mouse', 'rabbit', 'rat'), values=c("#999999",
                                                                         "#009E73",
                                                                         "#000000",
                                                                         "#56B4E9"), 
                     labels=c('dog', 'mouse', 'rabbit', 'rat'))+
  theme_classic() +
  theme(
    #plot.title = element_text(hjust=0.5, face="bold", size = 17),
        panel.border=element_rect(colour="black",size=1, fill=NA),
        panel.grid.minor.y = element_line(size = (0.2), colour="grey"),
        panel.grid.major.y = element_line(size = (0.2), colour="grey"),
        axis.ticks.x=element_blank(),
        legend.position="bottom",
        legend.title = element_text(size=15, face="bold"),
        legend.key.size =  unit(0.5, "in"),
        legend.text = element_text(size=15),
        #panel.border=element_blank(),
        axis.text.x=element_blank(),
        axis.text.y=element_text(size=12),
        axis.title.y=element_text(size=13))

16 Comparison of subchronic and chronic values

To evaluate how different subchronic and chronic values might be from each other.

16.1 Construct the dataset and plot the relationships

Using the full MLR LEL dataset, plot median SUB LOAEL values vs. median CHR LOAEL values by chemical for chemicals with more than one study each of CHR and SUB.

st <- c("CHR", "SUB")

dat.comp2 <- dat.mlr.full[,list(chem,chemical_id,study_type,species,lel,loael)] %>%
  .[study_type %in% st] %>%
  .[, chnm := paste(unlist(strsplit(chem, split='||',fixed=TRUE))[[2]]), by=chemical_id] %>%  
  .[, chem_st_ct:=.N, by=list(chemical_id, study_type)] %>% .[ chem_st_ct > 1] %>% 
  .[, med.loael:=median(loael), by=list(chemical_id, study_type, species)]

s.c.ratio <- dcast.data.table(dat.comp2,
                              chnm + chem + chemical_id ~ study_type + species,
                              value.var = 'med.loael',
                              fun.aggregate = median)

s.c.ratio[,dog.s.c := ifelse(!is.na(SUB_dog|!is.na(CHR_dog)), SUB_dog - CHR_dog, NA)]
s.c.ratio[,mouse.s.c := ifelse(!is.na(SUB_mouse|!is.na(CHR_mouse)), SUB_mouse - CHR_mouse, NA)]
s.c.ratio[,rat.s.c := ifelse(!is.na(SUB_rat|!is.na(CHR_rat)), SUB_rat - CHR_rat, NA)]

#nrow(s.c.ratio[!is.na(mouse.s.c)]) #141
#nrow(s.c.ratio[!is.na(rat.s.c)]) #204
#nrow(s.c.ratio[!is.na(dog.s.c)]) #119


mouse <- ggplot(data=s.c.ratio[!is.na(mouse.s.c)], aes(y=CHR_mouse, x=SUB_mouse)) +
  geom_point()+
  stat_smooth(method='lm', col='red')+
  labs(y="Median CHR LOAEL log10(mg/kg/day)",
       x="Median SUB LOAEL log10(mg/kg/day)",
       title = "mouse"
       ) +
  theme_bw()+
  theme(
        axis.text.x=element_text(size=12),
        axis.text.y=element_text(size=12),
        axis.title.x=element_text(size=12),
        axis.title.y=element_text(size=12))+
  coord_cartesian(xlim=c(-1,4), ylim=c(-1.5,3.5))+
  annotate("text", x=3, y = -1, label="N=141", size=6)+
  annotate("text", x=-0.75, y=3.35, label = "B",size=7)

rat <- ggplot(data=s.c.ratio[!is.na(rat.s.c)], aes(y=CHR_rat, x=SUB_rat)) +
  geom_point()+
  stat_smooth(method='lm', col='red')+
  labs(y="Median CHR LOAEL log10(mg/kg/day)",
       x="Median SUB LOAEL log10(mg/kg/day)",
       title = "rat"
       ) +
  theme_bw()+
  theme(
        axis.text.x=element_text(size=12),
        axis.text.y=element_text(size=12),
        axis.title.x=element_text(size=12),
        axis.title.y=element_text(size=12))+
coord_cartesian(xlim=c(-1,4), ylim=c(-1.5,3.5))+
  annotate("text", x=3, y = -1, label="N=204", size=6)+
  annotate("text", x=-0.75, y=3.35, label = "C",size=7)

dog <- ggplot(data=s.c.ratio[!is.na(dog.s.c)], aes(y=CHR_dog, x=SUB_dog)) +
  geom_point()+
  stat_smooth(method='lm', col='red')+
  labs(y="Median CHR LOAEL log10(mg/kg/day)",
       x="Median SUB LOAEL log10(mg/kg/day)",
       title = "dog"
       ) +
  theme_bw()+
  theme(
        axis.text.x=element_text(size=12),
        axis.text.y=element_text(size=12),
        axis.title.x=element_text(size=12),
        axis.title.y=element_text(size=12))+
  coord_cartesian(xlim=c(-1,4), ylim=c(-1.5,3.5))+
  annotate("text", x=3, y = -1, label="N=119", size=6)+
  annotate("text", x=-0.75, y=3.35, label = "A",size=7)

grid.arrange(arrangeGrob(dog, mouse, rat, ncol=3,nrow=1))

The linear relationships between subchronic and chronic median LOAEL values by species vary slightly.

16.2 dog SUB to CHR linear model

fit.dog <- lm(CHR_dog ~ SUB_dog, data=s.c.ratio[!is.na(dog.s.c)])
summary(fit.dog)
## 
## Call:
## lm(formula = CHR_dog ~ SUB_dog, data = s.c.ratio[!is.na(dog.s.c)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2767 -0.3153  0.0521  0.2726  2.8680 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.34151    0.10511   3.249  0.00151 ** 
## SUB_dog      0.66678    0.06188  10.776  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.541 on 117 degrees of freedom
## Multiple R-squared:  0.4981, Adjusted R-squared:  0.4938 
## F-statistic: 116.1 on 1 and 117 DF,  p-value: < 2.2e-16

16.3 mouse SUB to CHR linear model

fit.mouse <- lm(CHR_mouse ~ SUB_mouse, data=s.c.ratio[!is.na(mouse.s.c)])
summary(fit.mouse)
## 
## Call:
## lm(formula = CHR_mouse ~ SUB_mouse, data = s.c.ratio[!is.na(mouse.s.c)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.49990 -0.31189  0.06877  0.36692  1.13837 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.42903    0.14414   2.977  0.00344 ** 
## SUB_mouse    0.63277    0.05947  10.639  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5954 on 139 degrees of freedom
## Multiple R-squared:  0.4488, Adjusted R-squared:  0.4449 
## F-statistic: 113.2 on 1 and 139 DF,  p-value: < 2.2e-16

16.4 rat SUB to CHR linear model

fit.rat <- lm(CHR_rat ~ SUB_rat, data=s.c.ratio[!is.na(rat.s.c)])
summary(fit.rat)
## 
## Call:
## lm(formula = CHR_rat ~ SUB_rat, data = s.c.ratio[!is.na(rat.s.c)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.58175 -0.28735  0.01142  0.30918  1.55837 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.14060    0.09003   1.562     0.12    
## SUB_rat      0.74240    0.04468  16.614   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5204 on 202 degrees of freedom
## Multiple R-squared:  0.5774, Adjusted R-squared:  0.5753 
## F-statistic:   276 on 1 and 202 DF,  p-value: < 2.2e-16