This code accompanies the manuscript, Pham et al., “Variability in in vivo Toxicity Studies - Defining the upper limit of predictivity for models of systemic effect levels.” The corresponding author can be reached at paul-friedman.katie@epa.gov
for any additional questions.
The code sections below provide the R setup, custom functions, database retrieved, and dataset compilation. The code is hidden by default but can be expanded for viewing globally by clicking “Show All Code” or by section by clicking the “code” boxes.
See the R sessionInfo() used below.
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 16299)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] moments_0.14 broom_0.5.2 purrr_0.3.2
## [4] kableExtra_1.1.0 gridExtra_2.3 ggpubr_0.2.1
## [7] cowplot_1.0.0 rmarkdown_1.15 openxlsx_4.1.0.1
## [10] olsrr_0.5.2 MASS_7.3-51.4 ggalt_0.4.0
## [13] ggthemes_4.2.0 DT_0.8 magrittr_1.5
## [16] ggplot2_3.2.0 reshape2_1.4.3 dplyr_0.8.3
## [19] data.table_1.12.6 RMySQL_0.10.17 DBI_1.0.0
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-140 lubridate_1.7.4 webshot_0.5.1
## [4] ash_1.0-15 RColorBrewer_1.1-2 httr_1.4.0
## [7] gh_1.0.1 tools_3.6.1 backports_1.1.5
## [10] R6_2.4.0 rpart_4.1-15 KernSmooth_2.23-15
## [13] nortest_1.0-4 lazyeval_0.2.2 colorspace_1.4-1
## [16] nnet_7.3-12 withr_2.1.2 tidyselect_0.2.5
## [19] curl_4.0 compiler_3.6.1 extrafontdb_1.0
## [22] rvest_0.3.4 xml2_1.2.2 scales_1.0.0
## [25] checkmate_1.9.4 proj4_1.0-8 readr_1.3.1
## [28] goftest_1.1-1 stringr_1.4.0 digest_0.6.21
## [31] foreign_0.8-71 rio_0.5.16 pkgconfig_2.0.3
## [34] htmltools_0.3.6 extrafont_0.17 maps_3.3.0
## [37] htmlwidgets_1.3 rlang_0.4.0 readxl_1.3.1
## [40] rstudioapi_0.10 shiny_1.3.2 generics_0.0.2
## [43] jsonlite_1.6 zip_2.0.3 car_3.0-3
## [46] Matrix_1.2-17 Rcpp_1.0.2 munsell_0.5.0
## [49] abind_1.4-5 stringi_1.4.3 yaml_2.2.0
## [52] carData_3.0-2 plyr_1.8.4 recipes_0.1.6
## [55] promises_1.0.1 forcats_0.4.0 crayon_1.3.4
## [58] lattice_0.20-38 haven_2.1.1 splines_3.6.1
## [61] hms_0.5.1 zeallot_0.1.0 knitr_1.24
## [64] pillar_1.4.2 ggsignif_0.5.0 glue_1.3.1
## [67] evaluate_0.14 vctrs_0.2.0 httpuv_1.5.1
## [70] Rttf2pt1_1.3.7 cellranger_1.1.0 tidyr_0.8.3
## [73] gtable_0.3.0 assertthat_0.2.1 xfun_0.9
## [76] gower_0.2.1 mime_0.7 prodlim_2018.04.18
## [79] xtable_1.8-4 later_0.8.0 viridisLite_0.3.0
## [82] class_7.3-15 survival_2.44-1.1 timeDate_3043.102
## [85] tibble_2.1.3 lava_1.6.6 ipred_0.9-9
# functions
# organize sex from studies
trsexConcat <- function(tr){
dat_tr <- copy(tr)
dat_sex <- unique(tr[ , list(chem, study_id, sex)])
dat_sex[ , all_sex := .(concat = paste0(sex, collapse = ",")), by = study_id]
dat_sex2 <- unique(dat_sex[ , list(chem, study_id, all_sex)])
setkey(dat_sex2, "study_id")
setkey(dat_tr, "study_id")
dat_tr <- dat_tr[dat_sex2]
## all sex M, F, MF
dat_tr[all_sex == "F,M", all_sex := "MF"]
dat_tr[all_sex == "MF,F", all_sex := "MF"]
dat_tr[all_sex == "MF,M", all_sex := "MF"]
dat_tr[all_sex == "F,M,MF", all_sex := "MF"]
dat_tr[all_sex == "MF,F,M", all_sex := "MF"]
dat_tr[all_sex == "M,MF,F", all_sex := "MF"]
dat_tr[all_sex == "F,MF,M", all_sex := "MF"]
dat_tr[all_sex == "M,F", all_sex := "MF"]
dat_tr[all_sex == "F,MF", all_sex := "MF"]
dat_tr[all_sex == "M,F,MF", all_sex := "MF"]
dat <- unique(dat_tr)
return(dat)
}
# Leave-one-out function for the MLR
LinMeanSqCompar <- function(dat_lm){
ta = dat_lm ## Whole Model
Full_Model <- anova(ta) %>%
.$`Mean Sq` %>%
tail(dat_lm, n = 1) ## Extracting the Residual Sum Sq.
tb = update(ta, ~ . - factor(chem)) ## Build model but without chem as a variable
No_Chemical <- anova(tb) %>%
.$`Mean Sq` %>%
tail(dat_lm, n = 1)
tc = update(ta, ~ . - factor(strain_group))
No_Strain_group <- anova(tc) %>%
.$`Mean Sq` %>%
tail(dat_lm, n = 1)
td = update(ta, ~ . - factor(study_type))
No_Study_type <- anova(td) %>%
.$`Mean Sq` %>%
tail(dat_lm, n = 1)
te = update(ta, ~ . - factor(admin_method))
No_Admin_method <- anova(te) %>%
.$`Mean Sq` %>%
tail(dat_lm, n = 1)
tf = update(ta, ~ . - dose_spacing_center)
No_Dose_spacing <- anova(tf) %>%
.$`Mean Sq` %>%
tail(dat_lm, n = 1)
tg = update(ta, ~ . - dose_no)
No_Dose_no <- anova(tg) %>%
.$`Mean Sq` %>%
tail(dat_lm, n = 1)
th = update(ta, ~ . - study_year_center)
No_Study_year <- anova(th) %>%
.$`Mean Sq` %>%
tail(dat_lm, n = 1)
ti = update(ta, ~ . - sub_purity_center)
No_Substance_purity <- anova(ti) %>%
.$`Mean Sq` %>%
tail(dat_lm, n = 1)
tj = update(ta, ~ . - factor(study_source))
No_Study_source <- anova(tj) %>%
.$`Mean Sq` %>%
tail(dat_lm, n = 1)
tk = update(ta, ~ . - factor(all_sex))
No_sex <- anova(tk) %>%
.$`Mean Sq` %>%
tail(dat_lm, n = 1)
tbta <- anova(tb, ta) %>% .$`Pr(>F)`
tcta <- anova(tc, ta) %>% .$`Pr(>F)`
tdta <- anova(td, ta) %>% .$`Pr(>F)`
teta <- anova(te, ta) %>% .$`Pr(>F)`
tfta <- anova(tf, ta) %>% .$`Pr(>F)`
tgta <- anova(tg, ta) %>% .$`Pr(>F)`
thta <- anova(th, ta) %>% .$`Pr(>F)`
tita <- anova(ti, ta) %>% .$`Pr(>F)`
tjta <- anova(tj, ta) %>% .$`Pr(>F)`
tkta <- anova(tk, ta) %>% .$`Pr(>F)`
lmst_MS <- rbind(Full_Model,
No_Chemical,
No_Strain_group,
No_Study_type,
No_Admin_method,
No_Dose_spacing,
No_Dose_no,
No_Study_year,
No_Substance_purity,
No_Study_source,
No_sex )
lmst_P <- rbind(0, tbta, tcta, tdta, teta, tfta, tgta, thta, tita, tjta, tkta)
lmst <- cbind(lmst_MS, lmst_P) %>% .[ , -2] %>% signif(3)
colnames(lmst) <- c("MSE", "p-val")
return(lmst)
}
The DOI for this database is: https://doi.org/10.23645/epacomptox.6062545.v3 A reference publication describing the construction of ToxRefDB version 2.0 can be found here: https://doi.org/10.1016/j.reprotox.2019.07.012
toxrefdb_2_0 <- dbGetQuery(con, "SELECT
chemical.chemical_id,
chemical.dsstox_substance_id,
chemical.casrn,
chemical.preferred_name,
study.study_id,
study.chemical_id,
study.processed,
study.study_type,
study.study_year,
study.study_source,
study.species,
study.strain_group,
study.admin_route,
study.admin_method,
study.substance_purity,
study.dose_start,
study.dose_start_unit,
study.dose_end,
study.dose_end_unit,
endpoint.endpoint_category,
endpoint.endpoint_type,
endpoint.endpoint_target,
endpoint.endpoint_id,
tg_effect.life_stage,
tg_effect.tg_effect_id,
effect.effect_id,
effect.effect_desc,
tg.sex,
tg.generation,
dose.dose_level,
dtg.dose_adjusted,
dtg.dose_adjusted_unit,
dtg_effect.treatment_related,
dtg_effect.critical_effect,
tested_status,
reported_status
FROM
(((((((((prod_toxrefdb_2_0.chemical INNER JOIN prod_toxrefdb_2_0.study ON chemical.chemical_id=study.chemical_id)
LEFT JOIN prod_toxrefdb_2_0.dose ON dose.study_id=study.study_id)
LEFT JOIN prod_toxrefdb_2_0.tg ON tg.study_id=study.study_id)
LEFT JOIN prod_toxrefdb_2_0.dtg ON tg.tg_id=dtg.tg_id AND dose.dose_id=dtg.dose_id)
LEFT JOIN prod_toxrefdb_2_0.tg_effect ON tg.tg_id=tg_effect.tg_id)
LEFT JOIN prod_toxrefdb_2_0.dtg_effect ON tg_effect.tg_effect_id=dtg_effect.tg_effect_id AND dtg.dtg_id=dtg_effect.dtg_id)
LEFT JOIN prod_toxrefdb_2_0.effect ON effect.effect_id=tg_effect.effect_id)
LEFT JOIN prod_toxrefdb_2_0.endpoint ON endpoint.endpoint_id=effect.endpoint_id)
LEFT JOIN prod_toxrefdb_2_0.obs ON obs.study_id=study.study_id AND obs.endpoint_id=endpoint.endpoint_id)") %>%
data.table()
#save(toxrefdb_2_0, file='RData/toxrefdb_2_0.RData')
#length(unique(toxrefdb_2_0$dsstox_substance_id)) #1142
#length(unique(toxrefdb_2_0$study_id)) #5960
## Prepare the toxrefdb_2_0 dataset for modeling
toxrefdb_2_0[!is.na(dose_level), dose_no := max(dose_level, na.rm = TRUE) , by = study_id]
toxrefdb_2_0[dose_adjusted > 0, log_dose_adjusted := log10(dose_adjusted)]
toxrefdb_2_0[!is.na(log_dose_adjusted), ldt := min(log_dose_adjusted), by = study_id]
toxrefdb_2_0[!is.na(log_dose_adjusted), hdt := max(log_dose_adjusted), by = study_id]
toxrefdb_2_0[ , dose_spacing := (hdt-ldt)/(dose_no-1)]
ar <- c("Oral", "Oral/Gavage")
sp <- c("mouse", "rat", "dog", "rabbit", "Tif: RAI f (SPF)")
st <- c("SUB","CHR", "DEV", "MGR", "SAC" )
ec <- c('systemic')
ls <- c('adult')
gn <- c('F0')
tr2 <- data.table(toxrefdb_2_0) %>% .[processed==1 &
dose_adjusted > 0 &
dose_adjusted_unit == 'mg/kg/day' &
dose_level > 0 &
admin_route %in% ar &
study_type %in% st &
species %in% sp &
endpoint_category %in% ec &
life_stage %in% ls &
generation %in% gn &
treatment_related == 1 &
dose_no > 1,] %>%
.[ , chem := paste(casrn, preferred_name, sep = "||")]
## Standarized data notations
tr2[ species=="Tif: RAI f (SPF)", species := "rat"]
tr2[ , strain_group := tolower(strain_group) ]
tr2[ strain_group %in% c("sprague dawley","sprague-dawley"), strain_group:="sprague_dawley" ]
tr2[ strain_group %in% c("other", NA), strain_group := species ]
tr2[ study_source %in% c("NTP Technical Report", "NTP Report"), study_source:="ntp"]
tr2[ , admin_method:=tolower(admin_method) ]
tr2[ admin_method =="[Not Specified]", admin_method := admin_route ] ## All admin_method[Not Specified] ==> default to admin method
tr2[ substance_purity==">95", substance_purity:="95" ]
tr2[ substance_purity==">99", substance_purity:="99" ]
tr2[ substance_purity=="99.75% gamma isomer of hexachlorocyclohexane", substance_purity:="99.75"]
tr2[ substance_purity=="72 +/- 3" , substance_purity:="72"]
tr2[ substance_purity=="96.8 percent" , substance_purity:="96.8"]
tr2[, substance_purity:=as.numeric(substance_purity)]
tr2[ substance_purity>100, substance_purity:=100]
## Calculating LEL/LOAEL
tr2[ , log_dose_adjusted := log10(dose_adjusted) ]
tr2[ , study_summary := max(treatment_related, na.rm = TRUE), by = study_id ]
tr2[study_summary == 1, lel := min(log_dose_adjusted[treatment_related == 1], na.rm = TRUE), by = list(study_id) ]
tr2[, ce := max(critical_effect, na.rm = TRUE), by = study_id ]
tr2[ce == 1, loael := min(log_dose_adjusted[critical_effect == 1], na.rm = TRUE), by = list(study_id) ]
tr2[is.na(loael), loael := lel]
## Obtain only the variables that are relevant for analysis
dat.tr <- unique(tr2[ , list(chem,
chemical_id,
study_id,
processed,
study_year,
study_source,
study_type,
species,
strain_group,
admin_route,
admin_method,
substance_purity,
dose_no,
dose_start,
dose_start_unit,
dose_end,
dose_end_unit,
sex,
ldt,
hdt,
dose_spacing,
lel,
loael)])
## Concatenate all of the sexes for each study
dat.tr <- trsexConcat(dat.tr) %>%
.[ , sex:=NULL] %>%
unique()
## All "NA" values in study year and substance purity are given mean value
dat.tr[is.na(study_year), study_year := 1991]
dat.tr[is.na(substance_purity), substance_purity := 100 ]
dat.tr[ , sub_purity_center := substance_purity-mean(substance_purity)]
dat.tr[ , study_year_center := study_year-mean(study_year)]
dat.tr[ , dose_spacing_center := dose_spacing-mean(dose_spacing)]
#save(dat.tr, file="dat_toxref_2_0_4model.RData")
Herein the dataset is prepared for MLR.
See code for constructing the full MLR dataset.
load('dat_toxref_2_0_4model.RData')
dat.mlr.full <- unique(dat.tr) %>%
.[ , studies_count :=.N, by=chem] %>%
.[ studies_count > 1 ] %>% # need to ensure greater than one study per chem
.[ , sub_purity_center := substance_purity - mean(substance_purity)] %>%
.[ , study_year_center := study_year - mean(study_year)] %>%
.[ , dose_spacing_center := dose_spacing - mean(dose_spacing)] %>%
.[ , lelvsloael := lel==loael]
#nrow(dat.mlr.full[lelvsloael=='TRUE']) #1830 out of 2724
#1830/2724 #67% of the time
See a visual representation of the study type by species included in the full MLR dataset.
See a table below of the size and variance for the full MLR dataset.
Full MLR dataset | N |
---|---|
Total Number of Chemicals | 563 |
Total Number of Studies | 2724 |
Variance of the LEL | 0.92 |
Variance of the LOAEL | 0.79 |
length(unique(dat.mlr.full$chem)) #563 substances
length(unique(dat.mlr.full$study_id)) #2724 study records
round(var(dat.mlr.full$lel),2) #0.92 is the total variance for the LEL dataset
round(var(dat.mlr.full$loael),2) # 0.79 is the total variance for the LOAEL dataset
In the manuscript, please see Table 2: Variance estimation results for full datasets which describes the outcome of the regression modeling approaches for the full and full cell datasets.
ANOVA for the model is shown.
rr.lel <- rlm(lel ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.mlr.full, method="M")
## ANOVA for RLR full LEL dataset
anova(rr.lel)
ANOVA for the model is shown.
mlr.lel <- lm(lel ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.mlr.full)
anova(mlr.lel) # ANOVA for MLR full LEL dataset
Identify influential data points to understand the stability of the MSE estimate on the MLR for the full LEL dataset using standard plots. This is included as Figure 3 in the manuscript.
# First we examine the standard regression performance plots
par(mfrow= c(3,2))
par(mfrow= c(3,2))
plot(mlr.lel,which=c(1,2,3,4,5), cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.id = 1.5, caption = list("A: Residuals vs Fitted",
"B: Normal Q-Q",
"C: Scale-Location",
"D: Cook's distance",
"E:Residuals vs Leverage"))
## Warning: not plotting observations with leverage one:
## 184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 2084, 2094
## Warning: not plotting observations with leverage one:
## 184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 2084, 2094
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
Second, we examine an alternate way of identifying high Cook’s distance points. Cook’s distance using a common threshold (Cook’s distance > 4/(n-P-1)), where P is the number of covariates in the model. This threshold-based method for identifying high Cook’s distance points suggests ~5% of dataset may be influential.
See below, the figure on the left shows the Cook’s distances for each residual ordered by size, with the dotted line denoting the threshold. The figure on the right shows the influence plot for the standardized residuals.
cooksd <- rev(sort(cooks.distance(mlr.lel)))
#plot(cooksd, pch='*', cex=2, main='Influential Obs by Cooks Distance')
#abline(h=(4/(2724-(length(mlr.lel$coefficients))-1)), col='red')
std_resid <- rstandard(mlr.lel)
cooks_D <- cooks.distance(mlr.lel)
hat_values <- hatvalues(mlr.lel)
# another way to visualize this is an influence plot
# plot diagnostic influence plots together
par(mfrow= c(1,2))
plot(cooksd, pch='*', cex=2, main="Cook's Distance Index",ylab="Cook's Distance")
abline(h=(4/(2724-(length(mlr.lel$coefficients))-1)), lty=2)
plot(hat_values, std_resid, cex=10*sqrt(cooks_D), main = "Influence Plot", xlab="Hat values", ylab="Standardized residuals")
abline(h=c(-2.5,2.5), lty=2)
ANOVA for the model is shown.
# now try recomputing model by removing high leverage points as identified via standard plots from Figure 3
#warning message indicates that: not plotting observations with leverage one:
#184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 2084, 2094
#961, 1916, 1918 are also high leverage points on the plots
# try again removing high leverage and high residual points
dat.mlr.lel.leverage <- dat.mlr.full[ -c(184,500,720,721,736,961,971,1063,1228,1229,1454,1916, 1918,2084,2094), ]%>%
.[ , st.count:=uniqueN(study_id), by=chem]%>% .[ st.count > 1 ]
mlr.lel.leverage <- lm(lel ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.mlr.lel.leverage)
anova(mlr.lel.leverage) # ANOVA for MLR full LEL dataset with high leverage points removed
ANOVA for the model is shown.
#try removing points with high Cook's distance, simply based on the diagnostic plots and not the threshold
dat.mlr.lel.cooks.plot <- dat.mlr.full[ -c(961,1916,1918), ]%>%
.[ , st.count:=uniqueN(study_id), by=chem]%>% .[ st.count > 1 ]
mlr.lel.cooks.plot <- lm(lel ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.mlr.lel.cooks.plot)
anova(mlr.lel.cooks.plot) # ANOVA for MLR full LEL dataset with high Cook's distance plot points removed
ANOVA for the model is shown.
influential <- as.numeric(names(cooksd)[(cooksd > 4/(2724-(length(mlr.lel$coefficients)-1)))])
dat.mlr.lel.cooksd <- dat.mlr.full[-c(influential),]%>% .[ , st.count:=uniqueN(study_id), by=chem]%>% .[ st.count > 1 ]
mlr.lel.cooksd <- lm(lel ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.mlr.lel.cooksd)
anova(mlr.lel.cooksd) # Summary for MLR full LEL dataset with high Cook's distance points removed
ANOVA for the model is shown.
dat.mlr.lel.both <- dat.mlr.full[ -c(184,500,720,721,736,961,971,1063,1228,1229,1454,1916, 1918,2084,2094,influential), ]%>%
.[ , st.count:=uniqueN(study_id), by=chem]%>% .[ st.count > 1 ]
mlr.lel.both <- lm(lel ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.mlr.lel.both)
anova(mlr.lel.both) # ANOVA for MLR full LEL dataset with all potential outliers removed
ANOVA for the model is shown.
rr.loael <- rlm(loael ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.mlr.full, method="M")
anova(rr.loael) # ANOVA for RLR full LOAEL dataset
ANOVA for the model is shown.
mlr.loael <- lm(loael ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.mlr.full)
anova(mlr.loael) # ANOVA for MLR full LOAEL dataset
Identify influential data points to understand the stability of the MSE estimate on the MLR for the full LEL dataset using standard plots.
par(mfrow= c(3,2))
plot(mlr.loael)
## Warning: not plotting observations with leverage one:
## 184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 2084, 2094
## Warning: not plotting observations with leverage one:
## 184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 2084, 2094
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
plot(mlr.loael, which=c(4))
ANOVA for the model is shown.
dat.mlr.loael.leverage <- dat.mlr.full[ -c(56, 60, 184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 1510, 2084, 2094), ]%>%
.[ , st.count:=uniqueN(study_id), by=chem] %>% .[ st.count > 1 ]
mlr.loael.leverage<- lm(loael ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.mlr.loael.leverage)
anova(mlr.loael.leverage) # ANOVA for MLR full LOAEL dataset high leverage points removed
ANOVA for the model is shown.
dat.mlr.loael.cooks.plot <- dat.mlr.full[ -c(644,861,959), ]%>%
.[ , st.count:=uniqueN(study_id), by=chem] %>% .[ st.count > 1 ]
mlr.loael.cooks.plot <- lm(loael ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.mlr.loael.cooks.plot)
anova(mlr.loael.cooks.plot) # ANOVA for MLR full LOAEL dataset high Cook's distance plot points removed
We examined an alternate way of identifying high Cook’s distance points. Cook’s distance using a common threshold (Cook’s distance > 4/(n-P-1)), where P is the number of covariates in the model. This threshold-based method for identifying high Cook’s distance points suggests ~5% of dataset may be influential.
See below, the figure on the left shows the Cook’s distances for each residual ordered by size, with the dotted line denoting the threshold. The figure on the right shows the influence plot for the standardized residuals.
cooksd.loael <- rev(sort(cooks.distance(mlr.loael)))
#plot(cooksd.loael, pch='*', cex=2, main='Influential Obs by Cooks Distance')
#abline(h=(4/(2112-(length(mlr.loael$coefficients))-1)), col='red')
# another way to visualize this is an influence plot
std_resid_loael <- rstandard(mlr.loael)
cooks_D_loael <- cooks.distance(mlr.loael)
hat_values_loael <- hatvalues(mlr.loael)
# plot diagnostic influence plots together
par(mfrow= c(1,2))
plot(cooksd, pch='*', cex=2, main="Cook's Distance Index",ylab="Cook's Distance")
abline(h=(4/(2112-(length(mlr.loael$coefficients))-1)), lty=2)
plot(hat_values_loael, std_resid_loael, cex=10*sqrt(cooks_D_loael), main = "Influence Plot", xlab="Hat values", ylab="Standardized residuals")
abline(h=c(-2.5,2.5), lty=2)
ANOVA for the model is shown.
influential.loael <- as.numeric(names(cooksd.loael)[(cooksd.loael > 4/(2112-(length(mlr.loael$coefficients)-1)))])
dat.mlr.loael.cooksd <- dat.mlr.full[-c(influential.loael),]%>% .[ , st.count:=uniqueN(study_id), by=chem]%>% .[ st.count > 1 ]
mlr.loael.cooksd <- lm(loael ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.mlr.loael.cooksd)
anova(mlr.loael.cooksd) # ANOVA for MLR full LOAEL dataset high Cook's distance points removed
ANOVA for the model is shown
dat.mlr.loael.both <- dat.mlr.full[ -c(56, 60, 184, 500, 720, 721, 736, 971, 1063, 1228, 1229, 1454, 1510, 2084, 2094, influential.loael), ]%>%
.[ , st.count:=uniqueN(study_id), by=chem] %>% .[ st.count > 1 ]
mlr.loael.both <- lm(loael ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.mlr.loael.both)
anova(mlr.loael.both) # ANOVA for MLR full LOAEL dataset high Cook's distance points removed
Following the development of the full model, nested models were developed in a leave-one-out (LOO) approach to evaluate the relative impact of each study parameter on the total variance. An analysis of variance (ANOVA) was used to compare between full and nested models to evaluate the significance of each study descriptor.
See the manuscript where this is included in Table 3. The prefix “No” indicates that the covariate was removed from the model. % exp is the % of the total variance explained.
mlr.lel.LOO <- LinMeanSqCompar(mlr.lel)
mlr.lel.LOO <- as.data.table(mlr.lel.LOO, keep.rownames = TRUE)
total.var <- var(dat.mlr.full$lel)
mlr.lel.LOO[,`%exp` := round((((total.var - MSE)/total.var)*100),1)]
setnames(mlr.lel.LOO, c('rn'), c('Model Parameter Removed'))
kable(mlr.lel.LOO) %>% kable_styling()
Model Parameter Removed | MSE | p-val | %exp |
---|---|---|---|
Full_Model | 0.347 | 0.00e+00 | 62.1 |
No_Chemical | 0.766 | 0.00e+00 | 16.4 |
No_Strain_group | 0.401 | 0.00e+00 | 56.2 |
No_Study_type | 0.371 | 0.00e+00 | 59.5 |
No_Admin_method | 0.350 | 1.25e-03 | 61.8 |
No_Dose_spacing | 0.350 | 2.27e-05 | 61.8 |
No_Dose_no | 0.351 | 1.50e-06 | 61.7 |
No_Study_year | 0.347 | 7.69e-01 | 62.1 |
No_Substance_purity | 0.347 | 1.98e-01 | 62.1 |
No_Study_source | 0.348 | 3.62e-02 | 62.0 |
No_sex | 0.349 | 5.71e-04 | 61.9 |
See the manuscript where this is included in Table 3. The prefix “No” indicates that the covariate was removed from the model. % exp is the % of the total variance explained.
mlr.loael.LOO <- LinMeanSqCompar(mlr.loael)
mlr.loael.LOO <- as.data.table(mlr.loael.LOO, keep.rownames = TRUE)
total.var.loael <- var(dat.mlr.full$loael)
mlr.loael.LOO[,`%exp` := round((((total.var.loael - MSE)/total.var.loael)*100),1)]
setnames(mlr.loael.LOO, c('rn'), c('Model Parameter Removed'))
kable(mlr.loael.LOO) %>% kable_styling()
Model Parameter Removed | MSE | p-val | %exp |
---|---|---|---|
Full_Model | 0.258 | 0.00000 | 67.4 |
No_Chemical | 0.683 | 0.00000 | 13.8 |
No_Strain_group | 0.301 | 0.00000 | 62.0 |
No_Study_type | 0.272 | 0.00000 | 65.7 |
No_Admin_method | 0.258 | 0.05900 | 67.4 |
No_Dose_spacing | 0.259 | 0.00311 | 67.3 |
No_Dose_no | 0.258 | 0.66500 | 67.4 |
No_Study_year | 0.258 | 0.50800 | 67.4 |
No_Substance_purity | 0.258 | 0.49400 | 67.4 |
No_Study_source | 0.259 | 0.00806 | 67.3 |
No_sex | 0.258 | 0.37900 | 67.4 |
Herein the dataset is prepared for ACM.
Semi-automated and manual review of study records included in the “cells” of the ACM resulted in what is referred to as the “full cell dataset.”
dat.acm.full <- data.table(dat.mlr.full)
dat.acm.full[ , chemfact := paste(chem,species,admin_method,study_type,all_sex, sep="||")]
dat.acm.full[ , chemfact_ct:=.N, by=chemfact]
dat.acm <- dat.acm.full[ chemfact_ct > 1] %>%
.[ , st_ct:=.N , by=study_type] %>%
.[ , lel_min:= min(lel), by=chemfact] %>%
.[ , lel_max:= max(lel), by=chemfact] %>%
.[ , lel_mean:=mean(lel), by=chemfact] %>%
.[ , lel_range:=abs(lel_max-lel_min)] %>%
.[ , ds_low:= min(dose_spacing), by=c("chem", "study_type")] %>%
.[ , ds_hight:= max(dose_spacing), by=c("chem", "study_type")] %>%
.[ , ds_range:=abs(ds_low-ds_hight)] %>%
.[ , test_range:= abs(hdt-ldt)]
dat.cell <- as.data.table(dat.acm)
dat.cell[,chemfact_cell := .N, by=chemfact]
# Based on semi-automated and a manual review of the cells, some failed and are removed for this analysis
drop_study_id <- c(5156,5163,5151,5161,3981,3986,526,528,1234,5091,5532,5090,5531,1397,4100,728,6094,6093,6536,2334,2336,338,5919,5783,5784,951,952,3777,
2136,1910,1917,1841,1844,3821,3822,3816,3817,2066,2067,2069,2070,3969,3970,463,464,1440,4032,4865,4892,2764,2766,4452,4453,4446,4461,
4364,4366,4024,4075,4079,4725,4728,2108,2120,2115,2117,2124,2126,1580,1582,321,588,325,326,1206,1208,1205,1207,6071,6412,3528,3530,5811,
5812,4015,4038,2295,4050,2987,2989,2663,2664,3440,3441,3276,3277,6100,6099,6689,6690,2144,2145,1040,2164,7463,5109,5724,5723,2421,2422,
2423,2425,1678,1679,5155,5157,668,4274,6490,6492,2036,7493,3036,4267,433,2817,2819,4474,4478,3531,4276,4278,3947,3951,5613,6740,5598,
6743,5599,6738,5597,6742,214,215,217,216,218,5722,5727,1392,7470,5765,6097,6176,6177,1362,1363,2575,2577,1641,1647,898,5374,1052,4146,
1043,4263,3579,5843,3903,6802,3902,6800,1954,5452, 3900, 6753)
# amend the drop_study_id following additional manual curation
cell.eval.done <- fread('acm_cell_evaluation_1april2019_reviewed.csv')
drop_study_id2 <- c(drop_study_id,
cell.eval.done[study_keep==0, study_id]) # drop_study_id = 177, drop_study_id2 = 213
dat.cell.use <- dat.cell[!study_id %in% drop_study_id2]
See a visual representation of the study type by species included in the full cell dataset for ACM.
See a table below of the size and variance for the full cell dataset.
Full cell dataset | N |
---|---|
Total Number of Chemicals | 96 |
Total Number of Studies | 278 |
Variance of the LEL | 0.86 |
Variance of the LOAEL | 0.75 |
length(unique(dat.cell.use$chem)) #96
length(unique(dat.cell.use$study_id)) #278
round(var( dat.cell.use$lel),2) #0.86
round(var( dat.cell.use$loael),2) #0.75
ACM and MLR modeling were performed on the full cell dataset for comparison.
ANOVA for the model is shown.
acm.lel <- lm(lel ~ 0
+ factor(chemfact)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center,
data = dat.cell.use )
anova(acm.lel) # ANOVA for ACM full cell LEL dataset
ANOVA for the model is shown.
acm.loael <- lm(loael ~ 0
+ factor(chemfact)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center,
data = dat.cell.use)
anova(acm.loael) # ANOVA for ACM full cell LOAEL dataset
par(mfrow= c(2,2))
plot(acm.lel)
## Warning: not plotting observations with leverage one:
## 107, 201, 237
## Warning: not plotting observations with leverage one:
## 107, 201, 237
par(mfrow= c(2,2))
plot(acm.loael)
## Warning: not plotting observations with leverage one:
## 107, 201, 237
## Warning: not plotting observations with leverage one:
## 107, 201, 237
An MLR model was developed for the full cell dataset to compare to the ACM approach for the same dataset. ANOVA for the model is shown.
mlr.acm.lel <- lm(lel ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.cell.use)
anova(mlr.acm.lel) # ANOVA for MLR full cell LEL dataset
An MLR model was developed for the full cell dataset to compare to the ACM approach for the same dataset. ANOVA for the model is shown.
mlr.acm.loael <- lm(loael ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(study_type)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source)
+ factor(all_sex), data = dat.cell.use)
anova(mlr.acm.loael) # ANOVA for MLR full cell LOAEL dataset
See this summary as it is included in the manuscript in Table 2.
Multiple regression types (RLR = robust linear regression; MLR = multilinear regression; ACM = augmented cell means) with different full datasets were used to build models of the available data for variance estimation. Total variance and MSE are unitless, whereas RMSE is in log10(mg/kg/day) units just like the dataset. % exp = percent total variance explained. N = number of study records in the dataset.
# data.tables used for input datasets for RLR, MLR, and ACM of LEL values
data_list <- list(dat.mlr.full,
dat.mlr.full,
dat.mlr.lel.leverage,
dat.mlr.lel.cooks.plot,
dat.mlr.lel.cooksd,
dat.mlr.lel.both,
dat.cell.use,
dat.cell.use)
# columns used
cols_list <- quos(lel)
# functions applied
round_var <- function(x) round(var(x$lel),3) # this function calculates the variance in lel for each dataset
my_n <- function(dataset) unlist(count(dataset)) # this function calculates the n
dat.res <- data_list %>% {
tibble(
var = map_dbl(., round_var),
n = map_dbl(., my_n)
)
}
# MSE of LEL models
model_list <- list(rr.lel,
mlr.lel,
mlr.lel.leverage,
mlr.lel.cooks.plot,
mlr.lel.cooksd,
mlr.lel.both,
acm.lel,
mlr.acm.lel)
# function to calculate MSE (sum(residuals^2)/error degrees of freedom)
# robust regression (rr) residual calc defaults to mlr.lel model for df.residual because df.residual is NA for robust regression
mse_func <- function(model) {
if(is.na(model$df.residual)){model$df.residual <- mlr.lel$df.residual}
round(glance(model)$deviance / model$df.residual, 3)
}
# apply mse and rmse functions to the model_list
mod.res <- model_list %>% {
tibble(mse = map_dbl(., mse_func)) } %>%
mutate(rmse = round(sqrt(mse), 3))
lel.model.results <- bind_cols(dat.res, mod.res)
# make a table of the values to summarize the models
data.lel <- c('full dataset',
'full dataset',
'high leverage points removed',
'high Cooks distance plot points removed',
'high Cooks distance points removed',
'all potential outliers removed',
'full cell dataset',
'full cell dataset')
regression.type <-c('RR',
'MLR',
'MLR',
'MLR',
'MLR',
'MLR',
'ACM',
'MLR')
lel.table <- data.table(regression.type,
data.lel,
lel.model.results)
setnames(lel.table,
c('regression.type','data.lel', 'var', 'mse', 'rmse', 'n'),
c('Regression Type', 'Data', 'Total Variance', 'MSE','RMSE', 'N'))
setcolorder(lel.table, c('Regression Type', 'Data', 'Total Variance', 'MSE', 'RMSE', 'N'))
lel.table[,`% exp.` := round(((`Total Variance` - MSE)/`Total Variance`)*100,1)]
kable(lel.table) %>% kable_styling()
Regression Type | Data | Total Variance | MSE | RMSE | N | % exp. |
---|---|---|---|---|---|---|
RR | full dataset | 0.916 | 0.359 | 0.599 | 2724 | 60.8 |
MLR | full dataset | 0.916 | 0.347 | 0.589 | 2724 | 62.1 |
MLR | high leverage points removed | 0.908 | 0.339 | 0.582 | 2708 | 62.7 |
MLR | high Cooks distance plot points removed | 0.911 | 0.339 | 0.582 | 2720 | 62.8 |
MLR | high Cooks distance points removed | 0.842 | 0.261 | 0.511 | 2592 | 69.0 |
MLR | all potential outliers removed | 0.838 | 0.261 | 0.511 | 2580 | 68.9 |
ACM | full cell dataset | 0.858 | 0.320 | 0.566 | 278 | 62.7 |
MLR | full cell dataset | 0.858 | 0.387 | 0.622 | 278 | 54.9 |
Multiple regression types (RLR = robust linear regression; MLR = multilinear regression; ACM = augmented cell means) with different full datasets were used to build models of the available data for variance estimation. Total variance and MSE are unitless, whereas RMSE is in log10(mg/kg/day) units just like the dataset. % exp = percent total variance explained. N = number of study records in the dataset.
# data.tables used for input datasets for RLR, MLR, and ACM of LEL values
data_list <- list(dat.mlr.full,
dat.mlr.full,
dat.mlr.loael.leverage,
dat.mlr.loael.cooks.plot,
dat.mlr.loael.cooksd,
dat.mlr.loael.both,
dat.cell.use,
dat.cell.use)
# columns used
cols_list <- quos(loael)
# functions applied
round_var_loael <- function(x) round(var(x$loael),3) # this function calculates the variance in lel for each dataset
my_n_loael <- function(dataset) unlist(count(dataset)) # this function calculates the n
dat.res.loael <- data_list %>% {
tibble(
var = map_dbl(., round_var_loael),
n = map_dbl(., my_n_loael)
)
}
# MSE of LOAEL models
model_list_loael <- list(rr.loael,
mlr.loael,
mlr.loael.leverage,
mlr.loael.cooks.plot,
mlr.loael.cooksd,
mlr.loael.both,
acm.loael,
mlr.acm.loael)
# function to calculate MSE (sum(residuals^2)/error degrees of freedom)
# robust regression (rr) residual calc defaults to mlr.loael model for df.residual because df.residual is NA for robust regression
mse_func_loael <- function(model) {
if(is.na(model$df.residual)){model$df.residual <- mlr.loael$df.residual}
round(glance(model)$deviance / model$df.residual, 3)
}
# apply mse and rmse functions to the model_list
mod.res.loael <- model_list_loael %>% {
tibble(mse = map_dbl(., mse_func_loael)) } %>%
mutate(rmse = round(sqrt(mse), 3))
loael.model.results <- bind_cols(dat.res.loael, mod.res.loael)
# make a table of the values to summarize the models
data.loael <- c('full dataset',
'full dataset',
'high leverage points removed',
'high Cooks distance plot points removed',
'high Cooks distance points removed',
'all potential outliers removed',
'full cell dataset',
'full cell dataset')
regression.type <-c('RR',
'MLR',
'MLR',
'MLR',
'MLR',
'MLR',
'ACM',
'MLR')
loael.table <- data.table(regression.type,
data.loael,
loael.model.results)
setnames(loael.table,
c('regression.type','data.loael', 'var', 'mse', 'rmse', 'n'),
c('Regression Type', 'Data', 'Total Variance', 'MSE','RMSE', 'N'))
setcolorder(loael.table, c('Regression Type', 'Data', 'Total Variance', 'MSE', 'RMSE', 'N'))
loael.table[,`% exp.` := round(((`Total Variance` - MSE)/`Total Variance`)*100,1)]
kable(loael.table) %>% kable_styling()
Regression Type | Data | Total Variance | MSE | RMSE | N | % exp. |
---|---|---|---|---|---|---|
RR | full dataset | 0.792 | 0.266 | 0.516 | 2724 | 66.4 |
MLR | full dataset | 0.792 | 0.258 | 0.508 | 2724 | 67.4 |
MLR | high leverage points removed | 0.781 | 0.250 | 0.500 | 2709 | 68.0 |
MLR | high Cooks distance plot points removed | 0.788 | 0.254 | 0.504 | 2721 | 67.8 |
MLR | high Cooks distance points removed | 0.748 | 0.200 | 0.447 | 2614 | 73.3 |
MLR | all potential outliers removed | 0.744 | 0.200 | 0.447 | 2603 | 73.1 |
ACM | full cell dataset | 0.750 | 0.254 | 0.504 | 278 | 66.1 |
MLR | full cell dataset | 0.750 | 0.312 | 0.559 | 278 | 58.4 |
MLR was used to build models using data subset by the study type (SUB = subchronic; CHR = chronic; DEV = developmental) for variance estimation.
The MLR models for subchronic (SUB), chronic (CHR), and developmental (DEV) studies require that the full dataset be partitioned by study type, as shown in the code here.
#load('dat_toxref_2_0_4model.RData')
load('toxrefdb_data_4models_acm_mlr.RData')
mlr.dev <- dat.mlr.full[ study_type=="DEV"] %>%
.[ , st_ct:=.N, by=chemical_id] %>%
.[ st_ct>1 ]
mlr.chr <- dat.mlr.full[ study_type=="CHR"] %>%
.[ , st_ct:=.N, by=chemical_id] %>%
.[ st_ct > 1 ]
mlr.sub <- dat.mlr.full[ study_type=="SUB"] %>%
.[ , st_ct:=.N, by=chemical_id] %>%
.[ st_ct > 1 ]
See table below of the size of the MLR data subsets.
MLR data subset | Parameter | N |
---|---|---|
SUB | Total Number of Chemicals | 281 |
SUB | Total Number of Studies | 705 |
CHR | Total Number of Chemicals | 429 |
CHR | Total Number of Studies | 1149 |
DEV | Total Number of Chemicals | 121 |
DEV | Total Number of Studies | 275 |
length(unique(mlr.sub$chem))
length(unique(mlr.sub$study_id))
length(unique(mlr.chr$chem))
length(unique(mlr.chr$study_id))
length(unique(mlr.dev$chem))
length(unique(mlr.dev$study_id))
ANOVA for the model is shown.
# investigating the covariates available for the MLR DEV LEL model
#table(mlr.dev$study_source) # the study source is highly skewed (only 4/275 records come from second level)
#table(mlr.dev$admin_method) # admin method is fairly uninformative with the overwhelming majority from gavage
#table(mlr.dev$strain_group)
#table(mlr.dev$dose_spacing_center)
#table(mlr.dev$dose_no)
#table(mlr.dev$all_sex) # all female
# have to remove study type because there are no longer 2 levels
# have to remove sex because there are no longer 2 levels (all female adults)
mlr.dev.lel <- lm(lel ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center,
#+ factor(study_source), #only 2, ntp (4 records) and opp_der (271 records)
#+ factor(all_sex),
data = mlr.dev)
anova(mlr.dev.lel)
ANOVA for the model is shown.
mlr.dev.loael <- lm(loael ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center,
#+ factor(study_source), #only 2, ntp and opp_der
#+ factor(all_sex),
data = mlr.dev)
anova(mlr.dev.loael)
ANOVA for the model is shown.
#table(mlr.chr$study_source) # very few open_lit and then many opp_der and ntp
#table(mlr.chr$admin_method)
#table(mlr.chr$strain_group)
#table(mlr.chr$dose_spacing_center)
#table(mlr.chr$dose_no)
#table(mlr.chr$all_sex) # sex seems to be uninformative here as well
# have to remove study type because there are no longer 2 levels
# have to remove sex because there are no longer 2 levels (all female adults)
mlr.chr.lel <- lm(lel ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source), #only 2, ntp and opp_der
#+ factor(all_sex), # sex is not really informative because so many are MF
data = mlr.chr)
anova(mlr.chr.lel)
ANOVA for the model is shown.
mlr.chr.loael <- lm(loael ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source), #only 2, ntp and opp_der
#+ factor(all_sex),
data = mlr.chr)
anova(mlr.chr.loael)
ANOVA for the model is shown.
#table(mlr.sub$study_source) # though there are three options, very few open_lit and then many opp_der and ntp
#table(mlr.sub$admin_method)
#table(mlr.sub$strain_group)
#table(mlr.sub$dose_spacing_center)
#table(mlr.sub$dose_no)
#table(mlr.sub$all_sex) # so many MF that this is not informative
mlr.sub.lel <- lm(lel ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source),
#+ factor(all_sex), # sex is not really informative because so many are MF
data = mlr.sub)
anova(mlr.sub.lel)
ANOVA for the model is shown.
mlr.sub.loael <- lm(loael ~ 0
+ factor(chem)
+ factor(strain_group)
+ factor(admin_method)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center
+ factor(study_source), #only 2, ntp and opp_der
#+ factor(all_sex),
data = mlr.sub)
anova(mlr.sub.loael)
ACM was used to build models using data subset by the study type (SUB = subchronic; CHR = chronic; DEV = developmental) for variance estimation.
The ACM models for subchronic (SUB), chronic (CHR), and developmental (DEV) studies require that the full cell dataset be partitioned by study type, as shown in the code here. This limits the size of the datasets considerably for ACM modeling.
#length(unique(dat.cell.use[study_type=='DEV']$chemfact)) #28
#length(unique(dat.cell.use[study_type=='CHR']$chemfact)) #56
#length(unique(dat.cell.use[study_type=='SUB']$chemfact)) #43
#length(unique(dat.cell.use$chemfact)) #133
acm.dev <- dat.cell.use[ study_type=="DEV"] %>%
.[ , st_ct:=.N, by=chemical_id] %>%
.[ st_ct>1 ]
acm.chr <- dat.cell.use[ study_type=="CHR"] %>%
.[ , st_ct:=.N, by=chemical_id] %>%
.[ st_ct > 1 ]
acm.sub <- dat.cell.use[ study_type=="SUB"] %>%
.[ , st_ct:=.N, by=chemical_id] %>%
.[ st_ct > 1 ]
See table below of the size of the ACM data subsets.
ACM data subset | Parameter | N |
---|---|---|
SUB | Number of cells | 42 |
SUB | Number of chemicals | 40 |
SUB | Number of study records | 92 |
CHR | Number of cells | 56 |
CHR | Number of chemicals | 45 |
CHR | Number of study records | 117 |
DEV | Number of cells | 27 |
DEV | Number of chemicals | 24 |
DEV | Number of study records | 54 |
length(unique(acm.sub$chemfact))
length(unique(acm.sub$chemical_id))
length(unique(acm.sub$study_id))
length(unique(acm.chr$chemfact))
length(unique(acm.chr$chemical_id))
length(unique(acm.chr$study_id))
length(unique(acm.dev$chemfact))
length(unique(acm.dev$chemical_id))
length(unique(acm.dev$study_id))
ANOVA for the model is shown.
acm.dev.lel <- lm(lel ~ 0
+ factor(chemfact)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center,
data = acm.dev)
anova(acm.dev.lel)
ANOVA for the model is shown.
acm.dev.loael <- lm(loael ~ 0
+ factor(chemfact)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center,
data = acm.dev)
anova(acm.dev.loael)
ANOVA for the model is shown.
acm.chr.lel <- lm(lel ~ 0
+ factor(chemfact)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center,
data = acm.chr)
anova(acm.chr.lel)
ANOVA for the model is shown.
acm.chr.loael <- lm(loael ~ 0
+ factor(chemfact)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center,
data = acm.chr)
anova(acm.chr.loael)
ANOVA for the model is shown.
acm.sub.lel <- lm(lel ~ 0
+ factor(chemfact)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center,
data = acm.sub)
anova(acm.sub.lel)
ANOVA for the model is shown.
acm.sub.loael <- lm(loael ~ 0
+ factor(chemfact)
+ dose_spacing_center
+ dose_no
+ study_year_center
+ sub_purity_center,
data = acm.sub)
anova(acm.sub.loael)
See this summary as it is included in the manuscript as Table 4.
Two regression types (MLR = multilinear regression, ACM = augmented cell means) were used to build models using data subset by the study type (SUB = subchronic; CHR = chronic; DEV = developmental) for variance estimation. Total variance and MSE are unitless, whereas RMSE is in log10(mg/kg/day) units just like the dataset. % exp = percent total variance explained. N = number of study records in the dataset.
# data.tables used for input datasets for MLR and ACM study type models
sub_list <- list(mlr.sub,
acm.sub,
mlr.chr,
acm.chr,
mlr.dev,
acm.dev)
# columns used
cols_list <- quos(lel)
# functions applied
round_var_lel <- function(x) round(var(x$lel),3) # this function calculates the variance in loael for each dataset
my_n_lel <- function(dataset) unlist(count(dataset)) # this function calculates the n
sub.res.lel <- sub_list %>% {
tibble(
var = map_dbl(., round_var_lel),
n = map_dbl(., my_n_lel)
)
}
# MSE of LOAEL models
sub_model_list_lel <- list(mlr.sub.lel,
acm.sub.lel,
mlr.chr.lel,
acm.chr.lel,
mlr.dev.lel,
acm.dev.lel)
# function to calculate MSE (sum(residuals^2)/error degrees of freedom)
sub_mse_func_lel <- function(model) {
round(glance(model)$deviance / model$df.residual, 3)
}
# apply mse and rmse functions to the model_list
sub.mod.res.lel <- sub_model_list_lel %>% {
tibble(mse = map_dbl(., sub_mse_func_lel)) } %>%
mutate(rmse = round(sqrt(mse), 3))
sub.lel.model.results <- bind_cols(sub.res.lel, sub.mod.res.lel)
datasets <- c('SUB','SUB','CHR','CHR','DEV','DEV')
regression.type <-c('MLR','ACM','MLR','ACM','MLR','ACM')
sub.lel.table <- data.table(regression.type,
datasets,
sub.lel.model.results)
setnames(sub.lel.table,
c('regression.type','datasets', 'var', 'mse', 'rmse', 'n'),
c('Regression Type', 'Data', 'Total Variance', 'MSE','RMSE', 'N'))
setcolorder(sub.lel.table, c('Regression Type', 'Data', 'Total Variance', 'MSE', 'RMSE', 'N'))
sub.lel.table[,`% exp.` := round(((`Total Variance` - MSE)/`Total Variance`)*100,1)]
kable(sub.lel.table) %>% kable_styling()
Regression Type | Data | Total Variance | MSE | RMSE | N | % exp. |
---|---|---|---|---|---|---|
MLR | SUB | 0.879 | 0.350 | 0.592 | 705 | 60.2 |
ACM | SUB | 1.013 | 0.301 | 0.549 | 92 | 70.3 |
MLR | CHR | 0.952 | 0.352 | 0.593 | 1149 | 63.0 |
ACM | CHR | 0.887 | 0.395 | 0.628 | 117 | 55.5 |
MLR | DEV | 0.604 | 0.246 | 0.496 | 275 | 59.3 |
ACM | DEV | 0.410 | 0.328 | 0.573 | 54 | 20.0 |
Two regression types (MLR = multilinear regression, ACM = augmented cell means) were used to build models using data subset by the study type (SUB = subchronic; CHR = chronic; DEV = developmental) for variance estimation. Total variance and MSE are unitless, whereas RMSE is in log10(mg/kg/day) units just like the dataset. % exp = percent total variance explained. N = number of study records in the dataset.
# data.tables used for input datasets for MLR and ACM study type models
sub_list <- list(mlr.sub,
acm.sub,
mlr.chr,
acm.chr,
mlr.dev,
acm.dev)
# columns used
cols_list <- quos(loael)
# functions applied
round_var_loael <- function(x) round(var(x$loael),3) # this function calculates the variance in loael for each dataset
my_n_loael <- function(dataset) unlist(count(dataset)) # this function calculates the n
sub.res.loael <- sub_list %>% {
tibble(
var = map_dbl(., round_var_loael),
n = map_dbl(., my_n_loael)
)
}
# MSE of LOAEL models
sub_model_list_loael <- list(mlr.sub.loael,
acm.sub.loael,
mlr.chr.loael,
acm.chr.loael,
mlr.dev.loael,
acm.dev.loael)
# function to calculate MSE (sum(residuals^2)/error degrees of freedom)
sub_mse_func_loael <- function(model) {
round(glance(model)$deviance / model$df.residual, 3)
}
# apply mse and rmse functions to the model_list
sub.mod.res.loael <- sub_model_list_loael %>% {
tibble(mse = map_dbl(., sub_mse_func_loael)) } %>%
mutate(rmse = round(sqrt(mse), 3))
sub.loael.model.results <- bind_cols(sub.res.loael, sub.mod.res.loael)
datasets <- c('SUB','SUB','CHR','CHR','DEV','DEV')
regression.type <-c('MLR','ACM','MLR','ACM','MLR','ACM')
sub.loael.table <- data.table(regression.type,
datasets,
sub.loael.model.results)
setnames(sub.loael.table,
c('regression.type','datasets', 'var', 'mse', 'rmse', 'n'),
c('Regression Type', 'Data', 'Total Variance', 'MSE','RMSE', 'N'))
setcolorder(sub.loael.table, c('Regression Type', 'Data', 'Total Variance', 'MSE', 'RMSE', 'N'))
sub.loael.table[,`% exp.` := round(((`Total Variance` - MSE)/`Total Variance`)*100,1)]
kable(sub.loael.table) %>% kable_styling()
Regression Type | Data | Total Variance | MSE | RMSE | N | % exp. |
---|---|---|---|---|---|---|
MLR | SUB | 0.782 | 0.277 | 0.526 | 705 | 64.6 |
ACM | SUB | 0.904 | 0.250 | 0.500 | 92 | 72.3 |
MLR | CHR | 0.795 | 0.252 | 0.502 | 1149 | 68.3 |
ACM | CHR | 0.825 | 0.265 | 0.515 | 117 | 67.9 |
MLR | DEV | 0.594 | 0.217 | 0.466 | 275 | 63.5 |
ACM | DEV | 0.398 | 0.316 | 0.562 | 54 | 20.6 |
See the manuscript where this is included as Figure 7. The minimum prediction interval for this dataset cannot be smaller than the 2.5 to 97.5 percentiles of the residuals on the model of these data.
The 2.5 and 97.5 percentiles are shown:
# get the cumulative distribution for the main model residuals
mlr.ecd <- ecdf(residuals(mlr.lel))
quantile(mlr.ecd, probs=c(0.025, 0.975))
## 2.5% 97.5%
## -1.098148 1.002591
#range(residuals(mlr.lel)) #-2.462775 2.419666
#sd(residuals(mlr.lel)) #0.519
#mean(residuals(mlr.lel))
#2*(sd(residuals(mlr.lel))) #1.0384 #thus we find our residuals are slightly long-tailed
In this code section, the cumulative distribution of residuals for a normal data set is calculated.
# get the cumulative distribution for a normal data set
set.seed(10000)
normal.numbers = rnorm(2724, mean=0, sd=0.519)
#range(normal.numbers)
normal.ecdf = ecdf(normal.numbers)
Then, a plot is used to compare the ECDF of the MLR LEL model residuals and a normal data set. See Figure 6 in the manuscript.
model.residuals <- as.data.table(residuals(mlr.lel))
model.residuals <- model.residuals[order(V1)]
norm.residuals <- as.data.table(normal.numbers)
norm.residuals <- norm.residuals[order(normal.numbers)]
par(mfrow=c(1,2))
plot(mlr.ecd,
main=NULL,
xlab="Residual", ylab = "Cumulative Proportion", cex.axis=1.25, cex.lab=1.25, cex.main=2.0,
xlim=c(-3,3), xaxs='i',
do.points=FALSE,
verticals=TRUE)
plot(normal.ecdf,
do.points=FALSE,
add=TRUE,
col='blue', verticals=TRUE)
abline(h = 0.025, lty = 3)
abline(h = 0.975, lty = 3)
abline(v = -1.098148, lty =3 )
abline(v = 1.002591, lty=3 )
title("A", adj=0)
plot(norm.residuals$normal.numbers ~ model.residuals$V1,
xlab='Ordered MLR LEL Residuals',
ylab='Ordered Normal Distribution Sample',
xlim=c(-3,3),
cex.axis=1.25, cex.lab=1.25, cex.main=2.0 )
abline(a=0,b=1)
title("B", adj=0)
We also examined the results of a Kolmogorov-Smirnov Test, skewness, and kurtosis to understand how much the MLR LEL model residuals might deviate from a normal distribution.
ks.test(residuals(mlr.lel), normal.numbers) #kolmogorov smirnov test
##
## Two-sample Kolmogorov-Smirnov test
##
## data: residuals(mlr.lel) and normal.numbers
## D = 0.037078, p-value = 0.04728
## alternative hypothesis: two-sided
Skewness and Kurtosis of the MLR LEL model residual distribution Given that the skewness is between -0.5 and 0.5, this suggests the model residual distribution is approximately symmetric. Given that the excess kurtosis is positive but less than 3, the MLR LEL model residual distribution appears leptokurtic (slightly heavy-tailed).
skew.mlr.lel.residuals <- skewness(model.residuals$V1) #-0.253 #If skewness is between -0.5 and 0.5, the distribution is approximately symmetric.
kurt.mlr.lel.residuals <- kurtosis(model.residuals$V1) -3 #1.31; leptokurtic (high peak, heavy tails)
Metric <- c('Skewness', 'Excess Kurtosis')
Value <- c(skew.mlr.lel.residuals, kurt.mlr.lel.residuals)
norm.testing <- as.data.table(cbind(Metric,Value))
kable(norm.testing) %>% kable_styling()
Metric | Value |
---|---|
Skewness | -0.253000573059661 |
Excess Kurtosis | 1.30902771445109 |
The table below is included in the manuscript as Table 5. Different estimates of prediction interval are calculated. The prediction interval width is the upper bound of the interval divided by the lower bound of the interval in fold-mg/kg/day units. For context, the lower and upper bounds of a prediction interval for 1, 10, and 100 mg/kg/day “true” POD values are shown.
UB= upper bound; LB = lower bound; mkd = mg/kg/day
# create table of values for prediction intervals
# first calculate based on ecdf of the MLR regression model residuals
ecdf.dt <- data.table(true = c(1,10,100), pred.lb = c(0,0,0), pred.ub = c(0,0,0))
ecdf.dt[,pred.lb := round(true*(10^(quantile(mlr.ecd,prob=0.025))),2)]
ecdf.dt[,pred.ub := round(true*(10^(quantile(mlr.ecd,prob=0.975))),2)]
ecdf.row <- data.table('ECDF of MLR model residuals', 'LEL', 'full dataset', 'NA',
ecdf.dt[true==1,pred.lb],
ecdf.dt[true==1,pred.ub],
ecdf.dt[true==10,pred.lb],
ecdf.dt[true==10,pred.ub],
ecdf.dt[true==100,pred.lb],
ecdf.dt[true==100,pred.ub])
setnames(ecdf.row, c('V1', 'V2', 'V3','V4','V5','V6','V7','V8','V9','V10'),
c('Regression Type','POD Type','Data','RMSE','1 mkd LB','1 mkd UB','10 mkd LB','10 mkd UB','100 mkd LB', '100 mkd UB'))
lel.table[,"POD Type":= 'LEL']
loael.table[,"POD Type":= 'LOAEL']
sub.lel.table[,"POD Type":= 'LEL']
sub.loael.table[,"POD Type":= 'LOAEL']
total <- rbind(lel.table, loael.table, sub.lel.table, sub.loael.table)
total <- total[order(-RMSE),c('Regression Type', 'POD Type', 'Data', 'RMSE')]
total[,"1 mkd LB" := round(10^(qnorm(0.025)*RMSE),2)]
total[,"1 mkd UB" := round(10^(qnorm(0.975)*RMSE),2)]
total[,"10 mkd LB" := round(10*(10^(qnorm(0.025)*RMSE)),2)]
total[,"10 mkd UB" := round(10*(10^(qnorm(0.975)*RMSE)),2)]
total[,"100 mkd LB" := round(100*(10^(qnorm(0.025)*RMSE)),2)]
total[,"100 mkd UB" := round(100*(10^(qnorm(0.975)*RMSE)),2)]
total2 <- rbind(ecdf.row, total)
total2[,'Minimum Prediction Interval Width' := round(`1 mkd UB`/`1 mkd LB`,0)]
setcolorder(total2, c(1:4,11,5:10))
kable(total2) %>% kable_styling()
Regression Type | POD Type | Data | RMSE | Minimum Prediction Interval Width | 1 mkd LB | 1 mkd UB | 10 mkd LB | 10 mkd UB | 100 mkd LB | 100 mkd UB |
---|---|---|---|---|---|---|---|---|---|---|
ECDF of MLR model residuals | LEL | full dataset | NA | 126 | 0.08 | 10.06 | 0.80 | 100.60 | 7.98 | 1005.98 |
ACM | LEL | CHR | 0.628 | 284 | 0.06 | 17.02 | 0.59 | 170.16 | 5.88 | 1701.60 |
MLR | LEL | full cell dataset | 0.622 | 276 | 0.06 | 16.56 | 0.60 | 165.61 | 6.04 | 1656.14 |
RR | LEL | full dataset | 0.599 | 213 | 0.07 | 14.93 | 0.67 | 149.29 | 6.70 | 1492.86 |
MLR | LEL | CHR | 0.593 | 208 | 0.07 | 14.53 | 0.69 | 145.30 | 6.88 | 1452.98 |
MLR | LEL | SUB | 0.592 | 207 | 0.07 | 14.46 | 0.69 | 144.64 | 6.91 | 1446.43 |
MLR | LEL | full dataset | 0.589 | 204 | 0.07 | 14.27 | 0.70 | 142.70 | 7.01 | 1426.98 |
MLR | LEL | high leverage points removed | 0.582 | 198 | 0.07 | 13.83 | 0.72 | 138.26 | 7.23 | 1382.61 |
MLR | LEL | high Cooks distance plot points removed | 0.582 | 198 | 0.07 | 13.83 | 0.72 | 138.26 | 7.23 | 1382.61 |
ACM | LEL | DEV | 0.573 | 166 | 0.08 | 13.28 | 0.75 | 132.76 | 7.53 | 1327.58 |
ACM | LEL | full cell dataset | 0.566 | 161 | 0.08 | 12.86 | 0.78 | 128.63 | 7.77 | 1286.29 |
ACM | LOAEL | DEV | 0.562 | 158 | 0.08 | 12.63 | 0.79 | 126.33 | 7.92 | 1263.28 |
MLR | LOAEL | full cell dataset | 0.559 | 156 | 0.08 | 12.46 | 0.80 | 124.63 | 8.02 | 1246.29 |
ACM | LEL | SUB | 0.549 | 149 | 0.08 | 11.91 | 0.84 | 119.13 | 8.39 | 1191.30 |
MLR | LOAEL | SUB | 0.526 | 119 | 0.09 | 10.74 | 0.93 | 107.38 | 9.31 | 1073.84 |
RR | LOAEL | full dataset | 0.516 | 103 | 0.10 | 10.26 | 0.97 | 102.65 | 9.74 | 1026.46 |
ACM | LOAEL | CHR | 0.515 | 102 | 0.10 | 10.22 | 0.98 | 102.18 | 9.79 | 1021.84 |
MLR | LEL | high Cooks distance points removed | 0.511 | 100 | 0.10 | 10.04 | 1.00 | 100.36 | 9.96 | 1003.56 |
MLR | LEL | all potential outliers removed | 0.511 | 100 | 0.10 | 10.04 | 1.00 | 100.36 | 9.96 | 1003.56 |
MLR | LOAEL | full dataset | 0.508 | 99 | 0.10 | 9.90 | 1.01 | 99.01 | 10.10 | 990.06 |
MLR | LOAEL | high Cooks distance plot points removed | 0.504 | 97 | 0.10 | 9.72 | 1.03 | 97.23 | 10.28 | 972.35 |
ACM | LOAEL | full cell dataset | 0.504 | 97 | 0.10 | 9.72 | 1.03 | 97.23 | 10.28 | 972.35 |
MLR | LOAEL | CHR | 0.502 | 96 | 0.10 | 9.64 | 1.04 | 96.36 | 10.38 | 963.61 |
MLR | LOAEL | high leverage points removed | 0.5 | 96 | 0.10 | 9.55 | 1.05 | 95.50 | 10.47 | 954.95 |
ACM | LOAEL | SUB | 0.5 | 96 | 0.10 | 9.55 | 1.05 | 95.50 | 10.47 | 954.95 |
MLR | LEL | DEV | 0.496 | 85 | 0.11 | 9.38 | 1.07 | 93.79 | 10.66 | 937.87 |
MLR | LOAEL | DEV | 0.466 | 68 | 0.12 | 8.19 | 1.22 | 81.91 | 12.21 | 819.11 |
MLR | LOAEL | high Cooks distance points removed | 0.447 | 58 | 0.13 | 7.52 | 1.33 | 75.18 | 13.30 | 751.80 |
MLR | LOAEL | all potential outliers removed | 0.447 | 58 | 0.13 | 7.52 | 1.33 | 75.18 | 13.30 | 751.80 |
total2.long <- melt.data.table(data=total2,
id.vars=c('Regression Type','POD Type','Data','RMSE','Minimum Prediction Interval Width'),
measure.vars = c('1 mkd LB', '1 mkd UB','10 mkd LB','10 mkd UB','100 mkd LB', '100 mkd UB'),
variable.name = 'Bounds',
value.name = 'Prediction')
total2.long[,model := paste0(`Regression Type`,'_',`POD Type`,'_',Data)]
total2.long[`Bounds` %in% c('1 mkd LB','1 mkd UB'), truth := 1]
total2.long[`Bounds` %in% c('10 mkd LB','10 mkd UB'), truth := 10]
total2.long[`Bounds` %in% c('100 mkd LB','100 mkd UB'), truth := 100]
total2.long[`Bounds` %in% c('1 mkd LB', '10 mkd LB', '100 mkd LB'), interval := 'lower']
total2.long[`Bounds` %in% c('1 mkd UB', '10 mkd UB', '100 mkd UB'), interval := 'upper']
total3 <- dcast.data.table(data=total2.long, `Regression Type`+ `POD Type`+ `Data` + RMSE + model + truth ~ interval,
value.var = 'Prediction')
total4 <- total3[order(upper)]
ggplot(total4[truth==10]) +
geom_errorbar(aes(x=reorder(model,upper),ymin=lower, ymax=upper, color=Data, lty = `POD Type`), width=0.3) +
labs(y="Prediction bounds, mg/kg/day",
x="Model",
title="Truth = 10 mg/kg/day"
) +
#facet_wrap( ~ truth, scales='free_y', shrink=TRUE) +
scale_x_discrete(label= function(x) strtrim(x,12))+
#scale_y_continuous(breaks = seq(0, 20, 2.5)) +
scale_y_log10()+
annotation_logticks(side="l", base=10)+
scale_color_manual(breaks=c("CHR","DEV","SUB","full cell dataset","full dataset","all potential outliers removed",
"high Cooks distance plot points removed","high Cooks distance points removed","high leverage points removed"), values=c("#999999", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000"),
labels=c("CHR","DEV","SUB","full cell dataset","full dataset","all potential outliers removed",
"high Cooks distance plot points removed","high Cooks distance points removed","high leverage points removed"))+
theme_classic() +
theme(plot.title=element_text(hjust=0.5),
panel.border=element_rect(colour="black",size=1, fill=NA),
panel.grid.minor.y = element_line(size = (0.2), colour="grey"),
panel.grid.major.y = element_line(size = (0.2), colour="grey"),
axis.ticks.x=element_blank(),
legend.position="left",
legend.title = element_blank(),
#legend.key.size = unit(0.5, "in"),
legend.text = element_text(size=7),
#panel.border=element_blank(),
axis.text.x=element_text(angle = 90, size=9, vjust =1, hjust=1),
axis.text.y=element_text(size=12),
axis.title.y=element_text(size=13))
The LEL values are presented by study type in the ACM subsets (SUB, CHR, and DEV) are presented. The dot represents the LEL mean and the line shows the range from minimum to maximum for each cell of the respective ACM models. Species is denoted by color, with gray = dog, green = mouse, black = rabbit, and blue = rat. One line represents one cell, but there may be more than one cell for each chemical. These are ordered by the mean LEL and the order of cells is not the same across the SUB, CHR, and DEV sections.
dat.acm.comparison <- dat.cell.use[ study_type %in% c("CHR", "DEV", "SUB")] %>%
.[ , list(chem, chemfact, study_id, study_type, species, lel_min, lel_max, lel_mean)] %>%
unique() %>%
.[ , chem_ct:=.N, by=chem] %>%
.[ , mean_studytype:=mean(lel_mean), by=study_type]
dat.acm.comparison <- unique(dat.acm.comparison[,c('chemfact', 'study_type', 'species', 'lel_min',
'lel_max', 'lel_mean')])
dat.acm.comparison <- dat.acm.comparison[order(study_type, lel_mean), ] #sort
dat.acm.comparison$chemfact <- factor(dat.acm.comparison$chemfact,
levels = dat.acm.comparison$chemfact[order(dat.acm.comparison$lel_mean)])
dat.acm.comparison$study_type <- factor(dat.acm.comparison$study_type,
levels=c('SUB','CHR','DEV'))
ggplot(dat.acm.comparison) +
geom_point(aes(y=lel_mean, x=chemfact, color = species)) +
geom_errorbar(data=dat.acm.comparison , aes(x=chemfact,ymin=lel_min, ymax=lel_max, color=species), width=0.3) +
labs(y="LEL log10(mg/kg/day)",
x="Cell"
#title="HED LEL Range by Study Type and Cell"
) +
facet_grid( ~ study_type, scales='free_x') +
scale_y_continuous(breaks = seq(-4, 4, .5)) +
scale_color_manual(breaks=c('dog', 'mouse', 'rabbit', 'rat'), values=c("#999999",
"#009E73",
"#000000",
"#56B4E9"),
labels=c('dog', 'mouse', 'rabbit', 'rat'))+
theme_classic() +
theme(
#plot.title = element_text(hjust=0.5, face="bold", size = 17),
panel.border=element_rect(colour="black",size=1, fill=NA),
panel.grid.minor.y = element_line(size = (0.2), colour="grey"),
panel.grid.major.y = element_line(size = (0.2), colour="grey"),
axis.ticks.x=element_blank(),
legend.position="bottom",
legend.title = element_text(size=15, face="bold"),
legend.key.size = unit(0.5, "in"),
legend.text = element_text(size=15),
#panel.border=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_text(size=12),
axis.title.y=element_text(size=13))
To evaluate how different subchronic and chronic values might be from each other.
Using the full MLR LEL dataset, plot median SUB LOAEL values vs. median CHR LOAEL values by chemical for chemicals with more than one study each of CHR and SUB.
st <- c("CHR", "SUB")
dat.comp2 <- dat.mlr.full[,list(chem,chemical_id,study_type,species,lel,loael)] %>%
.[study_type %in% st] %>%
.[, chnm := paste(unlist(strsplit(chem, split='||',fixed=TRUE))[[2]]), by=chemical_id] %>%
.[, chem_st_ct:=.N, by=list(chemical_id, study_type)] %>% .[ chem_st_ct > 1] %>%
.[, med.loael:=median(loael), by=list(chemical_id, study_type, species)]
s.c.ratio <- dcast.data.table(dat.comp2,
chnm + chem + chemical_id ~ study_type + species,
value.var = 'med.loael',
fun.aggregate = median)
s.c.ratio[,dog.s.c := ifelse(!is.na(SUB_dog|!is.na(CHR_dog)), SUB_dog - CHR_dog, NA)]
s.c.ratio[,mouse.s.c := ifelse(!is.na(SUB_mouse|!is.na(CHR_mouse)), SUB_mouse - CHR_mouse, NA)]
s.c.ratio[,rat.s.c := ifelse(!is.na(SUB_rat|!is.na(CHR_rat)), SUB_rat - CHR_rat, NA)]
#nrow(s.c.ratio[!is.na(mouse.s.c)]) #141
#nrow(s.c.ratio[!is.na(rat.s.c)]) #204
#nrow(s.c.ratio[!is.na(dog.s.c)]) #119
mouse <- ggplot(data=s.c.ratio[!is.na(mouse.s.c)], aes(y=CHR_mouse, x=SUB_mouse)) +
geom_point()+
stat_smooth(method='lm', col='red')+
labs(y="Median CHR LOAEL log10(mg/kg/day)",
x="Median SUB LOAEL log10(mg/kg/day)",
title = "mouse"
) +
theme_bw()+
theme(
axis.text.x=element_text(size=12),
axis.text.y=element_text(size=12),
axis.title.x=element_text(size=12),
axis.title.y=element_text(size=12))+
coord_cartesian(xlim=c(-1,4), ylim=c(-1.5,3.5))+
annotate("text", x=3, y = -1, label="N=141", size=6)+
annotate("text", x=-0.75, y=3.35, label = "B",size=7)
rat <- ggplot(data=s.c.ratio[!is.na(rat.s.c)], aes(y=CHR_rat, x=SUB_rat)) +
geom_point()+
stat_smooth(method='lm', col='red')+
labs(y="Median CHR LOAEL log10(mg/kg/day)",
x="Median SUB LOAEL log10(mg/kg/day)",
title = "rat"
) +
theme_bw()+
theme(
axis.text.x=element_text(size=12),
axis.text.y=element_text(size=12),
axis.title.x=element_text(size=12),
axis.title.y=element_text(size=12))+
coord_cartesian(xlim=c(-1,4), ylim=c(-1.5,3.5))+
annotate("text", x=3, y = -1, label="N=204", size=6)+
annotate("text", x=-0.75, y=3.35, label = "C",size=7)
dog <- ggplot(data=s.c.ratio[!is.na(dog.s.c)], aes(y=CHR_dog, x=SUB_dog)) +
geom_point()+
stat_smooth(method='lm', col='red')+
labs(y="Median CHR LOAEL log10(mg/kg/day)",
x="Median SUB LOAEL log10(mg/kg/day)",
title = "dog"
) +
theme_bw()+
theme(
axis.text.x=element_text(size=12),
axis.text.y=element_text(size=12),
axis.title.x=element_text(size=12),
axis.title.y=element_text(size=12))+
coord_cartesian(xlim=c(-1,4), ylim=c(-1.5,3.5))+
annotate("text", x=3, y = -1, label="N=119", size=6)+
annotate("text", x=-0.75, y=3.35, label = "A",size=7)
grid.arrange(arrangeGrob(dog, mouse, rat, ncol=3,nrow=1))
The linear relationships between subchronic and chronic median LOAEL values by species vary slightly.
fit.dog <- lm(CHR_dog ~ SUB_dog, data=s.c.ratio[!is.na(dog.s.c)])
summary(fit.dog)
##
## Call:
## lm(formula = CHR_dog ~ SUB_dog, data = s.c.ratio[!is.na(dog.s.c)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2767 -0.3153 0.0521 0.2726 2.8680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.34151 0.10511 3.249 0.00151 **
## SUB_dog 0.66678 0.06188 10.776 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.541 on 117 degrees of freedom
## Multiple R-squared: 0.4981, Adjusted R-squared: 0.4938
## F-statistic: 116.1 on 1 and 117 DF, p-value: < 2.2e-16
fit.mouse <- lm(CHR_mouse ~ SUB_mouse, data=s.c.ratio[!is.na(mouse.s.c)])
summary(fit.mouse)
##
## Call:
## lm(formula = CHR_mouse ~ SUB_mouse, data = s.c.ratio[!is.na(mouse.s.c)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.49990 -0.31189 0.06877 0.36692 1.13837
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.42903 0.14414 2.977 0.00344 **
## SUB_mouse 0.63277 0.05947 10.639 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5954 on 139 degrees of freedom
## Multiple R-squared: 0.4488, Adjusted R-squared: 0.4449
## F-statistic: 113.2 on 1 and 139 DF, p-value: < 2.2e-16
fit.rat <- lm(CHR_rat ~ SUB_rat, data=s.c.ratio[!is.na(rat.s.c)])
summary(fit.rat)
##
## Call:
## lm(formula = CHR_rat ~ SUB_rat, data = s.c.ratio[!is.na(rat.s.c)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.58175 -0.28735 0.01142 0.30918 1.55837
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.14060 0.09003 1.562 0.12
## SUB_rat 0.74240 0.04468 16.614 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5204 on 202 degrees of freedom
## Multiple R-squared: 0.5774, Adjusted R-squared: 0.5753
## F-statistic: 276 on 1 and 202 DF, p-value: < 2.2e-16