library(dplyr) library(mice) library(misl) # Note, the code for misl is available on Github: https://github.com/carpenitoThomas/misl library(sl3) library(abind) library(tidyr) library(dplyr) # The following code only serves as example code for the second simulation (for 10% MAR covariate data) however can be adapted easily for remaining simulations presented. # 1. Load the dataset and ensure the predicted model fits the data. This data came from https://github.com/PacktPublishing/Machine-Learning-with-R-Second-Edition/tree/master/Chapter%2006 insurance <- read.csv("insurance.csv", header=TRUE) insurance <- insurance %>% mutate( age = as.numeric(age), children = as.numeric(children), male = as.integer(ifelse(sex == "male", 1, 0)), smoker = as.integer(ifelse(smoker == "yes", 1, 0)), region = as.factor(region), expenses = expenses / 1000 ) %>% select(-c(sex)) insurance <- dplyr::tibble(insurance) ## 2. Ensure the model fits the data original_model <- lm(expenses ~ age + bmi + children + smoker + region + male, data = insurance) predictions <- predict(original_model) true <- original_model$coefficients ## 3. Replace the outcome with the predicted values from the model insurance_model_predictions <- insurance %>% select(-c(expenses)) insurance_model_predictions$expenses <- unname(predictions) insurance_truth_model <- lm(expenses ~ age + bmi + children + smoker + region + male, data = insurance_model_predictions) # 4. Begin a loop 1000 times that will ampute the data and generate amputed and imputed data. simulation <- function(runs = 2){ results <- array(NA, dim = c(4, runs, 9, 3 )) dimnames(results) <- list(c("misl", "mice_default", "mice_CART", "mice_mixed"), as.character(1:runs), c("Intercept", "age", "bmi", "children", "smoker", "regionnorthwest", "regionsoutheast", "regionsouthwest", "male"), c("Estimate", "2.5 %", "97.5 %") ) for(run in 1:runs){ # This bit of code ensures for each simulation there is a random selection of which variables are missing. mypatterns <- expand.grid(age = 0:1, bmi = 0:1, children = 0:1, smoker = 0:1, region = 0:1,male = 0:1, expenses = 1) mypatterns <- mypatterns[sample(1:nrow(mypatterns), replace = FALSE, 8),] mypatterns <- mypatterns[rowSums(mypatterns) != 0,] type_sample <- sample(c("LEFT", "RIGHT", "MID", "TAIL"), 1) # Begin by amputing the original data amputed_mads <- ampute(insurance_model_predictions, prop = .10, patterns = mypatterns, mech = "MAR", type = type_sample) amputed_data <- amputed_mads$amp # The amputed data needs to be "fixed" becuase of weights calculation (where everything is converted to integers/numerics) amputed_data <- amputed_data %>% mutate( smoker = as.integer(smoker), region = factor(region, labels = levels(insurance_model_predictions$region)), male = as.integer(male) ) # We can then begin the fitting process misl_imputations <- misl(amputed_data, con_method = c("Lrnr_glm_fast", "Lrnr_earth", "Lrnr_ranger", "Lrnr_svm"), bin_method = c("Lrnr_glm_fast", "Lrnr_earth", "Lrnr_ranger"), cat_method = c("Lrnr_independent_binomial", "Lrnr_ranger", "Lrnr_svm")) mice_amputed <- amputed_data %>% mutate(smoker = as.factor(smoker), male = as.factor(male)) # Here we explicitly define what our imputation methods are mice_default_imputations <- mice(mice_amputed, print = FALSE) mice_CART_imputations <- mice(mice_amputed, defaultMethod = c("cart", "cart", "cart", "cart"), print = FALSE) mice_mixed_imputations <- mice(mice_amputed, defaultMethod = c("pmm", "logreg", "cart", "cart"), print = FALSE) # After imputation we can begin with model fitting misl_fit <- lapply(misl_imputations, function(y){ stats::lm(expenses ~ age + bmi + children + smoker + region + male, data = y$datasets) }) mice_default_fit <- with(mice_default_imputations, exp = lm(expenses ~ age + bmi + children + smoker + region + male)) mice_CART_fit <- with(mice_CART_imputations, exp = lm(expenses ~ age + bmi + children + smoker + region + male)) mice_mixed_fit <- with(mice_mixed_imputations, exp = lm(expenses ~ age + bmi + children + smoker + region + male)) # The results can then be pooled misl_pool <- summary(mice::pool(misl_fit), conf.int = TRUE) mice_default_pool <- summary(mice::pool(mice_default_fit), conf.int = TRUE) mice_CART_pool <- summary(mice::pool(mice_CART_fit), conf.int = TRUE) mice_mixed_pool <- summary(mice::pool(mice_mixed_fit), conf.int = TRUE) # This code aggregates the results to a dataframe results["misl", run,"Intercept", ] <- as.numeric(misl_pool[1, c("estimate", "2.5 %", "97.5 %")]) results["misl", run, "age",] <- as.numeric(misl_pool[2, c("estimate", "2.5 %", "97.5 %")]) results["misl", run, "bmi",] <- as.numeric(misl_pool[3, c("estimate", "2.5 %", "97.5 %")]) results["misl", run, "children",] <- as.numeric(misl_pool[4, c("estimate", "2.5 %", "97.5 %")]) results["misl", run, "smoker",] <- as.numeric(misl_pool[5, c("estimate", "2.5 %", "97.5 %")]) results["misl", run, "regionnorthwest",] <- as.numeric(misl_pool[6, c("estimate", "2.5 %", "97.5 %")]) results["misl", run, "regionsoutheast",] <- as.numeric(misl_pool[7, c("estimate", "2.5 %", "97.5 %")]) results["misl", run, "regionsouthwest",] <- as.numeric(misl_pool[8, c("estimate", "2.5 %", "97.5 %")]) results["misl", run, "male",] <- as.numeric(misl_pool[9, c("estimate", "2.5 %", "97.5 %")]) results["mice_default", run,"Intercept", ] <- as.numeric(mice_default_pool[1, c("estimate", "2.5 %", "97.5 %")]) results["mice_default", run, "age",] <- as.numeric(mice_default_pool[2, c("estimate", "2.5 %", "97.5 %")]) results["mice_default", run, "bmi",] <- as.numeric(mice_default_pool[3, c("estimate", "2.5 %", "97.5 %")]) results["mice_default", run, "children",] <- as.numeric(mice_default_pool[4, c("estimate", "2.5 %", "97.5 %")]) results["mice_default", run, "smoker",] <- as.numeric(mice_default_pool[5, c("estimate", "2.5 %", "97.5 %")]) results["mice_default", run, "regionnorthwest",] <- as.numeric(mice_default_pool[6, c("estimate", "2.5 %", "97.5 %")]) results["mice_default", run, "regionsoutheast",] <- as.numeric(mice_default_pool[7, c("estimate", "2.5 %", "97.5 %")]) results["mice_default", run, "regionsouthwest",] <- as.numeric(mice_default_pool[8, c("estimate", "2.5 %", "97.5 %")]) results["mice_default", run, "male",] <- as.numeric(mice_default_pool[9, c("estimate", "2.5 %", "97.5 %")]) results["mice_CART", run,"Intercept", ] <- as.numeric(mice_CART_pool[1, c("estimate", "2.5 %", "97.5 %")]) results["mice_CART", run, "age",] <- as.numeric(mice_CART_pool[2, c("estimate", "2.5 %", "97.5 %")]) results["mice_CART", run, "bmi",] <- as.numeric(mice_CART_pool[3, c("estimate", "2.5 %", "97.5 %")]) results["mice_CART", run, "children",] <- as.numeric(mice_CART_pool[4, c("estimate", "2.5 %", "97.5 %")]) results["mice_CART", run, "smoker",] <- as.numeric(mice_CART_pool[5, c("estimate", "2.5 %", "97.5 %")]) results["mice_CART", run, "regionnorthwest",] <- as.numeric(mice_CART_pool[6, c("estimate", "2.5 %", "97.5 %")]) results["mice_CART", run, "regionsoutheast",] <- as.numeric(mice_CART_pool[7, c("estimate", "2.5 %", "97.5 %")]) results["mice_CART", run, "regionsouthwest",] <- as.numeric(mice_CART_pool[8, c("estimate", "2.5 %", "97.5 %")]) results["mice_CART", run, "male",] <- as.numeric(mice_CART_pool[9, c("estimate", "2.5 %", "97.5 %")]) results["mice_mixed", run,"Intercept", ] <- as.numeric(mice_mixed_pool[1, c("estimate", "2.5 %", "97.5 %")]) results["mice_mixed", run, "age",] <- as.numeric(mice_mixed_pool[2, c("estimate", "2.5 %", "97.5 %")]) results["mice_mixed", run, "bmi",] <- as.numeric(mice_mixed_pool[3, c("estimate", "2.5 %", "97.5 %")]) results["mice_mixed", run, "children",] <- as.numeric(mice_mixed_pool[4, c("estimate", "2.5 %", "97.5 %")]) results["mice_mixed", run, "smoker",] <- as.numeric(mice_mixed_pool[5, c("estimate", "2.5 %", "97.5 %")]) results["mice_mixed", run, "regionnorthwest",] <- as.numeric(mice_mixed_pool[6, c("estimate", "2.5 %", "97.5 %")]) results["mice_mixed", run, "regionsoutheast",] <- as.numeric(mice_mixed_pool[7, c("estimate", "2.5 %", "97.5 %")]) results["mice_mixed", run, "regionsouthwest",] <- as.numeric(mice_mixed_pool[8, c("estimate", "2.5 %", "97.5 %")]) results["mice_mixed", run, "male",] <- as.numeric(mice_mixed_pool[9, c("estimate", "2.5 %", "97.5 %")]) } results } simulate_1000_times <- simulation(1000) ## 5. Take the simulations to generate raw bias (RB), percent bias (PB), coverate rate (CR), and average confidence interval width (AW) organize_results <- function(combined){ RB <- apply(apply(combined[,,, "Estimate"], c(1,3), mean), 1, `-`, true ) PB <- abs(apply(apply(apply(combined[,,, "Estimate"], c(1,3), mean), 1, `-`, true ),2, `/`, true)) CR <- apply(apply(combined[,,, "2.5 %"], c(1,2), `<`, true) & apply(combined[,,, "97.5 %"], c(1,2), `>`, true), c(1,2), mean) AW <- apply(combined[,,, "97.5 %"] - combined[,,, "2.5 %"], c(1,3) , mean) RB <- data.frame(RB) RB$variable <- row.names(RB) PB <- data.frame(PB) PB$variable <- row.names(PB) CR <- data.frame(CR) CR$variable <- row.names(CR) AW <- data.frame(t(AW)) AW$variable <- row.names(AW) RB_df <- gather(RB, imputation, statistic, misl:mice_mixed, factor_key=TRUE) RB_df$measurement <- "RB" PB_df <- gather(PB, imputation, statistic, misl:mice_mixed, factor_key=TRUE) PB_df$measurement <- "PB" CR_df <- gather(CR, imputation, statistic, misl:mice_mixed, factor_key=TRUE) CR_df$measurement <- "CR" AW_df <- gather(AW, imputation, statistic, misl:mice_mixed, factor_key=TRUE) AW_df$measurement <- "AW" results <- rbind(RB_df, PB_df,CR_df, AW_df) return(results) } final_results <- organize_results(simulate_1000_times)