library(dplyr)
library(mice)
library(misl) # Note, the code for misl is available on Github: https://github.com/carpenitoThomas/misl
library(sl3)
library(abind)
library(tidyr)
library(dplyr)

# The following code only serves as example code for the second simulation (for 10% MAR covariate data) however can be adapted easily for remaining simulations presented.

# 1. Load the dataset and ensure the predicted model fits the data. This data came from https://github.com/PacktPublishing/Machine-Learning-with-R-Second-Edition/tree/master/Chapter%2006
insurance <- read.csv("insurance.csv", header=TRUE)

insurance <- insurance %>%
  mutate(
    age = as.numeric(age),
    children = as.numeric(children),
    male = as.integer(ifelse(sex == "male", 1, 0)),
    smoker = as.integer(ifelse(smoker == "yes", 1, 0)),
    region = as.factor(region),
    expenses = expenses / 1000
  ) %>%
  select(-c(sex))

insurance <- dplyr::tibble(insurance)

## 2. Ensure the model fits the data
original_model <- lm(expenses ~ age + bmi + children + smoker + region + male, data = insurance)
predictions <- predict(original_model)

true <- original_model$coefficients


## 3. Replace the outcome with the predicted values from the model
insurance_model_predictions <- insurance %>%
  select(-c(expenses))

insurance_model_predictions$expenses <- unname(predictions)
insurance_truth_model <- lm(expenses ~ age + bmi + children + smoker + region + male, data = insurance_model_predictions)

# 4. Begin a loop 1000 times that will ampute the data and generate amputed and imputed data.
simulation <- function(runs = 2){
  results <- array(NA, dim = c(4, runs, 9, 3 ))
  dimnames(results) <- list(c("misl", "mice_default", "mice_CART", "mice_mixed"),
                            as.character(1:runs),
                            c("Intercept", "age", "bmi", "children", "smoker", "regionnorthwest", "regionsoutheast", "regionsouthwest",  "male"),
                            c("Estimate", "2.5 %", "97.5 %")
  )
  for(run in 1:runs){
    
    # This bit of code ensures for each simulation there is a random selection of which variables are missing.
    mypatterns <- expand.grid(age = 0:1, bmi = 0:1, children = 0:1, smoker = 0:1, region = 0:1,male = 0:1, expenses = 1)
    mypatterns <- mypatterns[sample(1:nrow(mypatterns), replace = FALSE, 8),]
    mypatterns <- mypatterns[rowSums(mypatterns) != 0,]
    
    type_sample <- sample(c("LEFT", "RIGHT", "MID", "TAIL"), 1)
    
    # Begin by amputing the original data
    amputed_mads <- ampute(insurance_model_predictions,
                           prop = .10,
                           patterns = mypatterns,
                           mech = "MAR",
                           type = type_sample)

    amputed_data <- amputed_mads$amp
    
    # The amputed data needs to be "fixed" becuase of weights calculation (where everything is converted to integers/numerics)
    amputed_data <- amputed_data %>%
      mutate(
        smoker = as.integer(smoker),
        region = factor(region, labels = levels(insurance_model_predictions$region)),
        male = as.integer(male)
      )

    # We can then begin the fitting process
    misl_imputations <- misl(amputed_data, 
                             con_method = c("Lrnr_glm_fast", "Lrnr_earth", "Lrnr_ranger", "Lrnr_svm"),
                             bin_method = c("Lrnr_glm_fast", "Lrnr_earth", "Lrnr_ranger"),
                             cat_method = c("Lrnr_independent_binomial", "Lrnr_ranger", "Lrnr_svm"))
    
    mice_amputed <- amputed_data %>%
      mutate(smoker = as.factor(smoker),
             male = as.factor(male))
    
    # Here we explicitly define what our imputation methods are
    mice_default_imputations <- mice(mice_amputed, print = FALSE)
    mice_CART_imputations <- mice(mice_amputed, defaultMethod = c("cart", "cart", "cart", "cart"), print = FALSE)
    mice_mixed_imputations <- mice(mice_amputed, defaultMethod = c("pmm", "logreg", "cart", "cart"), print = FALSE)
    
    # After imputation we can begin with model fitting
    misl_fit <- lapply(misl_imputations, function(y){
      stats::lm(expenses ~ age + bmi + children + smoker + region + male, data = y$datasets)
    })    
    mice_default_fit <- with(mice_default_imputations, exp = lm(expenses ~ age + bmi + children + smoker + region + male))
    mice_CART_fit <- with(mice_CART_imputations, exp = lm(expenses ~ age + bmi + children + smoker + region + male))
    mice_mixed_fit <- with(mice_mixed_imputations, exp = lm(expenses ~ age + bmi + children + smoker + region + male))

    # The results can then be pooled
    misl_pool <- summary(mice::pool(misl_fit), conf.int = TRUE)    
    mice_default_pool <- summary(mice::pool(mice_default_fit), conf.int = TRUE)
    mice_CART_pool <- summary(mice::pool(mice_CART_fit), conf.int = TRUE)
    mice_mixed_pool <- summary(mice::pool(mice_mixed_fit), conf.int = TRUE)
  
    # This code aggregates the results to a dataframe
    results["misl", run,"Intercept", ] <- as.numeric(misl_pool[1, c("estimate", "2.5 %", "97.5 %")])
    results["misl", run, "age",] <- as.numeric(misl_pool[2, c("estimate", "2.5 %", "97.5 %")])
    results["misl", run, "bmi",] <- as.numeric(misl_pool[3, c("estimate", "2.5 %", "97.5 %")])
    results["misl", run, "children",] <- as.numeric(misl_pool[4, c("estimate", "2.5 %", "97.5 %")])
    results["misl", run, "smoker",] <- as.numeric(misl_pool[5, c("estimate", "2.5 %", "97.5 %")])
    results["misl", run, "regionnorthwest",] <- as.numeric(misl_pool[6, c("estimate", "2.5 %", "97.5 %")])
    results["misl", run, "regionsoutheast",] <- as.numeric(misl_pool[7, c("estimate", "2.5 %", "97.5 %")])
    results["misl", run, "regionsouthwest",] <- as.numeric(misl_pool[8, c("estimate", "2.5 %", "97.5 %")])
    results["misl", run, "male",] <- as.numeric(misl_pool[9, c("estimate", "2.5 %", "97.5 %")])    
    
    results["mice_default", run,"Intercept", ] <- as.numeric(mice_default_pool[1, c("estimate", "2.5 %", "97.5 %")])
    results["mice_default", run, "age",] <- as.numeric(mice_default_pool[2, c("estimate", "2.5 %", "97.5 %")])
    results["mice_default", run, "bmi",] <- as.numeric(mice_default_pool[3, c("estimate", "2.5 %", "97.5 %")])
    results["mice_default", run, "children",] <- as.numeric(mice_default_pool[4, c("estimate", "2.5 %", "97.5 %")])
    results["mice_default", run, "smoker",] <- as.numeric(mice_default_pool[5, c("estimate", "2.5 %", "97.5 %")])
    results["mice_default", run, "regionnorthwest",] <- as.numeric(mice_default_pool[6, c("estimate", "2.5 %", "97.5 %")])
    results["mice_default", run, "regionsoutheast",] <- as.numeric(mice_default_pool[7, c("estimate", "2.5 %", "97.5 %")])
    results["mice_default", run, "regionsouthwest",] <- as.numeric(mice_default_pool[8, c("estimate", "2.5 %", "97.5 %")])
    results["mice_default", run, "male",] <- as.numeric(mice_default_pool[9, c("estimate", "2.5 %", "97.5 %")])
    
    results["mice_CART", run,"Intercept", ] <- as.numeric(mice_CART_pool[1, c("estimate", "2.5 %", "97.5 %")])
    results["mice_CART", run, "age",] <- as.numeric(mice_CART_pool[2, c("estimate", "2.5 %", "97.5 %")])
    results["mice_CART", run, "bmi",] <- as.numeric(mice_CART_pool[3, c("estimate", "2.5 %", "97.5 %")])
    results["mice_CART", run, "children",] <- as.numeric(mice_CART_pool[4, c("estimate", "2.5 %", "97.5 %")])
    results["mice_CART", run, "smoker",] <- as.numeric(mice_CART_pool[5, c("estimate", "2.5 %", "97.5 %")])
    results["mice_CART", run, "regionnorthwest",] <- as.numeric(mice_CART_pool[6, c("estimate", "2.5 %", "97.5 %")])
    results["mice_CART", run, "regionsoutheast",] <- as.numeric(mice_CART_pool[7, c("estimate", "2.5 %", "97.5 %")])
    results["mice_CART", run, "regionsouthwest",] <- as.numeric(mice_CART_pool[8, c("estimate", "2.5 %", "97.5 %")])
    results["mice_CART", run, "male",] <- as.numeric(mice_CART_pool[9, c("estimate", "2.5 %", "97.5 %")])    
    
    results["mice_mixed", run,"Intercept", ] <- as.numeric(mice_mixed_pool[1, c("estimate", "2.5 %", "97.5 %")])
    results["mice_mixed", run, "age",] <- as.numeric(mice_mixed_pool[2, c("estimate", "2.5 %", "97.5 %")])
    results["mice_mixed", run, "bmi",] <- as.numeric(mice_mixed_pool[3, c("estimate", "2.5 %", "97.5 %")])
    results["mice_mixed", run, "children",] <- as.numeric(mice_mixed_pool[4, c("estimate", "2.5 %", "97.5 %")])
    results["mice_mixed", run, "smoker",] <- as.numeric(mice_mixed_pool[5, c("estimate", "2.5 %", "97.5 %")])
    results["mice_mixed", run, "regionnorthwest",] <- as.numeric(mice_mixed_pool[6, c("estimate", "2.5 %", "97.5 %")])
    results["mice_mixed", run, "regionsoutheast",] <- as.numeric(mice_mixed_pool[7, c("estimate", "2.5 %", "97.5 %")])
    results["mice_mixed", run, "regionsouthwest",] <- as.numeric(mice_mixed_pool[8, c("estimate", "2.5 %", "97.5 %")])
    results["mice_mixed", run, "male",] <- as.numeric(mice_mixed_pool[9, c("estimate", "2.5 %", "97.5 %")])    
    
  }
  results
}

simulate_1000_times <- simulation(1000)

## 5. Take the simulations to generate raw bias (RB), percent bias (PB), coverate rate (CR), and average confidence interval width (AW)
organize_results <- function(combined){
  RB <- apply(apply(combined[,,, "Estimate"], c(1,3), mean), 1, `-`, true )
  PB <- abs(apply(apply(apply(combined[,,, "Estimate"], c(1,3), mean), 1, `-`, true ),2, `/`, true))
  CR <- apply(apply(combined[,,, "2.5 %"], c(1,2), `<`, true) & apply(combined[,,, "97.5 %"], c(1,2), `>`, true), c(1,2), mean)
  AW <- apply(combined[,,, "97.5 %"] - combined[,,, "2.5 %"], c(1,3) , mean)
  
  RB <- data.frame(RB)
  RB$variable <- row.names(RB)
  
  PB <- data.frame(PB)
  PB$variable <- row.names(PB)
  
  CR <- data.frame(CR)
  CR$variable <- row.names(CR)
  
  AW <- data.frame(t(AW))
  AW$variable <- row.names(AW)
  
  RB_df <- gather(RB, imputation, statistic, misl:mice_mixed, factor_key=TRUE)
  RB_df$measurement <- "RB"
  
  PB_df <- gather(PB, imputation, statistic, misl:mice_mixed, factor_key=TRUE)
  PB_df$measurement <- "PB"
  
  CR_df <- gather(CR, imputation, statistic, misl:mice_mixed, factor_key=TRUE)
  CR_df$measurement <- "CR"  
  
  AW_df <- gather(AW, imputation, statistic, misl:mice_mixed, factor_key=TRUE)
  AW_df$measurement <- "AW"   
  
  results <- rbind(RB_df, PB_df,CR_df, AW_df)
  return(results)
}

final_results <- organize_results(simulate_1000_times)