#0 Preliminaries -----------------------------------------------------------

# _0.1 Packages ------------------------------------------------------------

library(tidyverse)
library(tidylog)
library(psych)
library(mice)
library(micemd)


# _0.2 Load data files ----------------------------------------------------------

# Each .rda file is expected to restore one object named da<study>.0001 into
# the workspace; it is converted to a tibble straight away.
load("02760-0001-Data.rda")
wave1 <- as_tibble(da02760.0001)

load("04652-0001-Data.rda")
wave2 <- as_tibble(da04652.0001)

load("36346-0001-Data.rda")
wave3 <- as_tibble(da36346.0001)



# 1 Wrangling -------------------------------------------------------------


# _1.1 Selecting variables -------------------------------------------------

# Wave 1: identifiers, demographics and the wave-1 measures.
# Column order matters downstream (ranges such as M2ID:race are used later).
wave1 <- wave1 %>%
  dplyr::select(
    M2ID, M2FAMNUM, SAMPLMAJ, A1STATUS, A1PAGE_M2, A1PRAGE_2019,
    A1PBYEAR_2019, A1PRSEX, TOT_SIBS, ZYGCAT,
    A1SS7R, A1SHHTOT,     # race, income
    A1SA18, A1SA19,       # exercise
    A1SA20, A1SA21,
    A1PDEPRE, A1PDEPDX,   # depression
    A1SA12G,              # sleep problems
    A1PA40, A1PA43,       # ever and current cigarette smoking
    A1SL1, A1SL2          # social contact
  )

# Wave 2: participant id plus all analysis variables for this wave.
wave2 <- wave2 %>%
  dplyr::select(
    M2ID, B1STINC1,                                       # income
    B1SA30A, B1SA30B, B1SA30C, B1SA30D, B1SA30E, B1SA30F, # exercise
    B1SA31A, B1SA31B, B1SA31C, B1SA31D, B1SA31E, B1SA31F,
    B1PDEPRE, B1PDEPDX,                                   # depression
    B1SA10G,                                              # sleep problems
    B1SA61A, B1SA61B, B1SA61C, B1SA61D,                   # sleep complaints
    B1PA38A, B1PA39,              # ever and current cigarette smoking
    B1SI1, B1SI2                                          # social contact
  )

# Wave 3: participant id plus all analysis variables for this wave.
wave3 <- wave3 %>%
  dplyr::select(
    M2ID, C1STINC,                                        # income
    C1SA26A, C1SA26B, C1SA26C, C1SA26D, C1SA26E, C1SA26F, # exercise
    C1SA27A, C1SA27B, C1SA27C, C1SA27D, C1SA27E, C1SA27F,
    C1PDEPRE, C1PDEPDX,                                   # depression
    C1SA10G,                                              # sleep problems
    C1SA57A, C1SA57B, C1SA57C, C1SA57D,                   # sleep complaints
    C1PA38A, C1PA39,              # ever and current cigarette smoking
    C1SI1, C1SI2                                          # social contact
  )

# Build one wide table: every wave-1 row is kept and later waves are
# attached by participant id.
midus <- wave1 %>%
  left_join(wave2, by = "M2ID") %>%
  left_join(wave3, by = "M2ID") %>%
  as_tibble()

# Give the survey items analysis-friendly names. Every time-varying variable
# ends in `_w<wave>` so the wide-to-long reshape can split on that suffix.
# Exercise items: s_ / w_ prefixes and vig / mod, job / chor / leis parts
# follow the original author's naming.
midus <- midus %>%
  rename(
    # demographics and income
    age = A1PRAGE_2019,
    race = A1SS7R,
    income_w1 = A1SHHTOT,
    income_w2 = B1STINC1,
    income_w3 = C1STINC,
    # wave 1 exercise
    s_vig_w1 = A1SA18,
    w_vig_w1 = A1SA19,
    s_mod_w1 = A1SA20,
    w_mod_w1 = A1SA21,
    # wave 2 exercise
    s_vig_job_w2 = B1SA30A,
    w_vig_job_w2 = B1SA30B,
    s_vig_chor_w2 = B1SA30C,
    w_vig_chor_w2 = B1SA30D,
    s_vig_leis_w2 = B1SA30E,
    w_vig_leis_w2 = B1SA30F,
    s_mod_job_w2 = B1SA31A,
    w_mod_job_w2 = B1SA31B,
    s_mod_chor_w2 = B1SA31C,
    w_mod_chor_w2 = B1SA31D,
    s_mod_leis_w2 = B1SA31E,
    w_mod_leis_w2 = B1SA31F,
    # wave 3 exercise
    s_vig_job_w3 = C1SA26A,
    w_vig_job_w3 = C1SA26B,
    s_vig_chor_w3 = C1SA26C,
    w_vig_chor_w3 = C1SA26D,
    s_vig_leis_w3 = C1SA26E,
    w_vig_leis_w3 = C1SA26F,
    s_mod_job_w3 = C1SA27A,
    w_mod_job_w3 = C1SA27B,
    s_mod_chor_w3 = C1SA27C,
    w_mod_chor_w3 = C1SA27D,
    s_mod_leis_w3 = C1SA27E,
    w_mod_leis_w3 = C1SA27F,
    # wave 1 depression, sleep, smoking, neighbor contact
    depr_sev_w1 = A1PDEPRE,
    depr_bin_w1 = A1PDEPDX,
    sleep_prob_w1 = A1SA12G,
    eversmoke_w1 = A1PA40,
    smoking_w1 = A1PA43,
    neighbcont_w1 = A1SL1,
    neighbconv_w1 = A1SL2,
    # wave 2 depression, sleep, smoking, neighbor contact, sleep complaints
    depr_sev_w2 = B1PDEPRE,
    depr_bin_w2 = B1PDEPDX,
    sleep_prob_w2 = B1SA10G,
    eversmoke_w2 = B1PA38A,
    smoking_w2 = B1PA39,
    neighbcont_w2 = B1SI1,
    neighbconv_w2 = B1SI2,
    sleepc1_w2 = B1SA61A,
    sleepc2_w2 = B1SA61B,
    sleepc3_w2 = B1SA61C,
    sleepc4_w2 = B1SA61D,
    # wave 3 depression, sleep, smoking, neighbor contact, sleep complaints
    depr_sev_w3 = C1PDEPRE,
    depr_bin_w3 = C1PDEPDX,
    sleep_prob_w3 = C1SA10G,
    eversmoke_w3 = C1PA38A,
    smoking_w3 = C1PA39,
    neighbcont_w3 = C1SI1,
    neighbconv_w3 = C1SI2,
    sleepc1_w3 = C1SA57A,
    sleepc2_w3 = C1SA57B,
    sleepc3_w3 = C1SA57C,
    sleepc4_w3 = C1SA57D
  )


# _1.2 Recode variables ----------------------------------------------------
# 
# Sex becomes 0 = male / 1 = female; race becomes 0 = "(1) White" / 1 = any
# other non-missing category.
# NOTE(review): if A1PRSEX is a factor, as.double() returns the level codes
# (1, 2), which is why 1 is subtracted; a character column would behave
# differently -- confirm the column type.
midus <- midus %>%
  mutate(
    A1PRSEX = as.double(
      recode(A1PRSEX, "(1) Male" = "0", "(2) Female" = "1")
    ) - 1,
    # a missing race stays missing because the comparison itself yields NA
    race = ifelse(race == "(1) White", 0, 1)
  ) %>%
  rename(sex = A1PRSEX)


# Exercise frequency items -> numeric 0 (never) ... 5 (several times a week).
# across() replaces the superseded mutate_at(vars()) form; the label-to-value
# mappings are unchanged.

# Wave 1 (mixed-case labels)
midus <- midus %>%
  mutate(across(
    c(s_vig_w1, w_vig_w1, s_mod_w1, w_mod_w1),
    ~ as.numeric(recode(.x,
      "(1) Several times a week or more" = 5,
      "(2) About once a week" = 4,
      "(3) Several times a month" = 3,
      "(4) About once a month" = 2,
      "(5) Less than once a month" = 1,
      "(6) Never" = 0
    ))
  ))

# Waves 2 and 3 (upper-case labels, one item per activity domain)
midus <- midus %>%
  mutate(across(
    c(
      s_vig_job_w2, w_vig_job_w2, # wave 2
      s_vig_chor_w2, w_vig_chor_w2,
      s_vig_leis_w2, w_vig_leis_w2,
      s_mod_job_w2, w_mod_job_w2,
      s_mod_chor_w2, w_mod_chor_w2,
      s_mod_leis_w2, w_mod_leis_w2,
      s_vig_job_w3, w_vig_job_w3, # wave 3
      s_vig_chor_w3, w_vig_chor_w3,
      s_vig_leis_w3, w_vig_leis_w3,
      s_mod_job_w3, w_mod_job_w3,
      s_mod_chor_w3, w_mod_chor_w3,
      s_mod_leis_w3, w_mod_leis_w3
    ),
    ~ as.numeric(recode(.x,
      "(1) SEVERAL TIMES A WEEK" = 5,
      "(2) ONCE A WEEK" = 4,
      "(3) SEVERAL TIMES A MONTH" = 3,
      "(4) ONCE A MONTH" = 2,
      "(5) LESS THAN ONCE A MONTH" = 1,
      "(6) NEVER" = 0
    ))
  ))

# Binary depression indicator -> 0 (negative) / 1 (positive).
# Waves 1-2 use mixed-case labels and wave 3 upper-case labels, hence the two
# mappings. across() replaces the superseded mutate_at(vars()) form.
midus <- midus %>%
  mutate(
    across(
      c(depr_bin_w1, depr_bin_w2),
      ~ as.numeric(recode(.x, "(0) Negative" = 0, "(1) Positive" = 1))
    ),
    across(
      depr_bin_w3,
      ~ as.numeric(recode(.x, "(0) NEGATIVE" = 0, "(1) POSITIVE" = 1))
    )
  )

# Single-item sleep problem frequency -> 0 (not at all) ... 5 (almost every
# day). The item is reverse-keyed in the raw data, so category (6) maps to 0.
# across() replaces the superseded mutate_at(vars()) form.
midus <- midus %>% # wave 1 (mixed-case labels)
  mutate(across(
    sleep_prob_w1,
    ~ as.numeric(recode(.x,
      "(6) Not at all" = 0,
      "(5) Once a month" = 1,
      "(4) Several times a month" = 2,
      "(3) Once a week" = 3,
      "(2) Several times a week" = 4,
      "(1) Almost every day" = 5
    ))
  ))

midus <- midus %>% # waves 2-3 (upper-case labels)
  mutate(across(
    c(sleep_prob_w2, sleep_prob_w3),
    ~ as.numeric(recode(.x,
      "(6) NOT AT ALL" = 0,
      "(5) ONCE A MONTH" = 1,
      "(4) SEVERAL TIMES A MONTH" = 2,
      "(3) ONCE A WEEK" = 3,
      "(2) SEVERAL TIMES A WEEK" = 4,
      "(1) ALMOST EVERY DAY" = 5
    ))
  ))

# Four extra sleep-complaint items collected at waves 2 and 3 -> 0 ... 4.
# "(5) ALMOST ALWAYS" was previously scored 5, which skipped the value 4 and
# broke the equal spacing of the scale; it is scored 4 here.
midus <- midus %>%
  mutate(across(
    c(
      sleepc1_w2, sleepc2_w2, sleepc3_w2, sleepc4_w2,
      sleepc1_w3, sleepc2_w3, sleepc3_w3, sleepc4_w3
    ),
    ~ as.numeric(recode(.x,
      "(1) NEVER" = 0,
      "(2) RARELY" = 1,
      "(3) SOMETIMES" = 2,
      "(4) OFTEN" = 3,
      "(5) ALMOST ALWAYS" = 4
    ))
  ))

# Ever / current smoking -> 1 (yes) / 0 (no).
# Uses the same `~ as.numeric(recode(...))` form as the other recodes instead
# of the deprecated funs() helper, and across() instead of mutate_at().
midus <- midus %>% # wave 1 (mixed-case labels)
  mutate(across(
    c(eversmoke_w1, smoking_w1),
    ~ as.numeric(recode(.x, "(1) Yes" = 1, "(2) No" = 0))
  ))

# At wave 1 never-smokers are NA on current smoking (per the original
# author's note); score them as non-smokers.
midus <- midus %>%
  mutate(smoking_w1 = replace_na(smoking_w1, 0))

midus <- midus %>% # waves 2 and 3 (upper-case labels)
  mutate(across(
    c(eversmoke_w2, smoking_w2, eversmoke_w3, smoking_w3),
    ~ as.numeric(recode(.x, "(1) YES" = 1, "(2) NO" = 0))
  ))



# Drop participants whose smoking reports contradict each other across waves:
# they say they never smoked at wave 2 (or 3) although an earlier wave
# recorded ever or current smoking. The flag is NA when the comparison cannot
# be evaluated; those participants are kept.
midus <- midus %>%
  mutate(
    errorsmoke_w2 = ifelse(
      eversmoke_w2 == 0 & (eversmoke_w1 == 1 | smoking_w1 == 1),
      "error", "ok"
    ),
    errorsmoke_w3 = ifelse(
      eversmoke_w3 == 0 &
        (eversmoke_w1 == 1 | eversmoke_w2 == 1 |
          smoking_w1 == 1 | smoking_w2 == 1),
      "error", "ok"
    )
  ) %>%
  filter(
    !(errorsmoke_w2 %in% "error"), # NA or "ok" passes
    !(errorsmoke_w3 %in% "error")
  ) %>%
  dplyr::select(-c(errorsmoke_w2, errorsmoke_w3))

# Current smoking at waves 2 and 3 is NA both for never-smokers and for true
# non-response. The depression score does not share that ambiguity, so it is
# used as the marker of participation:
#   smoking NA, depression present -> 0 (non-smoker)
#   smoking NA, depression NA      -> stays NA (truly missing)
#   otherwise                      -> reported value
midus <- midus %>%
  mutate(
    smoking_w2 = ifelse(
      is.na(smoking_w2) & !is.na(depr_sev_w2), 0, smoking_w2
    ),
    smoking_w3 = ifelse(
      is.na(smoking_w3) & !is.na(depr_sev_w3), 0, smoking_w3
    )
  )


# Neighbor contact / conversation frequency -> 0 (never or hardly ever) ...
# 5 (almost every day). Items are reverse-keyed in the raw data.
# across() replaces the superseded mutate_at(vars()) form.
midus <- midus %>% # wave 1 (mixed-case labels)
  mutate(across(
    c(neighbcont_w1, neighbconv_w1),
    ~ as.numeric(recode(.x,
      "(6) Never or hardly ever" = 0,
      "(5) Less than once a month" = 1,
      "(4) 1-3 times a month" = 2,
      "(3) About once a week" = 3,
      "(2) Several times a week" = 4,
      "(1) Almost every day" = 5
    ))
  ))

midus <- midus %>% # waves 2 and 3 (upper-case labels)
  mutate(across(
    c(neighbcont_w2, neighbcont_w3, neighbconv_w2, neighbconv_w3),
    ~ as.numeric(recode(.x,
      "(6) NEVER OR HARDLY EVER" = 0,
      "(5) LESS THAN ONCE A MONTH" = 1,
      "(4) 1-3 TIMES A MONTH" = 2,
      "(3) ABOUT ONCE A WEEK" = 3,
      "(2) SEVERAL TIMES A WEEK" = 4,
      "(1) ALMOST EVERY DAY" = 5
    ))
  ))


# Composite scores. New columns are appended in the order written here.
midus <- midus %>%
  # Waves 2-3 ask about job, chores and leisure separately; sum the three
  # domains to get the four exercise variables that wave 1 already has
  # (w_vig, s_vig, w_mod, s_mod).
  mutate(
    w_vig_w2 = w_vig_job_w2 + w_vig_chor_w2 + w_vig_leis_w2,
    s_vig_w2 = s_vig_job_w2 + s_vig_chor_w2 + s_vig_leis_w2,
    w_mod_w2 = w_mod_job_w2 + w_mod_chor_w2 + w_mod_leis_w2,
    s_mod_w2 = s_mod_job_w2 + s_mod_chor_w2 + s_mod_leis_w2,
    w_vig_w3 = w_vig_job_w3 + w_vig_chor_w3 + w_vig_leis_w3,
    s_vig_w3 = s_vig_job_w3 + s_vig_chor_w3 + s_vig_leis_w3,
    w_mod_w3 = w_mod_job_w3 + w_mod_chor_w3 + w_mod_leis_w3,
    s_mod_w3 = s_mod_job_w3 + s_mod_chor_w3 + s_mod_leis_w3
  ) %>%
  # Overall exercise per wave: average the w_ and s_ items (divide by 2) and
  # weight vigorous activity by 7 and moderate activity by 5 (MET units,
  # per the original author's note).
  mutate(
    exercise_w1 = (7 * (w_vig_w1 + s_vig_w1)) / 2 +
      (5 * (w_mod_w1 + s_mod_w1)) / 2,
    exercise_w2 = (7 * (w_vig_w2 + s_vig_w2)) / 2 +
      (5 * (w_mod_w2 + s_mod_w2)) / 2,
    exercise_w3 = (7 * (w_vig_w3 + s_vig_w3)) / 2 +
      (5 * (w_mod_w3 + s_mod_w3)) / 2
  ) %>%
  # Neighbor contact: mean of the contact and conversation items.
  mutate(
    neighbs_w1 = (neighbcont_w1 + neighbconv_w1) / 2,
    neighbs_w2 = (neighbcont_w2 + neighbconv_w2) / 2,
    neighbs_w3 = (neighbcont_w3 + neighbconv_w3) / 2
  ) %>%
  # Sleep complaints: sum of the four items (waves 2 and 3 only).
  mutate(
    sleepc_w2 = sleepc1_w2 + sleepc2_w2 + sleepc3_w2 + sleepc4_w2,
    sleepc_w3 = sleepc1_w3 + sleepc2_w3 + sleepc3_w3 + sleepc4_w3
  )


# How well does the single sleep-problem item (all three waves) track the
# 4-item sleep complaint scale (waves 2 and 3 only)? Pairwise-complete cases.
with(midus, cor(sleepc_w3, sleep_prob_w3, use = "complete.obs"))
with(midus, cor(sleepc_w2, sleep_prob_w2, use = "complete.obs"))
# W2 full scale with W1 single item (original author's note: r = .44)
with(midus, cor(sleepc_w2, sleep_prob_w1, use = "complete.obs"))
# W2 single item with W1 single item (original author's note: r = .44)
with(midus, cor(sleep_prob_w2, sleep_prob_w1, use = "complete.obs"))


# Save the wide data set in both formats
write.csv(x = midus, file = "midus.csv")
write_rds(x = midus, "midus.rds")


# _1.3 Wide to long ------------------------------------------------------------

midus <- read_rds("midus.rds")

# Standardise the exercise composite within each wave (M = 0, SD = 1): wave 1
# is built from a different number of items than waves 2 and 3, so the raw
# scores are not on a common scale.
# Columns are selected by name rather than by position, and the mean / SD are
# computed from the data rather than copied from printed output, so the step
# stays correct if columns are added upstream or the sample changes.
print(
  describe(midus[, c("exercise_w1", "exercise_w2", "exercise_w3")]),
  digits = 5
)

midus <- midus %>%
  mutate(
    # keep the unstandardised scores for imputation and descriptives
    exercise_raw_w1 = exercise_w1,
    exercise_raw_w2 = exercise_w2,
    exercise_raw_w3 = exercise_w3,
    # scale() ignores NAs when computing the mean and SD
    exercise_w1 = as.vector(scale(exercise_w1)),
    exercise_w2 = as.vector(scale(exercise_w2)),
    exercise_w3 = as.vector(scale(exercise_w3))
  )

# Reshape to one row per participant and wave.
midus_long <- midus %>%
  pivot_longer(
    cols = -c(M2ID:race), # between-person variables are not stacked
    # ".value": the part of the name before the separator becomes the output
    # column; the part after it goes into the new "wave" column
    names_to = c(".value", "wave"),
    names_sep = "_w",
    values_drop_na = FALSE
  ) %>%
  # Remove the extra row per person that the reshape can add, identified by
  # a missing wave. Test for NA explicitly instead of comparing with the
  # string "NA".
  filter(!is.na(wave)) %>%
  mutate(wave = as.numeric(wave))


# _1.4 Centering variables for completer dataset -------------------------------------------------

# Standardise across all person-wave rows; creates sleep_prob_z, neighbs_z,
# exercise_z, income_z and age_z.
midus_long <- midus_long %>%
  mutate(across(
    c(sleep_prob, neighbs, exercise, income, age),
    ~ as.vector(scale(.x)),
    .names = "{.col}_z"
  ))

# Person means (between-person part) and deviations from them (within-person
# part). Binary smoking and depression are centered on their raw values.
midus_long <- midus_long %>%
  group_by(M2ID) %>%
  mutate(
    sleep_mean = mean(sleep_prob_z, na.rm = TRUE),
    neigh_mean = mean(neighbs_z, na.rm = TRUE),
    smoke_mean = mean(smoking, na.rm = TRUE),
    exercise_mean = mean(exercise_z, na.rm = TRUE),
    dep_bin_mean = mean(depr_bin, na.rm = TRUE),
    income_mean = mean(income_z, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(
    sleep_c = sleep_prob_z - sleep_mean,
    neigh_c = neighbs_z - neigh_mean,
    smoke_c = smoking - smoke_mean,
    exercise_c = exercise_z - exercise_mean,
    dep_bin_c = depr_bin - dep_bin_mean,
    income_c = income_z - income_mean
  )

# Save the long data set in both formats
write.csv(x = midus_long, file = "midus_long.csv")
write_rds(x = midus_long, "midus_long.rds")


# _1.5 Create completer dataset ---------------------------------------------------------

# Complete-case data set: only participants with no missing value on any
# analysis variable at any wave. The helper columns na_sum and na_sum_id
# stay in the saved file.
completer <- read_rds("midus_long.rds") %>%
  mutate(wave_num = (as.double(wave) - 1) / 2) %>% # waves 1, 2, 3 -> 0, .5, 1
  dplyr::select(
    M2ID,
    # within-person (centered) variables
    dep_bin_c, exercise_c, neigh_c, smoke_c, sleep_c, wave_num, income_c,
    # between-person (person-mean) variables
    dep_bin_mean, exercise_mean, neigh_mean, smoke_mean, sleep_mean,
    income_mean,
    # covariates
    sex, age_z, race
  ) %>%
  mutate(na_sum = rowSums(is.na(.))) %>% # missing values in this row
  group_by(M2ID) %>%
  mutate(na_sum_id = sum(na_sum)) %>%    # missing values for this person
  ungroup() %>%
  filter(na_sum_id == 0) %>%
  dplyr::select(-M2ID)


write_rds(completer, "midus_long_completers.rds")
  


# 2 Missing data ------------------------------------------------------------

#Exploring missingness

# Reload the wide (one row per participant) file saved earlier
midus <- read_rds("midus.rds") %>%
  as_tibble()

# Missingness indicator: returns 1 where `x` is NA and 0 elsewhere.
na_to_1 <- function(x) {
  is_missing <- is.na(x)
  ifelse(is_missing, 1, 0)
}

# Analysis variables plus one missingness indicator per time-varying
# variable. The indicators are named "<variable>_1" (e.g. income_w2_1) and
# are 1 when the value is missing.
midus_r <- midus %>%
  dplyr::select(
    M2ID, age, sex, race,
    income_w1, income_w2, income_w3,
    sleep_prob_w1, sleep_prob_w2, sleep_prob_w3,
    smoking_w1, smoking_w2, smoking_w3,
    depr_bin_w1, depr_bin_w2, depr_bin_w3,
    exercise_w1, exercise_w2, exercise_w3,
    neighbs_w1, neighbs_w2, neighbs_w3
  ) %>%
  mutate(across(c(race, sex), as.factor)) %>%
  mutate(across(income_w1:neighbs_w3, na_to_1, .names = "{.col}_1"))


# _2.1 Exploring predictors of missingness --------------------------------

# For every time-varying variable, test whether being missing at wave 2 or
# wave 3 is predicted by age, race, sex, or the wave-1 value of each of the
# other variables: one single-predictor logistic regression per combination.
# Outcomes are the "<variable>_w<wave>_1" missingness indicators in midus_r.
#
# The loop fits the same models, in the same order, as the previous
# one-call-per-line version: outcome -> predictor -> wave 2 then wave 3.
# Summaries are printed explicitly so they also appear under source().
miss_vars <- c(
  "income", "sleep_prob", "exercise", "depr_bin", "smoking", "neighbs"
)
miss_demographics <- c("age", "race", "sex")

for (miss_outcome in miss_vars) {
  # a variable's own wave-1 value is not used to predict its missingness
  miss_predictors <- c(
    miss_demographics,
    paste0(setdiff(miss_vars, miss_outcome), "_w1")
  )
  for (miss_predictor in miss_predictors) {
    for (miss_wave in 2:3) {
      miss_formula <- as.formula(
        paste0(miss_outcome, "_w", miss_wave, "_1 ~ ", miss_predictor)
      )
      # bquote() puts the formula itself into the call, so the printed
      # "Call:" shows e.g. income_w2_1 ~ age instead of a variable name
      miss_fit <- eval(bquote(
        glm(.(miss_formula), family = "binomial", data = midus_r)
      ))
      print(summary(miss_fit))
    }
  }
}

# 3 Imputation --------------------------------------------------------------

# _3.1 Basic preparation ------------------------------------------------------------


# Reload the long (one row per participant and wave) file
midus <- read_rds("midus_long.rds") %>%
  as_tibble()


# Minimal data set for imputation: cluster id, time, covariates and the raw
# (unstandardised) analysis variables, with categorical variables as factors.
midusb <- midus %>%
  dplyr::select(
    M2ID, wave, age, sex, race,
    depr_bin, exercise_raw, neighbs, smoking, sleep_prob, income
  ) %>%
  mutate(
    wave = (as.double(wave) - 1) / 2, # waves 1, 2, 3 -> 0, .5, 1
    age = as.vector(age),
    across(c(depr_bin, smoking, race, sex), as.factor),
    sleep_prob = as.ordered(sleep_prob)
  )

# _3.2 Multilevel imputation setup --------------------------------------------
# See: "Multiple Imputation of Missing Data for Multilevel  Models: Simulations and Recommendations"
# Example of what the -2, 2, 3, and 4 mean in the predictor matrix (https://stefvanbuuren.name/fimd/sec-mlguidelines.html)


# Imputation method per column: a named character vector with one entry per
# column of midusb. "" is left for columns that get no method here (M2ID and
# wave).
# Original author's key: "2l.pan" = normal, homogeneous variances;
# "2l.norm" = normal, heterogeneous variances; "2l.lmer" = normal;
# "2l.binary" = binary. The "2lonly." methods are used for the
# person-level variables (age, race, sex).
# NOTE(review): sleep_prob is turned into an ordered factor when midusb is
# built but is assigned "2l.pan" here -- confirm this is intended.
impMethod <- setNames(character(ncol(midusb)), colnames(midusb))

impMethod[c("age", "race", "sex")] <-
  c("2lonly.pan", "2lonly.binary", "2lonly.binary")
impMethod[c("depr_bin", "smoking")] <- "2l.binary"
impMethod[c("exercise_raw", "neighbs", "sleep_prob", "income")] <- "2l.pan"


# Predictor matrix: one row per variable to impute, one column per potential
# predictor. Codes (see the guideline linked above):
#   -2 = cluster variable
#    1 = overall effect
#    3 = overall + group-level effect
#    0 = not used as a predictor
imp_vars <- colnames(midusb)
predMatrix <- matrix(
  0, length(imp_vars), length(imp_vars),
  dimnames = list(imp_vars, imp_vars)
)

lvl2_vars <- c("age", "race", "sex") # constant within person
lvl1_vars <- c(
  "depr_bin", "exercise_raw", "neighbs", "smoking", "sleep_prob", "income"
)

# M2ID identifies the cluster for every imputed variable
predMatrix[c(lvl2_vars, lvl1_vars), "M2ID"] <- -2

# Person-level targets: every other variable enters with an overall effect
predMatrix[lvl2_vars, c("wave", lvl2_vars, lvl1_vars)] <- 1

# Wave-level targets: person-level covariates get an overall effect; wave
# and the other wave-level variables get overall + group-level effects
predMatrix[lvl1_vars, lvl2_vars] <- 1
predMatrix[lvl1_vars, c("wave", lvl1_vars)] <- 3

# A variable never predicts itself
diag(predMatrix) <- 0


# _3.3 Multilevel imputation ---------------------------------------------------



# Run the multilevel imputation in parallel: 15 imputed data sets on 3 nodes.
# NOTE(review): no seed is passed, so results will presumably differ between
# runs -- confirm whether reproducibility is needed.
imputed <- mice.par(midusb,
  m = 15, imputationMethod = impMethod,
  predictorMatrix = predMatrix, nnodes = 3
)

# Cache the result, then reload it from disk for the following steps
write_rds(imputed, "midus_imputed_multilevel.rds")

midus_long_imp <- read_rds("midus_imputed_multilevel.rds")

# Stack the original data and all imputations in long format.
# mice::complete is called explicitly so the call does not depend on package
# attach order (tidyr also exports a complete()); TRUE is spelled out because
# T is an ordinary, reassignable variable.
midus_long_imp <- as_tibble(
  mice::complete(midus_long_imp, "long", include = TRUE)
)



# _3.4 Creating imputed datasets for network models -----------------------


# Standardise and person-center the imputed data, separately within each
# imputation (.imp).
midus_long_imp_zc <- midus_long_imp %>%
  # These three were factors for imputation. as.double() on a factor returns
  # the level codes (1, 2, ...), so subtract 1 to restore the 0-based coding
  # used in the completer data and in the imputed-data summaries. Without it
  # the person means of smoking and depression are shifted up by 1.
  mutate(
    sleep_prob = as.double(sleep_prob) - 1,
    depr_bin = as.double(depr_bin) - 1,
    smoking = as.double(smoking) - 1
  ) %>%
  group_by(.imp) %>%
  mutate(
    sleep_prob_z = as.vector(scale(sleep_prob)),
    neighbs_z = as.vector(scale(neighbs)),
    income_z = as.vector(scale(income)),
    age_z = as.vector(scale(age))
  ) %>%
  ungroup() %>%
  group_by(.imp, wave) %>% # exercise is standardised within wave
  mutate(exercise_z = as.vector(scale(exercise_raw))) %>%
  ungroup() %>%
  group_by(.imp, M2ID) %>% # person (cluster) means
  mutate(
    sleep_mean = mean(sleep_prob_z, na.rm = TRUE),
    neigh_mean = mean(neighbs_z, na.rm = TRUE),
    smoke_mean = mean(smoking, na.rm = TRUE),
    exercise_mean = mean(exercise_z, na.rm = TRUE),
    dep_bin_mean = mean(depr_bin, na.rm = TRUE),
    income_mean = mean(income_z, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate( # person-centered (within-person) scores
    sleep_c = sleep_prob_z - sleep_mean,
    neigh_c = neighbs_z - neigh_mean,
    smoke_c = smoking - smoke_mean,
    exercise_c = exercise_z - exercise_mean,
    dep_bin_c = depr_bin - dep_bin_mean,
    income_c = income_z - income_mean
  )

midus_long_imp_zc

# Convert the processed long data back into mids objects and save them:
# the full set, the within-person variables and the between-person variables.

# Full data
midus_mids <- midus_long_imp_zc %>%
  as.mids(.imp = ".imp", .id = ".id")
write_rds(midus_mids, "midus_mids.rds")

# Within-person: person-centered variables plus wave
midus_long_imp_zc_within <- midus_long_imp_zc %>%
  dplyr::select(
    .imp, .id,
    dep_bin_c, exercise_c, neigh_c, sleep_c, smoke_c, income_c, wave
  )

midus_mids_within <- midus_long_imp_zc_within %>%
  as.mids(.imp = ".imp", .id = ".id")
write_rds(midus_mids_within, "midus_mids_within.rds")

# Between-person: person means plus covariates; sex is replaced by a numeric
# `female` column and race is made numeric (as.double() minus 1 in both cases)
midus_long_imp_zc_between <- midus_long_imp_zc %>%
  dplyr::select(
    .imp, .id,
    dep_bin_mean, exercise_mean, neigh_mean, smoke_mean, sleep_mean,
    income_mean, sex, age_z, race
  ) %>%
  mutate(
    female = as.double(sex) - 1,
    race = as.double(race) - 1
  ) %>%
  dplyr::select(-sex)

midus_mids_between <- midus_long_imp_zc_between %>%
  as.mids(.imp = ".imp", .id = ".id")
write_rds(midus_mids_between, "midus_mids_between.rds")


# 4 Summary statistics ------------------------------------------------------


# _4.1 Raw variables ----------------------------------------------------------


# Descriptive statistics of the observed (non-imputed) data, one table per
# wave.
midus_long <- read_rds("midus_long.rds") %>%
  as_tibble()

midus_sum <- midus_long %>%
  dplyr::select(
    M2ID, wave, age, sex, race, income,
    depr_bin, sleep_prob, smoking, neighbs, exercise_raw
  )

w1 <- filter(midus_sum, wave == 1)
w2 <- filter(midus_sum, wave == 2)
w3 <- filter(midus_sum, wave == 3)

names(w1)

print(describe(w1), digits = 4)
print(describe(w2), digits = 4)
print(describe(w3), digits = 4)


# _4.2 Imputed data -----------------------------------------------------------

# Descriptive statistics of the imputed data, pooled over all imputations.
midus_long_imp <- read_rds("midus_imputed_multilevel.rds")

midus_sum_imp <- complete(midus_long_imp, "long", include = TRUE) %>%
  as_tibble() %>%
  filter(.imp != 0) %>% # .imp 0 is the original, unimputed data
  mutate(
    # as.double() minus 1, as in the original: turns factor level codes
    # (1, 2, ...) into 0-based values
    race = as.double(race) - 1,
    sex = as.double(sex) - 1,
    depr_bin = as.double(depr_bin) - 1,
    smoking = as.double(smoking) - 1,
    sleep_prob = as.double(sleep_prob) - 1
  )

describe(midus_sum_imp)


# Wave was rescaled to 0, .5 and 1 before imputation
w1_imp <- filter(midus_sum_imp, wave == 0)
w2_imp <- filter(midus_sum_imp, wave == .5)
w3_imp <- filter(midus_sum_imp, wave == 1)

print(describe(w1_imp), digits = 4)
print(describe(w2_imp), digits = 4)
print(describe(w3_imp), digits = 4)

# _4.3 Completers   -----------------------------------------------------------

# Descriptive statistics for completers only: participants with no missing
# value on any summary variable at any wave.
completer <- read_rds("midus_long.rds")

midus_sum_comp <- completer %>%
  dplyr::select(
    M2ID, age, sex, race, depr_bin, exercise_raw, neighbs,
    smoking, sleep_prob, wave, income
  ) %>%
  mutate(na_sum = rowSums(is.na(.))) %>% # missing values in this row
  group_by(M2ID) %>%
  mutate(na_sum_id = sum(na_sum)) %>%    # missing values for this person
  ungroup() %>%
  filter(na_sum_id == 0) %>%
  dplyr::select(-M2ID)

describe(midus_sum_comp)


w1_comp <- filter(midus_sum_comp, wave == 1)
w2_comp <- filter(midus_sum_comp, wave == 2)
w3_comp <- filter(midus_sum_comp, wave == 3)

print(describe(w1_comp), digits = 4)
print(describe(w2_comp), digits = 4)
print(describe(w3_comp), digits = 4)