# Set your own working directory #setwd("~/Archiwum_May_2020/Dokumenty/Maszynopisy/CORONAVIRUS/April") # used packages library(lubridate) library(readxl) library(tidyr) library(qgraph) library(rsample) library(gbm) library(xgboost) library(caret) library(h2o) library(pdp) library(ggplot2) library(lime) library(tidyverse) library(Metrics) library(purrr) library(ICEbox) library(ggpubr) # - INITIAL CALCULATIONS --- # data on Covid_19 cases from "COVID-19-geographic-disbtribu" sheet database <- read_excel("Covid_19_2.xlsx", n_max = 10000) database$Year_start_date <- "2019-12-31" database$Year_start_date <- as.Date(database$Year_start_date) database$DateRep_date <- as.Date(database$DateRep) database$Days_pandemic <- database$DateRep_date- database$Year_start_date View(database) #saving intermediate results NOT RUN #write.csv(database, file = "database.csv") # filtering all countries without the Covid database_summary <- database %>% filter(!is.na(Days_pandemic)) %>% filter(Cases > 0) View(database_summary) #Country variables as factor database_summary$Country <- as.factor(database_summary$Country) #Calculation number of days with virus pandemic database_summary$Days_pandemic_num <- as.numeric(database_summary$Days_pandemic) str(database_summary) # ------ CALCULATION OF DEPENDENT VARIABLES ---------------- database_semi_final <- database_summary %>% group_by(Country) %>% summarize(Total_cases = sum(Cases), Total_deaths = sum (Deaths), Pandemic_time = n(), Growth = coef(lm(log(Cases) ~ Days_pandemic_num))[2]) View(database_semi_final) #saving intermediate results. # data requires cleaning of country names(done mannualy) # to appropriate join tables # the final databese is in the sheet "Supplement" # and it was used in analyses write.csv(database_semi_final,file = "database_semi_final.csv") ### ============ FINAL ANALYSES ====================== #reading data data <- read_excel("Covid_19_2.xlsx", sheet = "Supplement", na = "NA", n_max = 300) #---------Visualizing correlations among variables------------- cor_dat <- data.frame( Age = scale(data$Age), Dens = scale(log(data$Dens)), GDP = scale(log(data$GDP)), Lat = scale(data$Lat), Lon = scale(data$Lon), Mig = scale(data$Mig), Time = scale(log(data$Time)), Tour = scale(log(data$Tour)), Urban = scale(data$Urban)) cor_dat <- drop_na(cor_dat) #removing NA cor_mat <- cor(cor_dat) dim(cor_dat) #network graph qgraph::qgraph(cor_mat ,theme ="Reddit", graph = "cor", posCol = "#f90643", negCol = "darkblue", vTrans = 250, minimum = "sig", sampleSize = dim(cor_dat)[1]) corrplot(cor_mat, method = c('number'), type = "upper") # insert "filetype = "tiff"" if want to save graph ## ---- ANALYSIS OF NUMBER OF COVID-19 CASES ------ set.seed(123) data_p <- data.frame(Cases = data$Cases, Age = data$Age, Dens = log(data$Dens), GDP = log(data$GDP), Lat = data$Lat, Lon = data$Lon, Mig = data$Mig, Time = log(data$Time), Tour = log(data$Tour), Urban = data$Urban) # filtering rows with the number of confirmed cases of covid-19 higher than 0 covid <- data_p %>% filter(Cases > 0) # spliting data set.seed(123) covid_split <- initial_split(covid, prop = .7) covid_train <- data.frame(training(covid_split)) covid_test <- data.frame(testing(covid_split)) #building h2o files h2o.no_progress() h2o.init(max_mem_size = "5g") y <- "Cases" x <- setdiff(names(covid_train), y) train.h2o <- as.h2o(covid_train) #initial model with default setting h2o.fit1 <- h2o.gbm( x = x, y = y, distribution = "poisson", training_frame = train.h2o, nfolds = 10, stopping_rounds = 10, stopping_tolerance = 0, seed = 123, ntrees = 10000 ) h2o.performance(h2o.fit1) h2o.fit1 h2o.fit1@parameters$ntrees # cross validated RMSE h2o.rmse(h2o.fit1, xval = TRUE) # GRID SEARCHING FOR OPTIMAL PARAMETERS split <- h2o.splitFrame(train.h2o, ratios = 0.75) train <- split[[1]] valid <- split[[2]] hyper_grid <- list( max_depth = c(1, 3, 5), min_rows = c(1, 5, 10), learn_rate = c(0.001, 0.01, 0.1), learn_rate_annealing = c(.99, 1), sample_rate = c(.5, .75, 1), col_sample_rate = c(.8, .9, 1) ) # perform grid search grid <- h2o.grid( algorithm = "gbm", grid_id = "gbm_grid", x = x, y = y, training_frame = train, validation_frame = valid, hyper_params = hyper_grid, ntrees = 10000, stopping_rounds = 10, stopping_tolerance = 0, seed = 123 ) # collect the results and sort by our model # performance metric of choice grid_perf <- h2o.getGrid( grid_id = "gbm_grid1", sort_by = "mse", decreasing = FALSE ) grid_perf # Grab the model_id for the top model, chosen by validation error best_model_id <- grid_perf@model_ids[[1]] best_model <- h2o.getModel(best_model_id) # Now let’s get performance metrics on the best model h2o.performance(model = best_model, valid = TRUE) best_model@parameters$ntrees # building final model h2o.final <- h2o.gbm( x = x, y = y, distribution = "poisson", training_frame = train.h2o, nfolds = 10, ntrees = 10000, learn_rate = 0.01, learn_rate_annealing = 0.99, max_depth = 3, min_rows = 1, sample_rate = 0.5, col_sample_rate = 0.8, stopping_rounds = 100, stopping_tolerance = 0, seed = 123 ) summary(h2o.final) test.h2o <- as.h2o(covid_test) predicted <- h2o.predict(h2o.final, newdata = test.h2o) # function to calculate R2 rsq <- function (x, y) cor(x, y) ^ 2 rsq(test.h2o$Cases, predicted) x <- as.data.frame(test.h2o$Cases) y <- as.data.frame(predicted) plot(x$Cases,y$predict) # saving results results_cases <- as.data.frame(summary(h2o.final)) write.csv(results_cases, file = "importance_cases.csv") h2o.final@parameters$ntrees # plotting variable importance h2o.varimp_plot(h2o.final, num_of_features = 10) pfun <- function(object, newdata) { as.data.frame(predict(object, newdata = as.h2o(newdata)))[[1L]] } ### plotting results ice <- h2o.final %>% pdp::partial( pred.var = "GDP", pred.fun = pfun, grid.resolution = 100, train = train.h2o, ice = TRUE ) %>% autoplot(rug = TRUE, train = train.h2o, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Log (GDP)", y = "Predicted number of COVID-19 cases") ice ice1 <- h2o.final %>% pdp::partial( pred.var = "Tour", pred.fun = pfun, grid.resolution = 100, train = train.h2o, ice = TRUE ) %>% autoplot(rug = TRUE, train = train.h2o, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Log (Number of tourists)", y = "Predicted number of COVID-19 cases") ice1 ice2 <- h2o.final %>% pdp::partial( pred.var = "Lon", pred.fun = pfun, grid.resolution = 100, train = train.h2o, ice = TRUE ) %>% autoplot(rug = TRUE, train = train.h2o, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Geographic longitude", y = "Predicted number of COVID-19 cases") ice2 ggarrange(ice, ice1, ice2, ncol = 1) %>% ggexport(filename = "cases1.tiff", res = 600, width = 2500, height = 6000) ggarrange(ice, ice1, ice2, ncol = 1) %>% ggexport(filename = "cases.jpeg", res = 600, width = 3000, height = 6000) ## ---- DEATHS ----- data_d <- data.frame(Deaths = data$Deaths, Cases = log(data$Cases), Age = data$Age, Dens = log(data$Dens), GDP = log(data$GDP), Lat = data$Lat, Lon = data$Lon, Mig = data$Mig, Time = log(data$Time), Tour = log(data$Tour), Urban = data$Urban) covid_d <- data_d %>% filter(Deaths > 0)%>% filter(!is.na(Deaths)) str(covid_d) set.seed(123) covid_d_split <- initial_split(covid_d, prop = .7) covid_d_train <- data.frame(training(covid_d_split)) covid_d_test <- data.frame(testing(covid_d_split)) #building h2o files h2o.no_progress() h2o.init(max_mem_size = "5g") yd <- "Deaths" xd <- setdiff(names(covid_d_train), yd) train.h2o_d <- as.h2o(covid_d_train) #initial model with default setting h2o.fit1_d <- h2o.gbm( x = xd, y = yd, distribution = "poisson", training_frame = train.h2o_d, nfolds = 10, stopping_rounds = 10, stopping_tolerance = 0, seed = 123, ntrees = 10000 ) h2o.performance(h2o.fit1_d) summary(h2o.fit1_d) h2o.fit1_d # model stopped after xx trees h2o.fit1_d@parameters$ntrees # cross validated RMSE h2o.rmse(h2o.fit1_d, xval = TRUE) # GRID SEARCHING FOR OPTIMAL PARAMETERS split_d <- h2o.splitFrame(train.h2o_d, ratios = 0.7) train_d <- split_d[[1]] valid_d <- split_d[[2]] hyper_grid_d <- list( max_depth = c(1, 3, 5), min_rows = c(1, 5, 10), learn_rate = c(0.001, 0.01, 0.1), learn_rate_annealing = c(.99, 1), sample_rate = c(.5, .75, 1), col_sample_rate = c(.8, .9, 1) ) # perform grid search grid_d <- h2o.grid( algorithm = "gbm", grid_id = "gbm_grid_d", x = xd, y = yd, training_frame = train_d, validation_frame = valid_d, hyper_params = hyper_grid_d, ntrees = 10000, stopping_rounds = 10, stopping_tolerance = 0, seed = 123 ) # collect the results and sort by our model # performance metric of choice grid_perf_d <- h2o.getGrid( grid_id = "gbm_grid_d", sort_by = "mse", decreasing = FALSE ) grid_perf_d # Grab the model_id for the top model, chosen by validation error best_model_id_d <- grid_perf_d@model_ids[[1]] best_model_d <- h2o.getModel(best_model_id_d) # Now let’s ge performance metrics on the best model h2o.performance(model = best_model_d, valid = TRUE) best_model_d@parameters$ntrees h2o.final_d <- h2o.gbm( x = xg, y = yg, distribution = "gaussian", training_frame = train.h2o_d, nfolds = 10, ntrees = 10000, learn_rate = 0.1, learn_rate_annealing = 1, max_depth = 3, min_rows = 5, sample_rate = 0.75, col_sample_rate = 0.9, seed = 123 ) summary(h2o.final_d) Deaths_res <- as.data.frame(summary(h2o.final_d)) write.csv(Deaths_res, file = "importance_deaths.csv") h2o.final_d@parameters$ntrees h2o.varimp_plot(h2o.final_d, num_of_features = 10) pfun <- function(object, newdata) { as.data.frame(predict(object, newdata = as.h2o(newdata)))[[1L]] } test.h2o_d <- as.h2o(covid_d_test) predicted_d<- h2o.predict(h2o.final_d, newdata = test.h2o_d) rsq <- function (x, y) cor(x, y) ^ 2 rsq(test.h2o_d$Deaths, predicted_d) plot(test.h2o_d$Deaths, predicted_d) a <- as.data.frame(test.h2o_d$Deaths) b <- as.data.frame(predicted_d) plot(a$Deaths,b$predict) # plotting results ice_d <- h2o.final_d %>% pdp::partial( pred.var = "GDP", pred.fun = pfun, grid.resolution = 100, train = covid_d_train, ice = TRUE ) %>% autoplot(rug = TRUE, train = covid_d_train, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Log (GDP)", y = "Predicted number of deaths") ice_d ice1_d <- h2o.final_d %>% pdp::partial( pred.var = "Tour", pred.fun = pfun, grid.resolution = 100, train = covid_d_train, ice = TRUE ) %>% autoplot(rug = TRUE, train = covid_d_train, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Log (Number of tourists)", y = "Predicted number of deaths") ice1_d ice2_d <- h2o.final_d %>% pdp::partial( pred.var = "Lon", pred.fun = pfun, grid.resolution = 100, train = covid_d_train, ice = TRUE ) %>% autoplot(rug = TRUE, train = covid_d_train, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Geographic longitude", y = "Predicted number of deaths") ice2_d ice3_d <- h2o.final_d %>% pdp::partial( pred.var = "Cases", pred.fun = pfun, grid.resolution = 100, train = covid_d_train, ice = TRUE ) %>% autoplot(rug = TRUE, train = covid_d_train, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Log (Number of cases)", y = "Predicted number of deaths") ice3_d ggarrange(ice1_d, ice3_d, ice_d,ice2_d, nrow = 2, ncol=2) %>% ggexport(filename = "deaths1.tiff", res = 600, width = 5000, height = 4000) ggarrange(ice1_d, ice3_d, ice_d,ice2_d, nrow = 2, ncol=2) %>% ggexport(filename = "deaths1.jpeg", res = 600, width = 5000, height = 4000) ## ----------- GROWTH ----- data_g <- data.frame(Growth = data$Growth, Age = data$Age, Dens = log(data$Dens), GDP = log(data$GDP), Lat = data$Lat, Lon = data$Lon, Mig = data$Mig, Time = log(data$Time), Tour = log(data$Tour), Urban = data$Urban) covid_g <- data_g %>% filter(!is.na(Growth)) str(covid_g) set.seed(123) covid_g_split <- initial_split(covid_g, prop = .7) covid_g_train <- data.frame(training(covid_g_split)) covid_g_test <- data.frame(testing(covid_g_split)) #building h2o files h2o.no_progress() h2o.init(max_mem_size = "5g") yg <- "Growth" xg <- setdiff(names(covid_g_train), yg) train.h2o_g <- as.h2o(covid_g_train) #initial model with default setting h2o.fit1_g <- h2o.gbm( x = xg, y = yg, distribution = "gaussian", training_frame = train.h2o_g, nfolds = 10, stopping_rounds = 10, stopping_tolerance = 0, seed = 123, ntrees = 10000 ) h2o.performance(h2o.fit1_g) summary(h2o.fit1_g) h2o.fit1_g # model stopped after xx trees h2o.fit1_g@parameters$ntrees # cross validated RMSE h2o.rmse(h2o.fit1_g, xval = TRUE) # GRID SEARCHING FOR OPTIMAL PARAMETERS split_g <- h2o.splitFrame(train.h2o_g, ratios = 0.7) train_g <- split_g[[1]] valid_g <- split_g[[2]] hyper_grid_g <- list( max_depth = c(1, 3, 5), min_rows = c(1, 5, 10), learn_rate = c(0.001, 0.01, 0.1), learn_rate_annealing = c(.99, 1), sample_rate = c(.5, .75, 1), col_sample_rate = c(.8, .9, 1) ) # perform grid search grid_g <- h2o.grid( algorithm = "gbm", grid_id = "gbm_grid_g", x = xg, y = yg, training_frame = train_g, validation_frame = valid_g, hyper_params = hyper_grid_g, ntrees = 10000, stopping_rounds = 10, stopping_tolerance = 0, seed = 123 ) # collect the results and sort by our model # performance metric of choice grid_perf_g <- h2o.getGrid( grid_id = "gbm_grid_g", sort_by = "mse", decreasing = FALSE ) grid_perf_g # Grab the model_id for the top model, chosen by validation error best_model_id_g <- grid_perf_g@model_ids[[1]] best_model_g <- h2o.getModel(best_model_id_g) # Now let’s ge performance metrics on the best model h2o.performance(model = best_model_g, valid = TRUE) best_model_g@parameters$ntrees h2o.final_g <- h2o.gbm( x = xg, y = yg, distribution = "gaussian", training_frame = train.h2o_g, nfolds = 10, ntrees = 10000, learn_rate = 0.1, learn_rate_annealing = 1, max_depth = 3, min_rows = 5, sample_rate = 0.75, col_sample_rate = 0.9, seed = 123 ) summary(h2o.final_g) growth_res <- as.data.frame(summary(h2o.final_g)) write.csv(growth_res, file = "importance_growth.csv") h2o.final_g@parameters$ntrees h2o.varimp_plot(h2o.final_g, num_of_features = 10) pfun <- function(object, newdata) { as.data.frame(predict(object, newdata = as.h2o(newdata)))[[1L]] } # plotting results # effect of time ice_g_time <- h2o.final_g %>% pdp::partial( pred.var = "Time", pred.fun = pfun, grid.resolution = 100, train = covid_g_train, ice = TRUE ) %>% autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Log (Time in days)", y = "Predicted change in the growth rate") ice_g_time ice1_g_gdp <- h2o.final_g %>% pdp::partial( pred.var = "GDP", pred.fun = pfun, grid.resolution = 100, train = covid_g_train, ice = TRUE ) %>% autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Log (GDP)", y = "Predicted change in the growth rate") ice1_g_gdp ice2_g_lon <- h2o.final_g %>% pdp::partial( pred.var = "Lon", pred.fun = pfun, grid.resolution = 100, train = covid_g_train, ice = TRUE ) %>% autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Geographic longitude", y = "Predicted change in the growth rate") ice2_g_lon ice3_g_dens <- h2o.final_g %>% pdp::partial( pred.var = "Dens", pred.fun = pfun, grid.resolution = 100, train = covid_g_train, ice = TRUE ) %>% autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Log (Population density)", y = "Predicted change in the growth rate") ice3_g_dens ice4_g_age <- h2o.final_g %>% pdp::partial( pred.var = "Age", pred.fun = pfun, grid.resolution = 100, train = covid_g_train, ice = TRUE ) %>% autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Median population age", y = "Predicted change in the growth rate") ice4_g_age ice5_g_lat <- h2o.final_g %>% pdp::partial( pred.var = "Lat", pred.fun = pfun, grid.resolution = 100, train = covid_g_train, ice = TRUE ) %>% autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Geographic latitude", y = "Predicted change in the growth rate") ice5_g_lat ice6_g_tour <- h2o.final_g %>% pdp::partial( pred.var = "Tour", pred.fun = pfun, grid.resolution = 100, train = covid_g_train, ice = TRUE ) %>% autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Log (Number of tourists)", y = "Predicted change in the growth rate") ice6_g_tour ice7_g_mig <- h2o.final_g %>% pdp::partial( pred.var = "Mig", pred.fun = pfun, grid.resolution = 100, train = covid_g_train, ice = TRUE ) %>% autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Migration rate", y = "Predicted change in the growth rate") ice7_g_mig ice8_g_urb <- h2o.final_g %>% pdp::partial( pred.var = "Urban", pred.fun = pfun, grid.resolution = 100, train = covid_g_train, ice = TRUE ) %>% autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) + theme_bw(base_size = 14)+ theme(axis.text= element_text(size = 14))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ labs(x = "Urbanization level", y = "Predicted change in the growth rate") ice8_g_urb ggarrange(ice_g_time, ice1_g_gdp, ice2_g_lon, ice3_g_dens,ice4_g_age, ice5_g_lat, ice6_g_tour, ice7_g_mig,ice8_g_urb, nrow = 3, ncol=3) %>% ggexport(filename = "growth2.tiff", res = 600, width = 7500, height = 6000) test.h2o_g <- as.h2o(covid_g_test) predicted_g<- h2o.predict(h2o.final_g, newdata = test.h2o_g) bias_g <- Metrics::percent_bias(actual = test.h2o_g$Growth, predicted =predicted_g) bias_g rsq <- function (x, y) cor(x, y) ^ 2 rsq(test.h2o_g$Growth, predicted_g) plot(test.h2o_g$Growth, predicted_g) a <- as.data.frame(test.h2o_g$Growth) b <- as.data.frame(predicted_g) plot(a$Growth,b$predict) ## ----- IMPORTANCE ----- library(readxl) Importance <- read_excel("Covid_19_2.xlsx", sheet = "Importance", na = "NA", n_max = 250) View(Importance) Importance<- Importance %>% mutate(Value = if_else(Importance >= 1, '>= 1%', '< 1%')) import_plot <- ggplot(Importance, aes(x = X, y = Importance, fill = Value, col = Value))+ theme_bw(base_size = 14)+ geom_col(alpha = 0.7)+ facet_wrap(~Variable, ncol =1)+ scale_fill_manual(values = c("steelblue",'red'))+ scale_color_manual(values = c("steelblue","red"))+ theme(strip.background = element_rect(fill = '#e6f2f4'))+ theme(strip.text = element_text(size = 14))+ labs(x = "Explanatory variable", y = "Relative importance %") import_plot ggsave(import_plot, units = "cm", dpi= 600, width = 16, height = 19, filename = "Importance.tiff")