# Set your own working directory
#setwd("~/Archiwum_May_2020/Dokumenty/Maszynopisy/CORONAVIRUS/April")


# used packages
library(lubridate)
library(readxl)
library(tidyr)
library(qgraph)
library(rsample)      
library(gbm)          
library(xgboost)      
library(caret)        
library(h2o)          
library(pdp)          
library(ggplot2)      
library(lime)        
library(tidyverse)
library(Metrics)
library(purrr)
library(ICEbox)
library(ggpubr)

# - INITIAL CALCULATIONS ---


# data on Covid_19 cases from "COVID-19-geographic-disbtribu" sheet
database <- read_excel("Covid_19_2.xlsx", n_max = 10000) 
database$Year_start_date <- "2019-12-31"
database$Year_start_date <- as.Date(database$Year_start_date)
database$DateRep_date <- as.Date(database$DateRep) 
database$Days_pandemic <- database$DateRep_date- database$Year_start_date
View(database)

#saving intermediate results NOT RUN
#write.csv(database, file = "database.csv")

# filtering all countries without the Covid
database_summary <- database %>% 
  filter(!is.na(Days_pandemic)) %>%
  filter(Cases > 0)
View(database_summary)

#Country variables as factor
database_summary$Country <- as.factor(database_summary$Country)
#Calculation number of days with virus pandemic
database_summary$Days_pandemic_num <- as.numeric(database_summary$Days_pandemic)
str(database_summary)

# ------ CALCULATION OF DEPENDENT VARIABLES ----------------
database_semi_final <- database_summary %>%
  group_by(Country) %>%
  summarize(Total_cases = sum(Cases),
            Total_deaths = sum (Deaths),
            Pandemic_time = n(),
            Growth = coef(lm(log(Cases) ~ Days_pandemic_num))[2])

View(database_semi_final)


#saving intermediate results. 
# data requires cleaning of country names(done mannualy)
# to appropriate join tables
# the final databese is in the sheet "Supplement"
# and it was used in analyses
write.csv(database_semi_final,file = "database_semi_final.csv")



### ============ FINAL ANALYSES ======================


#reading data
data <- read_excel("Covid_19_2.xlsx", sheet = "Supplement", 
                   na = "NA", n_max = 300)

#---------Visualizing correlations among variables-------------

cor_dat <- data.frame( Age = scale(data$Age),
                       Dens = scale(log(data$Dens)),
                       GDP = scale(log(data$GDP)),
                       Lat = scale(data$Lat),
                       Lon = scale(data$Lon),
                       Mig = scale(data$Mig),
                       Time = scale(log(data$Time)),
                       Tour = scale(log(data$Tour)),
                       Urban = scale(data$Urban))
cor_dat <- drop_na(cor_dat) #removing NA
cor_mat <- cor(cor_dat)
dim(cor_dat)

#network graph
qgraph::qgraph(cor_mat ,theme  ="Reddit", graph = "cor",
       posCol = "#f90643", negCol = "darkblue",
       vTrans = 250, minimum = "sig", sampleSize = dim(cor_dat)[1])
corrplot(cor_mat, method = c('number'), type = "upper")

# insert "filetype = "tiff"" if want to save graph

## ---- ANALYSIS OF NUMBER OF COVID-19 CASES ------
set.seed(123)
data_p <- data.frame(Cases = data$Cases,
                     Age = data$Age,
                     Dens = log(data$Dens),
                     GDP = log(data$GDP),
                     Lat = data$Lat,
                     Lon = data$Lon,
                     Mig = data$Mig,
                     Time = log(data$Time),
                     Tour = log(data$Tour),
                     Urban = data$Urban)

# filtering rows with the number of confirmed cases of covid-19 higher than 0
covid <- data_p %>%
  filter(Cases > 0)

# spliting data
set.seed(123)
covid_split <- initial_split(covid, prop = .7)
covid_train <- data.frame(training(covid_split))
covid_test  <- data.frame(testing(covid_split))

#building h2o files
h2o.no_progress()
h2o.init(max_mem_size = "5g")
y <- "Cases"
x <- setdiff(names(covid_train), y)
train.h2o <- as.h2o(covid_train)
#initial model with default setting
h2o.fit1 <- h2o.gbm(
  x = x,
  y = y,
  distribution = "poisson",
  training_frame = train.h2o,
  nfolds = 10,
  stopping_rounds = 10,
  stopping_tolerance = 0,
  seed = 123,
  ntrees = 10000
)
h2o.performance(h2o.fit1)

h2o.fit1

h2o.fit1@parameters$ntrees
# cross validated RMSE
h2o.rmse(h2o.fit1, xval = TRUE)


# GRID SEARCHING FOR OPTIMAL PARAMETERS
split <- h2o.splitFrame(train.h2o, ratios = 0.75)
train <- split[[1]]
valid <- split[[2]]

hyper_grid <- list(
  max_depth = c(1, 3, 5),
  min_rows = c(1, 5, 10),
  learn_rate = c(0.001, 0.01, 0.1),
  learn_rate_annealing = c(.99, 1),
  sample_rate = c(.5, .75, 1),
  col_sample_rate = c(.8, .9, 1)
)

# perform grid search 
grid <- h2o.grid(
  algorithm = "gbm",
  grid_id = "gbm_grid",
  x = x, 
  y = y, 
  training_frame = train,
  validation_frame = valid,
  hyper_params = hyper_grid,
  ntrees = 10000,
  stopping_rounds = 10,
  stopping_tolerance = 0,
  seed = 123
)

# collect the results and sort by our model
# performance metric of choice
grid_perf <- h2o.getGrid(
  grid_id = "gbm_grid1", 
  sort_by = "mse", 
  decreasing = FALSE
)
grid_perf

# Grab the model_id for the top model, chosen by validation error
best_model_id <- grid_perf@model_ids[[1]]
best_model <- h2o.getModel(best_model_id)

# Now let’s get performance metrics on the best model
h2o.performance(model = best_model, valid = TRUE)
best_model@parameters$ntrees


# building final model

h2o.final <- h2o.gbm(
  x = x,
  y = y,
  distribution = "poisson",
  training_frame = train.h2o,
  nfolds = 10,
  ntrees = 10000,
  learn_rate = 0.01,
  learn_rate_annealing = 0.99,
  max_depth = 3,
  min_rows = 1,
  sample_rate = 0.5,
  col_sample_rate = 0.8,
  stopping_rounds = 100,
  stopping_tolerance = 0,
  seed = 123
)
summary(h2o.final)

test.h2o <- as.h2o(covid_test)
predicted <- h2o.predict(h2o.final, newdata = test.h2o)


# function to calculate R2

rsq <- function (x, y) cor(x, y) ^ 2
rsq(test.h2o$Cases, predicted)

x <- as.data.frame(test.h2o$Cases) 
y <- as.data.frame(predicted)

plot(x$Cases,y$predict)

# saving results
results_cases <- as.data.frame(summary(h2o.final))
write.csv(results_cases, file = "importance_cases.csv")
h2o.final@parameters$ntrees

# plotting variable importance 

h2o.varimp_plot(h2o.final, num_of_features = 10)
pfun <- function(object, newdata) {
  as.data.frame(predict(object, newdata = as.h2o(newdata)))[[1L]]
}

### plotting results

ice <- h2o.final %>%
  pdp::partial(
    pred.var = "GDP", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = train.h2o,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = train.h2o, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Log (GDP)", y = "Predicted number of COVID-19 cases")

ice

ice1 <- h2o.final %>%
  pdp::partial(
    pred.var = "Tour", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = train.h2o,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = train.h2o, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Log (Number of tourists)", y = "Predicted number of COVID-19 cases")
ice1

ice2 <- h2o.final %>%
  pdp::partial(
    pred.var = "Lon", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = train.h2o,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = train.h2o, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Geographic longitude", y = "Predicted number of COVID-19 cases")
ice2





ggarrange(ice, ice1, ice2, ncol = 1) %>%
  ggexport(filename = "cases1.tiff", res = 600, width = 2500, height = 6000)
ggarrange(ice, ice1, ice2, ncol = 1) %>%
  ggexport(filename = "cases.jpeg", res = 600, width = 3000, height = 6000)



## ---- DEATHS -----


data_d <- data.frame(Deaths = data$Deaths,
                     Cases = log(data$Cases),
                     Age = data$Age,
                     Dens = log(data$Dens),
                     GDP = log(data$GDP),
                     Lat = data$Lat,
                     Lon = data$Lon,
                     Mig = data$Mig,
                     Time = log(data$Time),
                     Tour = log(data$Tour),
                     Urban = data$Urban)

covid_d <- data_d %>%
  filter(Deaths > 0)%>%
  filter(!is.na(Deaths))
str(covid_d)  

set.seed(123)
covid_d_split <- initial_split(covid_d, prop = .7)
covid_d_train <- data.frame(training(covid_d_split))
covid_d_test  <- data.frame(testing(covid_d_split))

#building h2o files
h2o.no_progress()
h2o.init(max_mem_size = "5g")
yd <- "Deaths"
xd <- setdiff(names(covid_d_train), yd)
train.h2o_d <- as.h2o(covid_d_train)
#initial model with default setting
h2o.fit1_d <- h2o.gbm(
  x = xd,
  y = yd,
  distribution = "poisson",
  training_frame = train.h2o_d,
  nfolds = 10,
  stopping_rounds = 10,
  stopping_tolerance = 0,
  seed = 123,
  ntrees = 10000
)

h2o.performance(h2o.fit1_d)
summary(h2o.fit1_d)
h2o.fit1_d
# model stopped after xx trees
h2o.fit1_d@parameters$ntrees
# cross validated RMSE
h2o.rmse(h2o.fit1_d, xval = TRUE)


# GRID SEARCHING FOR OPTIMAL PARAMETERS
split_d <- h2o.splitFrame(train.h2o_d, ratios = 0.7)
train_d <- split_d[[1]]
valid_d <- split_d[[2]]

hyper_grid_d <- list(
  max_depth = c(1, 3, 5),
  min_rows = c(1, 5, 10),
  learn_rate = c(0.001, 0.01, 0.1),
  learn_rate_annealing = c(.99, 1),
  sample_rate = c(.5, .75, 1),
  col_sample_rate = c(.8, .9, 1)
)

# perform grid search 
grid_d <- h2o.grid(
  algorithm = "gbm",
  grid_id = "gbm_grid_d",
  x = xd, 
  y = yd, 
  training_frame = train_d,
  validation_frame = valid_d,
  hyper_params = hyper_grid_d,
  ntrees = 10000,
  stopping_rounds = 10,
  stopping_tolerance = 0,
  seed = 123
)

# collect the results and sort by our model
# performance metric of choice
grid_perf_d <- h2o.getGrid(
  grid_id = "gbm_grid_d", 
  sort_by = "mse", 
  decreasing = FALSE
)
grid_perf_d

# Grab the model_id for the top model, chosen by validation error
best_model_id_d <- grid_perf_d@model_ids[[1]]
best_model_d <- h2o.getModel(best_model_id_d)

# Now let’s ge performance metrics on the best model
h2o.performance(model = best_model_d, valid = TRUE)
best_model_d@parameters$ntrees




h2o.final_d <- h2o.gbm(
  x = xg,
  y = yg,
  distribution = "gaussian",
  training_frame = train.h2o_d,
  nfolds = 10,
  ntrees = 10000,
  learn_rate = 0.1,
  learn_rate_annealing = 1,
  max_depth = 3,
  min_rows = 5,
  sample_rate = 0.75,
  col_sample_rate = 0.9,
  seed = 123
)


summary(h2o.final_d)
Deaths_res <- as.data.frame(summary(h2o.final_d))
write.csv(Deaths_res, file = "importance_deaths.csv")
h2o.final_d@parameters$ntrees


h2o.varimp_plot(h2o.final_d, num_of_features = 10)
pfun <- function(object, newdata) {
  as.data.frame(predict(object, newdata = as.h2o(newdata)))[[1L]]
}


test.h2o_d <- as.h2o(covid_d_test)
predicted_d<- h2o.predict(h2o.final_d, newdata = test.h2o_d)


rsq <- function (x, y) cor(x, y) ^ 2
rsq(test.h2o_d$Deaths, predicted_d)
plot(test.h2o_d$Deaths, predicted_d)
a <- as.data.frame(test.h2o_d$Deaths) 
b <- as.data.frame(predicted_d)
plot(a$Deaths,b$predict)



# plotting results

ice_d <- h2o.final_d %>%
  pdp::partial(
    pred.var = "GDP", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = covid_d_train,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = covid_d_train, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Log (GDP)", y = "Predicted number of deaths")

ice_d

ice1_d <- h2o.final_d %>%
  pdp::partial(
    pred.var = "Tour", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = covid_d_train,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = covid_d_train, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Log (Number of tourists)", y = "Predicted number of deaths")
ice1_d

ice2_d <- h2o.final_d %>%
  pdp::partial(
    pred.var = "Lon", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = covid_d_train,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = covid_d_train, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Geographic longitude", y = "Predicted number of deaths")
ice2_d


ice3_d <- h2o.final_d %>%
  pdp::partial(
    pred.var = "Cases", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = covid_d_train,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = covid_d_train, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Log (Number of cases)", y = "Predicted number of deaths")
ice3_d


ggarrange(ice1_d, ice3_d, ice_d,ice2_d, nrow = 2, ncol=2) %>%
  ggexport(filename = "deaths1.tiff", res = 600, 
           width = 5000, height = 4000)

ggarrange(ice1_d, ice3_d, ice_d,ice2_d, nrow = 2, ncol=2) %>%
  ggexport(filename = "deaths1.jpeg", res = 600, 
           width = 5000, height = 4000)




## ----------- GROWTH -----


data_g <- data.frame(Growth = data$Growth,
                     Age = data$Age,
                     Dens = log(data$Dens),
                     GDP = log(data$GDP),
                     Lat = data$Lat,
                     Lon = data$Lon,
                     Mig = data$Mig,
                     Time = log(data$Time),
                     Tour = log(data$Tour),
                     Urban = data$Urban)

covid_g <- data_g %>%
  filter(!is.na(Growth))
str(covid_g)  


set.seed(123)
covid_g_split <- initial_split(covid_g, prop = .7)
covid_g_train <- data.frame(training(covid_g_split))
covid_g_test  <- data.frame(testing(covid_g_split))

#building h2o files
h2o.no_progress()
h2o.init(max_mem_size = "5g")
yg <- "Growth"
xg <- setdiff(names(covid_g_train), yg)
train.h2o_g <- as.h2o(covid_g_train)
#initial model with default setting
h2o.fit1_g <- h2o.gbm(
  x = xg,
  y = yg,
  distribution = "gaussian",
  training_frame = train.h2o_g,
  nfolds = 10,
  stopping_rounds = 10,
  stopping_tolerance = 0,
  seed = 123,
  ntrees = 10000
)

h2o.performance(h2o.fit1_g)
summary(h2o.fit1_g)
h2o.fit1_g
# model stopped after xx trees
h2o.fit1_g@parameters$ntrees
# cross validated RMSE
h2o.rmse(h2o.fit1_g, xval = TRUE)


# GRID SEARCHING FOR OPTIMAL PARAMETERS
split_g <- h2o.splitFrame(train.h2o_g, ratios = 0.7)
train_g <- split_g[[1]]
valid_g <- split_g[[2]]

hyper_grid_g <- list(
  max_depth = c(1, 3, 5),
  min_rows = c(1, 5, 10),
  learn_rate = c(0.001, 0.01, 0.1),
  learn_rate_annealing = c(.99, 1),
  sample_rate = c(.5, .75, 1),
  col_sample_rate = c(.8, .9, 1)
)

# perform grid search 
grid_g <- h2o.grid(
  algorithm = "gbm",
  grid_id = "gbm_grid_g",
  x = xg, 
  y = yg, 
  training_frame = train_g,
  validation_frame = valid_g,
  hyper_params = hyper_grid_g,
  ntrees = 10000,
  stopping_rounds = 10,
  stopping_tolerance = 0,
  seed = 123
)

# collect the results and sort by our model
# performance metric of choice
grid_perf_g <- h2o.getGrid(
  grid_id = "gbm_grid_g", 
  sort_by = "mse", 
  decreasing = FALSE
)
grid_perf_g

# Grab the model_id for the top model, chosen by validation error
best_model_id_g <- grid_perf_g@model_ids[[1]]
best_model_g <- h2o.getModel(best_model_id_g)

# Now let’s ge performance metrics on the best model
h2o.performance(model = best_model_g, valid = TRUE)
best_model_g@parameters$ntrees




h2o.final_g <- h2o.gbm(
  x = xg,
  y = yg,
  distribution = "gaussian",
  training_frame = train.h2o_g,
  nfolds = 10,
  ntrees = 10000,
  learn_rate = 0.1,
  learn_rate_annealing = 1,
  max_depth = 3,
  min_rows = 5,
  sample_rate = 0.75,
  col_sample_rate = 0.9,
  seed = 123
)


summary(h2o.final_g)
growth_res <- as.data.frame(summary(h2o.final_g))
write.csv(growth_res, file = "importance_growth.csv")
h2o.final_g@parameters$ntrees


h2o.varimp_plot(h2o.final_g, num_of_features = 10)
pfun <- function(object, newdata) {
  as.data.frame(predict(object, newdata = as.h2o(newdata)))[[1L]]
}

# plotting results
# effect of time
ice_g_time <- h2o.final_g %>%
  pdp::partial(
    pred.var = "Time", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = covid_g_train,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Log (Time in days)", y = "Predicted change in the growth rate")

ice_g_time

ice1_g_gdp <- h2o.final_g %>%
  pdp::partial(
    pred.var = "GDP", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = covid_g_train,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Log (GDP)", y = "Predicted change in the growth rate")
ice1_g_gdp

ice2_g_lon <- h2o.final_g %>%
  pdp::partial(
    pred.var = "Lon", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = covid_g_train,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Geographic longitude", y = "Predicted change in the growth rate")
ice2_g_lon


ice3_g_dens <- h2o.final_g %>%
  pdp::partial(
    pred.var = "Dens", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = covid_g_train,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Log (Population density)", y = "Predicted change in the growth rate")
ice3_g_dens

ice4_g_age <- h2o.final_g %>%
  pdp::partial(
    pred.var = "Age", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = covid_g_train,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Median population age", y = "Predicted change in the growth rate")
ice4_g_age

ice5_g_lat <- h2o.final_g %>%
  pdp::partial(
    pred.var = "Lat", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = covid_g_train,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Geographic latitude", y = "Predicted change in the growth rate")
ice5_g_lat

ice6_g_tour <- h2o.final_g %>%
  pdp::partial(
    pred.var = "Tour", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = covid_g_train,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Log (Number of tourists)", y = "Predicted change in the growth rate")
ice6_g_tour

ice7_g_mig <- h2o.final_g %>%
  pdp::partial(
    pred.var = "Mig", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = covid_g_train,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Migration rate", y = "Predicted change in the growth rate")
ice7_g_mig

ice8_g_urb <- h2o.final_g %>%
  pdp::partial(
    pred.var = "Urban", 
    pred.fun = pfun,
    grid.resolution = 100, 
    train = covid_g_train,
    ice = TRUE
  ) %>%
  autoplot(rug = TRUE, train = covid_g_train, alpha = .1, center = TRUE) +
  theme_bw(base_size = 14)+
  theme(axis.text= element_text(size = 14))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(x = "Urbanization level", y = "Predicted change in the growth rate")
ice8_g_urb


ggarrange(ice_g_time, ice1_g_gdp, ice2_g_lon,
          ice3_g_dens,ice4_g_age, ice5_g_lat,
          ice6_g_tour, ice7_g_mig,ice8_g_urb,
          nrow = 3, ncol=3) %>%
  ggexport(filename = "growth2.tiff", res = 600, 
           width = 7500, height = 6000)


test.h2o_g <- as.h2o(covid_g_test)
predicted_g<- h2o.predict(h2o.final_g, newdata = test.h2o_g)



bias_g <- Metrics::percent_bias(actual = test.h2o_g$Growth, 
                              predicted =predicted_g)

bias_g
rsq <- function (x, y) cor(x, y) ^ 2
rsq(test.h2o_g$Growth, predicted_g)
plot(test.h2o_g$Growth, predicted_g)
a <- as.data.frame(test.h2o_g$Growth) 
b <- as.data.frame(predicted_g)

plot(a$Growth,b$predict)

## ----- IMPORTANCE -----
library(readxl)
Importance <- read_excel("Covid_19_2.xlsx", sheet = "Importance", 
                   na = "NA", n_max = 250)
View(Importance)

Importance<- Importance %>%
  mutate(Value = if_else(Importance >= 1, '>= 1%', '< 1%'))
  
import_plot <- ggplot(Importance, aes(x = X, y = Importance,
                                      fill = Value, col = Value))+
  theme_bw(base_size = 14)+
  geom_col(alpha = 0.7)+
  facet_wrap(~Variable, ncol =1)+
  scale_fill_manual(values = c("steelblue",'red'))+
  scale_color_manual(values = c("steelblue","red"))+
  theme(strip.background = element_rect(fill = '#e6f2f4'))+
  theme(strip.text = element_text(size = 14))+
  labs(x = "Explanatory variable", y = "Relative importance %")
import_plot

ggsave(import_plot, units = "cm", dpi= 600, width = 16,
       height = 19,
       filename = "Importance.tiff")