library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
setwd("D:/PhDnurtion/papers/paper 5")
library(readxl) # Used to import the Excel file.
CIAF <- read_excel("finaldata.xls")
finaldata <- read_excel("finaldata.xls") # Second copy, kept for reference (not used below).
CIAF = as.data.frame(CIAF)
#View(CIAF) # Inspect the dataset in a spreadsheet view.
#CIAF <- CIAF[ which(CIAF$dhs_year==4), ]
############ Missing-value management ############
# 1. Identify whether there are any missing values.
# 2. Impute (or remove, if few) the missing values.
##################################################
#is.na(CIAF$zpopulation) # To check whether a value is missing.
# Replace each NA with the column mean for the continuous variables (mean imputation).
zvars <- c("zaridity1","zpopulation","zpreciptitation","zur","zdrought","zevi",
           "zirrigation","zlst","zmaxt","zmint","zpet","zelevition","zwetd","zu5_population")
for (v in zvars) CIAF[[v]][is.na(CIAF[[v]])] <- mean(CIAF[[v]], na.rm=TRUE)
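# Optional hedged check: count the NAs per imputed column (all zero after the loop above).
# colSums(is.na(CIAF[zvars]))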
########
# The CIAF outcome variable (CIAF_u5c) is already coded 0 and 1.
#tail(CIAF,10) # Show the last 10 rows of the dataset.
# To improve reproducibility, set the seed of the random number generator.
set.seed(123456) # Repeated runs then produce the same train/test split.
## With the same R version, the seed reproduces the split on any machine; note that
## R 3.6.0 changed the default sample() algorithm, so earlier versions may differ.
index = 1:nrow(CIAF) # Create an index vector of sequential row numbers, one per row of CIAF.
testindex = sample(index, trunc(length(index)/5)) #Take a sample of 20% of the observations from
#the index vector.
testset = CIAF[testindex, ] #Create a test (validation) dataset with 20% of the CIAF dataset.
trainset = CIAF[-testindex, ] #Create a training dataset with the remaining 80% of the data.
x_train = data.matrix(trainset[,c(6,7,10:73)]) # Take the features (x) from the training dataset.
y_train = as.numeric(trainset[, 74]) # Take the outcomes (y) from the training dataset.
x_test = data.matrix(testset[,c(6,7,10:73)]) # Take the features (x) from the testing/validation dataset.
y_test = as.numeric(testset[, 74]) # Take the outcomes (y) from the testing/validation dataset.
# You can use the dim() function to check the dimensions of each matrix
# (e.g., dim(x_train)).
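# A minimal hedged sanity check of the split, using the objects defined above:
# dim(x_train); dim(x_test)      # feature matrices: roughly 80% / 20% of the rows
# table(y_train); table(y_test)  # 0/1 outcome balance in each part
# caret (loaded above) could instead draw a class-stratified split, e.g.:
# idx <- caret::createDataPartition(CIAF$CIAF_u5c, p = 0.8, list = FALSE)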
# install.packages('glmnet', repos=getOption('repos')) # Install the latest version
# of `glmnet`. Only necessary once.
################################################
##OLS
################################################
library(glmnet) # Provides cv.glmnet() and glmnet() for penalized regression.
## Loading required package: Matrix
## Loaded glmnet 4.1-1
### Penalized linear models for the 0/1 CIAF outcome (family = gaussian)
ols.model = cv.glmnet(x_train, y_train, alpha=0, nfolds=10) # 10-fold cross-validation; coefficients are taken at s = 0 below to approximate unpenalized OLS.
lambda.min = ols.model$lambda.min # Save the lambda value which minimizes the cross-validated error.
glm_coef = round(coef(ols.model, s=0),2) # Coefficients at s = 0, i.e., the (near-)unpenalized OLS fit.
plot(ols.model) # Plot mean squared error against log(lambda).
plot(glmnet(x_train,y_train, family="gaussian", alpha=0),"lambda",label=T, main="") # Plot the coefficient paths against log(lambda).
abline(v=log(lambda.min), lty=3) # Add a vertical line to the preceding plot at log(lambda.min).
log(lambda.min)
## [1] -4.318666
##Ridge regression
ridge_model = cv.glmnet(x_train, y_train, alpha=0, nfolds=10) # 10-fold cross-validation of the ridge-penalized linear model (alpha = 0).
lambda.min = ridge_model$lambda.min # Save the lambda value which minimizes the cross-validated error.
glm_coef = round(coef(ridge_model, s=lambda.min),2) # Coefficients at lambda.min.
plot(ridge_model) # Plot mean squared error against log(lambda).
plot(glmnet(x_train,y_train, family="gaussian", alpha=0),"lambda",label=T, main="") # Plot the coefficient paths against log(lambda).
abline(v=log(lambda.min), lty=3) # Add a vertical line to the preceding plot at log(lambda.min).
log(lambda.min)
## [1] -4.504733
##Lasso regression
Lasso_model = cv.glmnet(x_train, y_train, alpha=1, nfolds=10) # 10-fold cross-validation of the lasso-penalized linear model (alpha = 1).
lambda.min = Lasso_model$lambda.min # Save the lambda value which minimizes the cross-validated error.
glm_coef = round(coef(Lasso_model, s=lambda.min),2) # Coefficients at lambda.min.
plot(Lasso_model) # Plot mean squared error against log(lambda).
plot(glmnet(x_train,y_train, family="gaussian", alpha=1),"lambda",label=T, main="") # Plot the coefficient paths against log(lambda).
abline(v=log(lambda.min), lty=3) # Add a vertical line to the preceding plot at log(lambda.min).
log(lambda.min)
## [1] -8.71451
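# Unlike ridge, the lasso shrinks some coefficients exactly to zero, so the saved
# coefficients show which predictors it retains (hedged example):
# glm_coef # entries printed as 0.00 were dropped by the lasso at lambda.min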
##Elastic Net regression
elastic_model = cv.glmnet(x_train, y_train, alpha=0.5, nfolds=10) # 10-fold cross-validation of the elastic-net-penalized linear model (alpha = 0.5).
lambda.min = elastic_model$lambda.min # Save the lambda value which minimizes the cross-validated error.
glm_coef = round(coef(elastic_model, s=lambda.min),2) # Coefficients at lambda.min.
plot(elastic_model) # Plot mean squared error against log(lambda).
plot(glmnet(x_train,y_train, family="gaussian", alpha=0.5),"lambda",label=T, main="") # Plot the coefficient paths against log(lambda); alpha now matches the elastic-net fit.
abline(v=log(lambda.min), lty=3) # Add a vertical line to the preceding plot at log(lambda.min).
log(lambda.min)
## [1] -7.835295
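# The penalties can be compared on their cross-validated error at the best lambda;
# a hedged sketch using the cvm component of each cv.glmnet fit:
# sapply(list(ridge=ridge_model, lasso=Lasso_model, elastic=elastic_model),
#        function(m) min(m$cvm)) # smallest mean cross-validated MSE per model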
#
# Create a vector of test-set predictions for OLS, ridge, lasso, and elastic net.
# Note: predict.cv.glmnet defaults to s = "lambda.1se", so s is given explicitly.
ols_pred = round(predict(ols.model, x_test, s=0, type="response"),3) # s = 0 approximates OLS.
ridge_pred = round(predict(ridge_model, x_test, s="lambda.min", type="response"),3)
Lasso_pred = round(predict(Lasso_model, x_test, s="lambda.min", type="response"),3)
Elastic_pred = round(predict(elastic_model, x_test, s="lambda.min", type="response"),3)
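# Before thresholding, the continuous predictions can be compared directly on the
# test set; a hedged sketch of the test-set mean squared error for each model:
# mse <- function(p) mean((y_test - as.numeric(p))^2)
# sapply(list(ols=ols_pred, ridge=ridge_pred, lasso=Lasso_pred, elastic=Elastic_pred), mse)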
require(nnet) # Load the nnet package (single-hidden-layer neural networks) into this R session.
## Loading required package: nnet
nnet_model = nnet(x_train, y_train, size=5) #Fit a single-layer neural network to the data with 5 units in the hidden layer.
## # weights: 341
## initial value 6298.936357
## iter 10 value 5599.590717
## iter 20 value 5338.083270
## iter 30 value 5223.247933
## iter 40 value 5197.695576
## iter 50 value 5162.223415
## iter 60 value 5138.344666
## iter 70 value 5125.601365
## iter 80 value 5118.571866
## iter 90 value 5101.374015
## iter 100 value 5089.557056
## final value 5089.557056
## stopped after 100 iterations
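# nnet() fits this 0/1 outcome by least squares by default; a likelihood-based
# (cross-entropy) fit would be a possible alternative (hedged sketch):
# nnet_model2 <- nnet(x_train, y_train, size=5, entropy=TRUE)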
###Prediction vector for the neural network.
nnet_pred = round(predict(nnet_model, x_test, type="raw"),3)
#install.packages("rpart.plot")
library(rpart)
library(rpart.plot)
train_data=data.frame(y_train,x_train)
fit <- rpart(y_train ~ ., data = train_data, method = "class") # Fit a classification tree (CART) to the training data.
#rpart.plot(fit, extra = 106)
x_test=data.frame(x_test)
# Prediction vector for the classification tree (labelled "RF" in the comparison below):
RF_pred <- round(predict(fit, x_test)[, "1"], 3) # keep the predicted probability of class 1
# Collect the six prediction vectors into a data frame.
predictions = data.frame(y_test, ols_pred,ridge_pred,Lasso_pred, Elastic_pred,nnet_pred,RF_pred)
names(predictions) = c("CIAF", "ols","ridge","lasso","Elastic","nnet","RF" )
#Name the columns of the dataframe.
head(predictions,10)#list the first 10 predicted observations
## CIAF ols ridge lasso Elastic nnet RF
## 19516 1 0.601 0.607 0.618 0.621 0.644 0.609
## 16618 1 0.329 0.323 0.337 0.328 0.224 0.267
## 25642 0 0.349 0.350 0.355 0.353 0.227 0.372
## 22385 1 0.643 0.646 0.620 0.627 0.705 0.609
## 23678 1 0.355 0.349 0.341 0.339 0.223 0.267
## 12999 0 0.692 0.698 0.695 0.702 0.653 0.609
## 2285 0 0.488 0.489 0.495 0.490 0.348 0.609
## 6326 0 0.588 0.585 0.588 0.580 0.707 0.609
## 10982 1 0.195 0.192 0.215 0.204 0.244 0.372
## 19530 1 0.611 0.613 0.643 0.642 0.493 0.609
tail(predictions)#list the last six predicted values
## CIAF ols ridge lasso Elastic nnet RF
## 17967 1 0.623 0.624 0.621 0.627 0.618 0.609
## 17041 0 0.525 0.526 0.552 0.550 0.525 0.609
## 3662 1 0.648 0.653 0.673 0.674 0.656 0.609
## 23267 1 0.568 0.571 0.565 0.569 0.687 0.372
## 920 0 0.583 0.583 0.576 0.576 0.478 0.609
## 14080 1 0.265 0.255 0.262 0.243 0.221 0.267
write.csv(predictions, file = "datasas1.csv") # Export the predictions to CSV for further analysis.
library(readr)
Hsas <- read_csv("Rcodes/Hsas.csv") # Read the (externally processed) prediction file back in.
##
## -- Column specification --------------------------------------------------------
## cols(
## CIAF = col_double(),
## OLS = col_double(),
## ridge = col_double(),
## lasso = col_double(),
## Elastic = col_double(),
## nnet = col_double(),
## RF = col_double()
## )
# Round each model's predicted probability to a 0/1 class label (0.5 threshold).
ols=round(Hsas$OLS,0)
ridge=round(Hsas$ridge,0)
lasso=round(Hsas$lasso,0)
Elastic=round(Hsas$Elastic,0)
nnet=round(Hsas$nnet,0)
RF=round(Hsas$RF,0)
CIAF_obs = Hsas$CIAF # Observed outcomes, kept under a new name so the CIAF data frame is not overwritten.
dataco=data.frame(CIAF=CIAF_obs,ols,ridge,lasso,Elastic,nnet,RF)
head(dataco)
## CIAF ols ridge lasso Elastic nnet RF
## 1 1 0 1 1 1 1 1
## 2 1 0 1 1 1 1 1
## 3 1 0 1 1 1 1 1
## 4 1 0 1 1 1 1 1
## 5 1 0 1 1 1 1 1
## 6 1 0 1 1 1 1 1
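# pROC is loaded at the top but not otherwise used; the thresholded comparison below
# could be complemented with AUCs computed from the continuous predictions
# (hedged sketch; column names as in Hsas.csv):
# pROC::auc(pROC::roc(Hsas$CIAF, Hsas$OLS))
# pROC::auc(pROC::roc(Hsas$CIAF, Hsas$RF))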
confusionMatrix(as.factor(dataco$ols),as.factor(dataco$CIAF)) # Confusion matrix for the OLS predictions.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1658 2495
## 1 1177 507
##
## Accuracy : 0.3709
## 95% CI : (0.3585, 0.3834)
## No Information Rate : 0.5143
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.2431
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5848
## Specificity : 0.1689
## Pos Pred Value : 0.3992
## Neg Pred Value : 0.3011
## Prevalence : 0.4857
## Detection Rate : 0.2841
## Detection Prevalence : 0.7115
## Balanced Accuracy : 0.3769
##
## 'Positive' Class : 0
##
confusionMatrix(as.factor(dataco$ridge),as.factor(dataco$CIAF)) # Confusion matrix for the ridge predictions.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1427 662
## 1 1408 2340
##
## Accuracy : 0.6454
## 95% CI : (0.6329, 0.6576)
## No Information Rate : 0.5143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2849
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5034
## Specificity : 0.7795
## Pos Pred Value : 0.6831
## Neg Pred Value : 0.6243
## Prevalence : 0.4857
## Detection Rate : 0.2445
## Detection Prevalence : 0.3579
## Balanced Accuracy : 0.6414
##
## 'Positive' Class : 0
##
confusionMatrix(as.factor(dataco$lasso),as.factor(dataco$CIAF)) # Confusion matrix for the lasso predictions.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1373 559
## 1 1462 2443
##
## Accuracy : 0.6538
## 95% CI : (0.6414, 0.666)
## No Information Rate : 0.5143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3008
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4843
## Specificity : 0.8138
## Pos Pred Value : 0.7107
## Neg Pred Value : 0.6256
## Prevalence : 0.4857
## Detection Rate : 0.2352
## Detection Prevalence : 0.3310
## Balanced Accuracy : 0.6490
##
## 'Positive' Class : 0
##
confusionMatrix(as.factor(dataco$Elastic),as.factor(dataco$CIAF)) # Confusion matrix for the elastic-net predictions.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1373 596
## 1 1462 2406
##
## Accuracy : 0.6474
## 95% CI : (0.635, 0.6597)
## No Information Rate : 0.5143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2882
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4843
## Specificity : 0.8015
## Pos Pred Value : 0.6973
## Neg Pred Value : 0.6220
## Prevalence : 0.4857
## Detection Rate : 0.2352
## Detection Prevalence : 0.3373
## Balanced Accuracy : 0.6429
##
## 'Positive' Class : 0
##
confusionMatrix(as.factor(dataco$nnet),as.factor(dataco$CIAF)) # Confusion matrix for the neural-network predictions.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1414 646
## 1 1421 2356
##
## Accuracy : 0.6459
## 95% CI : (0.6335, 0.6582)
## No Information Rate : 0.5143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2857
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4988
## Specificity : 0.7848
## Pos Pred Value : 0.6864
## Neg Pred Value : 0.6238
## Prevalence : 0.4857
## Detection Rate : 0.2422
## Detection Prevalence : 0.3529
## Balanced Accuracy : 0.6418
##
## 'Positive' Class : 0
##
confusionMatrix(as.factor(dataco$RF),as.factor(dataco$CIAF)) # Confusion matrix for the tree-based (RF) predictions.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1485 544
## 1 1350 2458
##
## Accuracy : 0.6755
## 95% CI : (0.6633, 0.6875)
## No Information Rate : 0.5143
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3453
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5238
## Specificity : 0.8188
## Pos Pred Value : 0.7319
## Neg Pred Value : 0.6455
## Prevalence : 0.4857
## Detection Rate : 0.2544
## Detection Prevalence : 0.3476
## Balanced Accuracy : 0.6713
##
## 'Positive' Class : 0
##
## Test-set accuracy: OLS 0.371, ridge 0.645, lasso 0.654, elastic net 0.647,
## nnet 0.646, RF 0.676. The RF model has the highest accuracy and kappa and is
## selected as the best model.
################################################
## Random forest (selected model)
################################################
## Based on the confusion-matrix statistics above, the random forest is selected.
## The trainset/testset split created earlier (set.seed(123456)) is reused here.
library(ggplot2)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
train_data=trainset[,c(6,7,10:73,74)] # Training data: the features plus the outcome (column 74, CIAF_u5c).
test_data=testset[,c(6,7,10:73,74)] # Test data: the same columns taken from the test set.
rf <- randomForest(CIAF_u5c ~ ., data = train_data, importance=TRUE) # Fit a random forest; importance=TRUE stores variable importance.
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
print(rf)
##
## Call:
## randomForest(formula = CIAF_u5c ~ ., data = train_data, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 22
##
## Mean of squared residuals: 0.2180689
## % Var explained: 12.58
attributes(rf) # List the components of the fitted randomForest object.
## $names
## [1] "call" "type" "predicted" "mse"
## [5] "rsq" "oob.times" "importance" "importanceSD"
## [9] "localImportance" "proximity" "ntree" "mtry"
## [13] "forest" "coefs" "y" "test"
## [17] "inbag" "terms"
##
## $class
## [1] "randomForest.formula" "randomForest"
rf$confusion # NULL for a regression forest; see the sketch below.
## NULL
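# rf$confusion is NULL because the forest above runs in regression mode (numeric 0/1
# response). As the warning suggests, converting the response to a factor would fit a
# classification forest, which does populate $confusion (hedged sketch):
# rf_cls <- randomForest(as.factor(CIAF_u5c) ~ ., data=train_data, importance=TRUE)
# rf_cls$confusion # out-of-bag confusion matrix for the classification forest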