###########################SET DIRECTORY#################################################
###SET YOUR OWN DIRECTORY###
###You will need a directory with at least 100Go of memory to download many large rasters required to run the model###

setwd("")
###Create a Figure folder in your directory
###Add Table S1 in your directory

##########################SET LIBRARIES##################################################
library(parallel)
library(doParallel)
library(raster)
library(randomForest)
library(readxl)
library(proj4)
library(rgdal)
library(gdalUtils)
library(exactextractr)
library("rnaturalearth")
library("rnaturalearthdata")
library(ggpubr)
library(cowplot)
library(rasterVis)
library(RColorBrewer)
library(colorRamps)
library(caret)
library(gridExtra)
library("ggspatial")
library(assignR)
library(gstat)
library(GSIF)
library(sp)
library("ranger")


#########################################FIGURE 2###################################################################

###Load sulfur isotope data from compiled database (see supplementary material)
d34S<-readxl::read_excel("Database_S1.xlsx",col_names=TRUE, na="NA", sheet="R")
d34S$Long<-as.numeric(d34S$Long)

d34S<-d34S[c("Lat","Long","d34S")]
d34S<-aggregate(d34S,by=list(d34S$Lat,d34S$Long), FUN=mean,na.rm=TRUE)

d34S_proj<-project(as.matrix(d34S[,c("Long","Lat")]), "+proj=eck4 +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +units=m +no_defs")


###Load world features
world <- ne_countries(scale = "medium", returnclass = "sf")

###Make plot of sample distribution over the globe for different substrate
p_d34S<-ggplot(data = world) +
  geom_sf(color = "darkgrey", fill = "white", size=0.5) +
  geom_point(data = d34S, aes(Lat, Long), shape=21,  fill="green", size = 1)+
  geom_point(data = d34S, aes(Lat, Long), shape=21,  colour="black", stroke = 0.01)+
  theme(plot.margin=unit(c(0,0,0,0), "cm")) +
  labs(x=NULL)+
  labs(y=NULL)+
  labs(tag = "A")+
  theme_void()


###Location of the samples from the database### 
pdf("Map_location.pdf", width=12, height=6)
grid.arrange(p_d34S)
dev.off()


###########################################FIGURE3#########################################

###Density plot of 
pdf("density_plot.pdf", width=8, height=4)
ggplot(d34S, aes(x=d34S)) + 
  geom_density(alpha=.2,size=1,position = "stack")+                           ###Create density function
  labs(x="d34S", y = "Density")+
  theme_classic()+
  xlim(-8, 21)
dev.off()


#############################################################################################################################################
#########################################################RANDOM FOREST REGRESSION############################################################
#############################################################################################################################################

##########################INPUT OBSERVATION##################################################
###Input dataset 

###Select the sheet desired for creating your random forest model change the sheet= to use another sheet. For example Sheet="plant"
d34S_orig<-d34S


###Clean missing values
d34S_orig_0<-d34S_orig[!is.na(d34S_orig$Lat),] #remove rows with missing XY coordinates
d34S_orig_0<-d34S_orig_0[!is.na(d34S_orig_0$Long),]
d34S_orig_1<-d34S_orig_0[!is.na(d34S_orig_0$d34S),] #remove rows with missing observed Sr data

###Project d34S data
d34S_proj<-project(as.matrix(d34S_orig_1[,c("Lat","Long")]), "+proj=eck4 +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +units=m +no_defs")
plot(d34S_proj)


#################################INPUT GLOBAL COVARIATE RASTERS#####################################################
###See Table 1 for raster list and references

###The projected covariates are available at: https://drive.google.com/drive/folders/1g9rCGo3Kd3hz2o5JKkSbgNsGJclvsuQm?usp=sharing
###Use this link and download the Projected_rasters folder as a .zip file into your working directory (35GO)
###Unzip the file using the 7zip program https://www.7-zip.org/ which can handle large .zip file. 
###Verify that the unzip folder has a name of "Projected_rasters" as below and if necessary rename the folder to the correct name
###Once this is done the following raster should load in your script

setwd("D:\\clement_back-up_2017-11-14\\European_Isoscape\\Projected_rasters")
r.mat=raster("D:\\Projected_rasters\\mat_reproj.tif")
r.fert=raster("Projected_rasters\\nfert_reproj.tif")
r.dust = raster("Projected_rasters\\dust_reproj.tif")
r.map = raster("Projected_rasters\\map_reproj.tif")
r.salt = raster("Projected_rasters\\salt_reproj.tif")
r.ai = raster("Projected_rasters\\ai_reproj.tif")
r.pet = raster("Projected_rasters\\pet_reproj.tif")
r.elevation = raster("Projected_rasters\\elevation_reproj.tif")
r.clay = raster("Projected_rasters\\rclay_reproj.tif")
r.ph = raster("Projected_rasters\\rph_reproj.tif")
r.cec = raster("Projected_rasters\\rcec_reproj.tif")
r.bulk = raster("Projected_rasters\\rbulk_reproj.tif")
r.age     =raster("Projected_rasters\\basement_age_reproj.tif")
r.m1 = raster("Projected_rasters\\rm1_reproj.tif")
r.maxage_geol=raster("Projected_rasters\\agemax.tif")
r.minage_geol=raster("Projected_rasters\\agemin.tif")
r.meanage_geol=raster("Projected_rasters\\agemean.tif")
r.sr=raster("D:\\Output\\rf_plantsoilmammal1.tif")
r.bouger=raster("Projected_rasters\\bouger_reproj.tif")
r.ssa=raster("Projected_rasters\\ssa.tif")
r.ssaw=raster("Projected_rasters\\ssaw.tif")
r.xx=raster("Projected_rasters\\xx.tif")
r.mat<-raster("Projected_rasters\\mat_reproj.tif")
r.distance<-raster("Projected_rasters\\distance.tif")


##########################EXTRACT RASTER VALUES AT OBSERVATIONS##################################################
#extract from raw and transformed rasters
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
beginCluster()
m1xy<-extract(r.m1, d34S_proj, method='simple', buffer=7000,fun=Mode, na.rm=TRUE)
agexy<-extract(r.age, d34S_proj, method='simple', na.rm=TRUE) 
dustxy<-extract(r.dust, d34S_proj, method='bilinear',na.rm=TRUE)
saltxy<-extract(r.salt, d34S_proj, method='bilinear', na.rm=TRUE)
mapxy<-extract(r.map, d34S_proj, method='simple', buffer=7000,fun=mean, na.rm=TRUE)
aixy<-extract(r.ai, d34S_proj, method='bilinear', buffer=7000,fun=mean, na.rm=TRUE)
petxy<-extract(r.pet, d34S_proj, method='bilinear', buffer=7000,fun=mean,na.rm=TRUE)
elevationxy<-extract(r.elevation,d34S_proj,method='simple',buffer=7000,fun=mean,na.rm=TRUE)
clayxy<-extract(r.clay,d34S_proj, method='simple', buffer=7000,fun=Mode,na.rm=TRUE)
phxy<-extract(r.ph,d34S_proj, method='simple', buffer=7000,fun=Mode,na.rm=TRUE)
cecxy<-extract(r.cec,d34S_proj, method='simple', buffer=7000,fun=Mode,na.rm=TRUE)
bulkxy<-extract(r.bulk,d34S_proj, method='simple', buffer=7000,fun=Mode,na.rm=TRUE)
minage_geolxy<-extract(r.minage_geol,d34S_proj, method='simple', buffer=7000,fun=Mode, na.rm=TRUE)
maxage_geolxy<-extract(r.maxage_geol,d34S_proj, method='simple', buffer=7000,fun=Mode,na.rm=TRUE)
meanage_geolxy<-extract(r.meanage_geol,d34S_proj, buffer=7000,fun=Mode, method='simple', na.rm=TRUE)
srxy=extract(r.sr,d34S_proj, method='simple', buffer=7000,fun=Mode, na.rm=TRUE)
bougerxy=extract(r.bouger,d34S_proj, method='simple', buffer=7000,fun=mean, na.rm=TRUE)
matxy=extract(r.mat,d34S_proj, method='bilinear', buffer=7000,fun=mean, na.rm=TRUE)
fertxy=extract(r.fert,d34S_proj, method='bilinear', buffer=7000,fun=mean, na.rm=TRUE)
#d18Oxy=extract(r.d18Oann,d34S_proj, method='simple', buffer=7000,fun=mean, na.rm=TRUE)
ssaxy=extract(r.ssa,d34S_proj, method='bilinear', buffer=7000,fun=mean, na.rm=TRUE)
ssawxy=extract(r.ssaw,d34S_proj, method='bilinear', buffer=7000,fun=mean, na.rm=TRUE)
distancexy=extract(r.distance,d34S_proj, method='bilinear', buffer=7000,fun=mean, na.rm=TRUE)
xxxy=extract(r.distance,d34S_proj, method='simple', buffer=7000,fun=Mode, na.rm=TRUE)
endCluster()

### Append all extracted data into a summary table with the database
d34S_proj_xy <- data.frame(d34S_orig_1$Lat,d34S_orig_1$Long, d34S_orig_1$d34S,
                           d34S_proj,m1xy,agexy,dustxy,mapxy,saltxy,
                           aixy,petxy,elevationxy,clayxy,phxy,cecxy,bulkxy,
                           minage_geolxy,maxage_geolxy,meanage_geolxy,
                           srxy,bougerxy,
                           matxy,fertxy,ssaxy, ssawxy,distancexy,xxxy)

### Rename columns same as the rasters (important for running the final model)
colnames(d34S_proj_xy)<-c("Latitude","Longitude","d34S",  
                          "X.1","Y.1","r.m1","r.age","r.dust","r.map",     
                          "r.salt","r.ai","r.pet","r.elevation", "r.clay","r.ph","r.cec","r.bulk",    
                          "r.minage_geol","r.maxage_geol","r.meanage_geol","r.sr","r.bouger",
                          "r.mat","r.fert","r.ssa", "r.ssaw","r.distance","r.xx")


###Aggreatgate redundant lat/long
d34S_proj_xy<-d34S_proj_xy[complete.cases(d34S_proj_xy),]
d34S_agg<-aggregate(d34S_proj_xy,by=list(d34S_proj_xy$Latitude,d34S_proj_xy$Longitude), FUN=mean,na.rm=TRUE)


###Subset aggregated data keeping only lat long and covariates
d34S_agg1<-subset(d34S_proj_xy, select=c("Latitude","Longitude","d34S",  
                                         "r.m1","r.age","r.dust","r.map",     
                                         "r.salt","r.ai","r.pet","r.elevation", "r.clay","r.ph","r.cec","r.bulk",    
                                         "r.minage_geol","r.maxage_geol","r.meanage_geol","r.sr","r.bouger",
                                         "r.mat","r.fert","r.ssa", "r.ssaw","r.distance","r.xx"))

write.csv(d34S_agg1,file="regression_matrix.csv")

###Project subsetted dataset
d34S_agg_p <- project(as.matrix(d34S_proj_xy[,c("Longitude","Latitude")]), "+proj=eck4 +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +units=m +no_defs")


##########################DATA SPLITTING##################################################
###Select predictor using parallelized VSURF algorithm
library(VSURF)
names(d34S_agg1)
set.seed(123)
d34S_agg1.vsurf<-VSURF(d34S_agg1[,4:26],d34S_agg1$d34S, RFimplem = "ranger", parallel = TRUE, ncores = detectCores() - 1, clusterType = "PSOCK") 
d34S_agg1.vsurf$varselect.pred
plot(d34S_agg1.vsurf)
d34S_agg1.sub<-d34S_agg1[,4:26]
d34S_agg_VSURF <- d34S_agg1.sub[c(d34S_agg1.vsurf$varselect.pred)] #automatic subsetting of selected variables
d34S_agg_VSURF<-cbind(d34S_agg1[,1:3],d34S_agg_VSURF)
d34S_agg_VSURF ###Check the variables selected by VSURF. 
###Often VSURF still preserves some strong redundancies between variables that require some clean up.

training<-d34S_agg_VSURF

###Parallelize random forest modeling
cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
registerDoParallel(cluster)

# Splitting the data for repeated cross validation 
fitControl <- trainControl(## 10-fold Crossvalidation
  method = "repeatedcv",
  number = 10,
  ## repeated ten times
  repeats = 5,
  verboseIter=FALSE ,
  returnResamp="final",
  savePredictions="all",
  # With parallel backend
  allowParallel=TRUE
)


set.seed(124)

bestmtry <- tuneRF(training, training$d34S, stepFactor=1, improve=1e-7, ntree=500)
mtry <- 6
tunegrid <- expand.grid(.mtry=mtry)
metric<-"Accuracy"

###Random forest training
RF_c <- train(d34S ~ r.ssaw+r.bouger+r.dust, data = training, method = "rf", importance=TRUE, tuneGrid=tunegrid,trControl= fitControl)
RF_c

###Quantile random forest training
qrf_c <- ranger(d34S ~ r.ssaw+r.bouger+r.dust, data = training, quantreg=TRUE, num.trees=500, seed=1)

###############################################FIGURE Cross-Validation#########################################################3
pdf("CV.pdf", width=6, height=6)
par(mfrow = c(1,1))
plot(RF_c$pred$pred,RF_c$pred$obs,pch=15, cex=0.4, xlab="d34Smod",ylab="d34Sobs",cex.lab=1, cex.axis=1)
lm2<-lm(RF_c$pred$obs~RF_c$pred$pred)
abline(lm2)
dev.off()


##########################FIGURE Variable Importance##################################################

pdf("\Importance.pdf", width=6, height=6)
par(mfrow=c(1,2))
varImpPlot(RF_c$finalModel,type=1)
varImpPlot(RF_c$finalModel,type=2)
#plot(RF$finalModel,main='Error vs No. of trees plot: Base Model')
dev.off()


#########################FIGURE Model residuals##################################################
pred_RF_c_final<-predict(RF_c,d34S_agg1)
residuals<-d34S_agg1$d34S -pred_RF_c_final
train<-d34S_agg1
train$resid<-residuals
train$absresid<-abs(residuals)
train$pred<-pred_RF_c_final

pdf("Residuals.pdf", width=6, height=4)
par(mfrow = c(1,2))
plot(train$pred,train$resid,pch=15, cex=0.4, log="x")
plot(train$pred,train$absresid,pch=15, cex=0.4, log="y")
dev.off()


########################################FIGURE Partial Dependence Plot##########################################
###Combined plant, soil and local animals
pdf("PD.pdf", width=6, height=6)
par(mfrow = c(2,2))
par(oma = c(1,1,1,1))
partialPlot(RF_c$finalModel, training, x.var = "r.ssaw",main=NA)
partialPlot(RF_c$finalModel, training, x.var = "r.dust",main=NA)
partialPlot(RF_c$finalModel, training, x.var = "r.bouger",main=NA)
dev.off()


##########################APPLY BEST MODEL SPATIALLY##################################################
###Create a raster stack with all predictors 
world_stack<-stack(r.ssaw,r.dust,r.bouger)
names(world_stack)<-c("r.ssaw","r.dust","r.bouger")

###Clip raster stack to study area
europe<-as.vector(c(-1500000,2500000,4000000,8000000))
europe_stack<-crop(world_stack, europe)*r.m1/r.m1
names(europe_stack)<-c("r.ssaw","r.dust","r.bouger")

####Create a grid to apply model extent
europe.grid<-europe_stack$r.ssaw/europe_stack$r.ssaw


###Apply random forest model spatially
rf2 <- predict(europe_stack, RF_c, ext=europe.grid, na.rm=TRUE, overwrite=TRUE, progress='text')

###Save the isoscape raster
writeRaster(rf2, filename="rf_d34S", format="GTiff", overwrite=TRUE)


###Apply quantile random forest model spatially
sp_europe_stack<-as(europe_stack, "SpatialPixelsDataFrame")
sr.rfd_low <- predict(europe_stack2, qrf_c, ext=europe.grid, type="quantiles", quantiles=0.15, fun = function(model, ...) predict(model, ...)$predictions)
sr.rfd_high <- predict(europe_stack2, qrf_c, ext=europe.grid, type="quantiles", quantiles=0.841, fun = function(model, ...) predict(model, ...)$predictions)

###Calculate uncertainty raster for random forest model
sr.se<-sr.rfd_high-sr.rfd_low
sr.se<-sr.se/2
writeRaster(sr.se, filename="Output\\srse", format="GTiff", overwrite=TRUE)

########################FIGURE Isoscape############################################
###Create breakpoints for random forest prediction map

data("wrld_simpl")
wrld_simpl_p<-spTransform(wrld_simpl, crs(rf2))
wrld_simpl_p<-crop(wrld_simpl_p, extent(rf2))


breakpoints<-c(-4,-2,0,2,4,6,8,10,12,14,16,18,20,22)
d34S_agg$Col <- matlab.like2(13)[as.numeric(cut(d34S_agg$d34S,breaks = breakpoints))]

pdf("d34S_isoscape.pdf",width=7, height=5)
par(mfrow=c(1,1))
plot(rf2, col= matlab.like2(13), breaks=breakpoints, axes = FALSE)
#plot(wrld_simpl_p,add=TRUE)
points(d34S_agg$X.1,d34S_agg$Y.1,pch=21,col="black",bg=d34S_agg$Col,cex=1.1)
#scaleBar(crs(na.dH_Hobson2012$mean), "topright",cex=1, seg.len=2,box.color = NULL)
dev.off()