###########################SET DIRECTORY################################################# ###SET YOUR OWN DIRECTORY### ###You will need a directory with at least 100Go of memory to download many large rasters required to run the model### setwd("") ###Create a Figure folder in your directory ###Add Table S1 in your directory ##########################SET LIBRARIES################################################## library(parallel) library(doParallel) library(raster) library(randomForest) library(readxl) library(proj4) library(rgdal) library(gdalUtils) library(exactextractr) library("rnaturalearth") library("rnaturalearthdata") library(ggpubr) library(cowplot) library(rasterVis) library(RColorBrewer) library(colorRamps) library(caret) library(gridExtra) library("ggspatial") library(assignR) library(gstat) library(GSIF) library(sp) library("ranger") #########################################FIGURE 2################################################################### ###Load sulfur isotope data from compiled database (see supplementary material) d34S<-readxl::read_excel("Database_S1.xlsx",col_names=TRUE, na="NA", sheet="R") d34S$Long<-as.numeric(d34S$Long) d34S<-d34S[c("Lat","Long","d34S")] d34S<-aggregate(d34S,by=list(d34S$Lat,d34S$Long), FUN=mean,na.rm=TRUE) d34S_proj<-project(as.matrix(d34S[,c("Long","Lat")]), "+proj=eck4 +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +units=m +no_defs") ###Load world features world <- ne_countries(scale = "medium", returnclass = "sf") ###Make plot of sample distribution over the globe for different substrate p_d34S<-ggplot(data = world) + geom_sf(color = "darkgrey", fill = "white", size=0.5) + geom_point(data = d34S, aes(Lat, Long), shape=21, fill="green", size = 1)+ geom_point(data = d34S, aes(Lat, Long), shape=21, colour="black", stroke = 0.01)+ theme(plot.margin=unit(c(0,0,0,0), "cm")) + labs(x=NULL)+ labs(y=NULL)+ labs(tag = "A")+ theme_void() ###Location of the samples from the database### pdf("Map_location.pdf", width=12, height=6) grid.arrange(p_d34S) dev.off() ###########################################FIGURE3######################################### ###Density plot of pdf("density_plot.pdf", width=8, height=4) ggplot(d34S, aes(x=d34S)) + geom_density(alpha=.2,size=1,position = "stack")+ ###Create density function labs(x="d34S", y = "Density")+ theme_classic()+ xlim(-8, 21) dev.off() ############################################################################################################################################# #########################################################RANDOM FOREST REGRESSION############################################################ ############################################################################################################################################# ##########################INPUT OBSERVATION################################################## ###Input dataset ###Select the sheet desired for creating your random forest model change the sheet= to use another sheet. For example Sheet="plant" d34S_orig<-d34S ###Clean missing values d34S_orig_0<-d34S_orig[!is.na(d34S_orig$Lat),] #remove rows with missing XY coordinates d34S_orig_0<-d34S_orig_0[!is.na(d34S_orig_0$Long),] d34S_orig_1<-d34S_orig_0[!is.na(d34S_orig_0$d34S),] #remove rows with missing observed Sr data ###Project d34S data d34S_proj<-project(as.matrix(d34S_orig_1[,c("Lat","Long")]), "+proj=eck4 +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +units=m +no_defs") plot(d34S_proj) #################################INPUT GLOBAL COVARIATE RASTERS##################################################### ###See Table 1 for raster list and references ###The projected covariates are available at: https://drive.google.com/drive/folders/1g9rCGo3Kd3hz2o5JKkSbgNsGJclvsuQm?usp=sharing ###Use this link and download the Projected_rasters folder as a .zip file into your working directory (35GO) ###Unzip the file using the 7zip program https://www.7-zip.org/ which can handle large .zip file. ###Verify that the unzip folder has a name of "Projected_rasters" as below and if necessary rename the folder to the correct name ###Once this is done the following raster should load in your script setwd("D:\\clement_back-up_2017-11-14\\European_Isoscape\\Projected_rasters") r.mat=raster("D:\\Projected_rasters\\mat_reproj.tif") r.fert=raster("Projected_rasters\\nfert_reproj.tif") r.dust = raster("Projected_rasters\\dust_reproj.tif") r.map = raster("Projected_rasters\\map_reproj.tif") r.salt = raster("Projected_rasters\\salt_reproj.tif") r.ai = raster("Projected_rasters\\ai_reproj.tif") r.pet = raster("Projected_rasters\\pet_reproj.tif") r.elevation = raster("Projected_rasters\\elevation_reproj.tif") r.clay = raster("Projected_rasters\\rclay_reproj.tif") r.ph = raster("Projected_rasters\\rph_reproj.tif") r.cec = raster("Projected_rasters\\rcec_reproj.tif") r.bulk = raster("Projected_rasters\\rbulk_reproj.tif") r.age =raster("Projected_rasters\\basement_age_reproj.tif") r.m1 = raster("Projected_rasters\\rm1_reproj.tif") r.maxage_geol=raster("Projected_rasters\\agemax.tif") r.minage_geol=raster("Projected_rasters\\agemin.tif") r.meanage_geol=raster("Projected_rasters\\agemean.tif") r.sr=raster("D:\\Output\\rf_plantsoilmammal1.tif") r.bouger=raster("Projected_rasters\\bouger_reproj.tif") r.ssa=raster("Projected_rasters\\ssa.tif") r.ssaw=raster("Projected_rasters\\ssaw.tif") r.xx=raster("Projected_rasters\\xx.tif") r.mat<-raster("Projected_rasters\\mat_reproj.tif") r.distance<-raster("Projected_rasters\\distance.tif") ##########################EXTRACT RASTER VALUES AT OBSERVATIONS################################################## #extract from raw and transformed rasters Mode <- function(x) { ux <- unique(x) ux[which.max(tabulate(match(x, ux)))] } beginCluster() m1xy<-extract(r.m1, d34S_proj, method='simple', buffer=7000,fun=Mode, na.rm=TRUE) agexy<-extract(r.age, d34S_proj, method='simple', na.rm=TRUE) dustxy<-extract(r.dust, d34S_proj, method='bilinear',na.rm=TRUE) saltxy<-extract(r.salt, d34S_proj, method='bilinear', na.rm=TRUE) mapxy<-extract(r.map, d34S_proj, method='simple', buffer=7000,fun=mean, na.rm=TRUE) aixy<-extract(r.ai, d34S_proj, method='bilinear', buffer=7000,fun=mean, na.rm=TRUE) petxy<-extract(r.pet, d34S_proj, method='bilinear', buffer=7000,fun=mean,na.rm=TRUE) elevationxy<-extract(r.elevation,d34S_proj,method='simple',buffer=7000,fun=mean,na.rm=TRUE) clayxy<-extract(r.clay,d34S_proj, method='simple', buffer=7000,fun=Mode,na.rm=TRUE) phxy<-extract(r.ph,d34S_proj, method='simple', buffer=7000,fun=Mode,na.rm=TRUE) cecxy<-extract(r.cec,d34S_proj, method='simple', buffer=7000,fun=Mode,na.rm=TRUE) bulkxy<-extract(r.bulk,d34S_proj, method='simple', buffer=7000,fun=Mode,na.rm=TRUE) minage_geolxy<-extract(r.minage_geol,d34S_proj, method='simple', buffer=7000,fun=Mode, na.rm=TRUE) maxage_geolxy<-extract(r.maxage_geol,d34S_proj, method='simple', buffer=7000,fun=Mode,na.rm=TRUE) meanage_geolxy<-extract(r.meanage_geol,d34S_proj, buffer=7000,fun=Mode, method='simple', na.rm=TRUE) srxy=extract(r.sr,d34S_proj, method='simple', buffer=7000,fun=Mode, na.rm=TRUE) bougerxy=extract(r.bouger,d34S_proj, method='simple', buffer=7000,fun=mean, na.rm=TRUE) matxy=extract(r.mat,d34S_proj, method='bilinear', buffer=7000,fun=mean, na.rm=TRUE) fertxy=extract(r.fert,d34S_proj, method='bilinear', buffer=7000,fun=mean, na.rm=TRUE) #d18Oxy=extract(r.d18Oann,d34S_proj, method='simple', buffer=7000,fun=mean, na.rm=TRUE) ssaxy=extract(r.ssa,d34S_proj, method='bilinear', buffer=7000,fun=mean, na.rm=TRUE) ssawxy=extract(r.ssaw,d34S_proj, method='bilinear', buffer=7000,fun=mean, na.rm=TRUE) distancexy=extract(r.distance,d34S_proj, method='bilinear', buffer=7000,fun=mean, na.rm=TRUE) xxxy=extract(r.distance,d34S_proj, method='simple', buffer=7000,fun=Mode, na.rm=TRUE) endCluster() ### Append all extracted data into a summary table with the database d34S_proj_xy <- data.frame(d34S_orig_1$Lat,d34S_orig_1$Long, d34S_orig_1$d34S, d34S_proj,m1xy,agexy,dustxy,mapxy,saltxy, aixy,petxy,elevationxy,clayxy,phxy,cecxy,bulkxy, minage_geolxy,maxage_geolxy,meanage_geolxy, srxy,bougerxy, matxy,fertxy,ssaxy, ssawxy,distancexy,xxxy) ### Rename columns same as the rasters (important for running the final model) colnames(d34S_proj_xy)<-c("Latitude","Longitude","d34S", "X.1","Y.1","r.m1","r.age","r.dust","r.map", "r.salt","r.ai","r.pet","r.elevation", "r.clay","r.ph","r.cec","r.bulk", "r.minage_geol","r.maxage_geol","r.meanage_geol","r.sr","r.bouger", "r.mat","r.fert","r.ssa", "r.ssaw","r.distance","r.xx") ###Aggreatgate redundant lat/long d34S_proj_xy<-d34S_proj_xy[complete.cases(d34S_proj_xy),] d34S_agg<-aggregate(d34S_proj_xy,by=list(d34S_proj_xy$Latitude,d34S_proj_xy$Longitude), FUN=mean,na.rm=TRUE) ###Subset aggregated data keeping only lat long and covariates d34S_agg1<-subset(d34S_proj_xy, select=c("Latitude","Longitude","d34S", "r.m1","r.age","r.dust","r.map", "r.salt","r.ai","r.pet","r.elevation", "r.clay","r.ph","r.cec","r.bulk", "r.minage_geol","r.maxage_geol","r.meanage_geol","r.sr","r.bouger", "r.mat","r.fert","r.ssa", "r.ssaw","r.distance","r.xx")) write.csv(d34S_agg1,file="regression_matrix.csv") ###Project subsetted dataset d34S_agg_p <- project(as.matrix(d34S_proj_xy[,c("Longitude","Latitude")]), "+proj=eck4 +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +units=m +no_defs") ##########################DATA SPLITTING################################################## ###Select predictor using parallelized VSURF algorithm library(VSURF) names(d34S_agg1) set.seed(123) d34S_agg1.vsurf<-VSURF(d34S_agg1[,4:26],d34S_agg1$d34S, RFimplem = "ranger", parallel = TRUE, ncores = detectCores() - 1, clusterType = "PSOCK") d34S_agg1.vsurf$varselect.pred plot(d34S_agg1.vsurf) d34S_agg1.sub<-d34S_agg1[,4:26] d34S_agg_VSURF <- d34S_agg1.sub[c(d34S_agg1.vsurf$varselect.pred)] #automatic subsetting of selected variables d34S_agg_VSURF<-cbind(d34S_agg1[,1:3],d34S_agg_VSURF) d34S_agg_VSURF ###Check the variables selected by VSURF. ###Often VSURF still preserves some strong redundancies between variables that require some clean up. training<-d34S_agg_VSURF ###Parallelize random forest modeling cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS registerDoParallel(cluster) # Splitting the data for repeated cross validation fitControl <- trainControl(## 10-fold Crossvalidation method = "repeatedcv", number = 10, ## repeated ten times repeats = 5, verboseIter=FALSE , returnResamp="final", savePredictions="all", # With parallel backend allowParallel=TRUE ) set.seed(124) bestmtry <- tuneRF(training, training$d34S, stepFactor=1, improve=1e-7, ntree=500) mtry <- 6 tunegrid <- expand.grid(.mtry=mtry) metric<-"Accuracy" ###Random forest training RF_c <- train(d34S ~ r.ssaw+r.bouger+r.dust, data = training, method = "rf", importance=TRUE, tuneGrid=tunegrid,trControl= fitControl) RF_c ###Quantile random forest training qrf_c <- ranger(d34S ~ r.ssaw+r.bouger+r.dust, data = training, quantreg=TRUE, num.trees=500, seed=1) ###############################################FIGURE Cross-Validation#########################################################3 pdf("CV.pdf", width=6, height=6) par(mfrow = c(1,1)) plot(RF_c$pred$pred,RF_c$pred$obs,pch=15, cex=0.4, xlab="d34Smod",ylab="d34Sobs",cex.lab=1, cex.axis=1) lm2<-lm(RF_c$pred$obs~RF_c$pred$pred) abline(lm2) dev.off() ##########################FIGURE Variable Importance################################################## pdf("\Importance.pdf", width=6, height=6) par(mfrow=c(1,2)) varImpPlot(RF_c$finalModel,type=1) varImpPlot(RF_c$finalModel,type=2) #plot(RF$finalModel,main='Error vs No. of trees plot: Base Model') dev.off() #########################FIGURE Model residuals################################################## pred_RF_c_final<-predict(RF_c,d34S_agg1) residuals<-d34S_agg1$d34S -pred_RF_c_final train<-d34S_agg1 train$resid<-residuals train$absresid<-abs(residuals) train$pred<-pred_RF_c_final pdf("Residuals.pdf", width=6, height=4) par(mfrow = c(1,2)) plot(train$pred,train$resid,pch=15, cex=0.4, log="x") plot(train$pred,train$absresid,pch=15, cex=0.4, log="y") dev.off() ########################################FIGURE Partial Dependence Plot########################################## ###Combined plant, soil and local animals pdf("PD.pdf", width=6, height=6) par(mfrow = c(2,2)) par(oma = c(1,1,1,1)) partialPlot(RF_c$finalModel, training, x.var = "r.ssaw",main=NA) partialPlot(RF_c$finalModel, training, x.var = "r.dust",main=NA) partialPlot(RF_c$finalModel, training, x.var = "r.bouger",main=NA) dev.off() ##########################APPLY BEST MODEL SPATIALLY################################################## ###Create a raster stack with all predictors world_stack<-stack(r.ssaw,r.dust,r.bouger) names(world_stack)<-c("r.ssaw","r.dust","r.bouger") ###Clip raster stack to study area europe<-as.vector(c(-1500000,2500000,4000000,8000000)) europe_stack<-crop(world_stack, europe)*r.m1/r.m1 names(europe_stack)<-c("r.ssaw","r.dust","r.bouger") ####Create a grid to apply model extent europe.grid<-europe_stack$r.ssaw/europe_stack$r.ssaw ###Apply random forest model spatially rf2 <- predict(europe_stack, RF_c, ext=europe.grid, na.rm=TRUE, overwrite=TRUE, progress='text') ###Save the isoscape raster writeRaster(rf2, filename="rf_d34S", format="GTiff", overwrite=TRUE) ###Apply quantile random forest model spatially sp_europe_stack<-as(europe_stack, "SpatialPixelsDataFrame") sr.rfd_low <- predict(europe_stack2, qrf_c, ext=europe.grid, type="quantiles", quantiles=0.15, fun = function(model, ...) predict(model, ...)$predictions) sr.rfd_high <- predict(europe_stack2, qrf_c, ext=europe.grid, type="quantiles", quantiles=0.841, fun = function(model, ...) predict(model, ...)$predictions) ###Calculate uncertainty raster for random forest model sr.se<-sr.rfd_high-sr.rfd_low sr.se<-sr.se/2 writeRaster(sr.se, filename="Output\\srse", format="GTiff", overwrite=TRUE) ########################FIGURE Isoscape############################################ ###Create breakpoints for random forest prediction map data("wrld_simpl") wrld_simpl_p<-spTransform(wrld_simpl, crs(rf2)) wrld_simpl_p<-crop(wrld_simpl_p, extent(rf2)) breakpoints<-c(-4,-2,0,2,4,6,8,10,12,14,16,18,20,22) d34S_agg$Col <- matlab.like2(13)[as.numeric(cut(d34S_agg$d34S,breaks = breakpoints))] pdf("d34S_isoscape.pdf",width=7, height=5) par(mfrow=c(1,1)) plot(rf2, col= matlab.like2(13), breaks=breakpoints, axes = FALSE) #plot(wrld_simpl_p,add=TRUE) points(d34S_agg$X.1,d34S_agg$Y.1,pch=21,col="black",bg=d34S_agg$Col,cex=1.1) #scaleBar(crs(na.dH_Hobson2012$mean), "topright",cex=1, seg.len=2,box.color = NULL) dev.off()