library(dismo) library(gbm) # set the root directory for his work. It should have the folders containing your input and output files. path.root <- 'D:/BRT/data/' tick<-read.csv(paste(path.root, "tick.csv", sep=''), header=TRUE) # make a formula. This is only useful for the gbm function in the gbm package. my.formula <- as.formula(paste('case', paste(colnames(tick)[c(3:43)], collapse='+'), sep='~')) # In the first stage, the split-and-fitting step was repeated for 10 times to screen important predictors. # Validation of the trained model using the test set is not performed in this stage. # Predictors that had relative contribution <2% for all bootstrap training sets are excluded from the next stage. # In the second stage, the split-and-fitting step was repeated for 100 times using the remaining predictors. n.iteration <- 10 i.var <- c(3:43) n.var <- length(i.var) rel.con<-preds.gbm.train<-preds.gbm.test<-preds.gbm.all<-NULL resp.curve <- NULL model.lst <- list() for(i in 1:n.iteration) { file.name <- paste(path.root, "train75/train", i, ".csv", sep='') train.data <- read.csv(file.name,header=TRUE) file.name <- paste(path.root, "test25/test", i, ".csv", sep='') test.data <- read.csv(file.name,header=TRUE) set.seed(10000+i) # set random seed so that results can be replicated; otherwise, you will get slightly different results each time you run the program #start.time<-Sys.time() gbm.train<-gbm.step(data=train.data,gbm.x=i.var, gbm.y=2, family="bernoulli", tree.complexity=5, learning.rate=0.005, bag.fraction=0.75, site.weights=train.data$weights, n.folds=10, max.trees = 3000, silent=TRUE, plot.main=FALSE) model.lst[[i]] <- gbm.train tmp <- summary(gbm.train) colnames(tmp) <- c('var', paste('rel.inf.', i, sep='')) if(is.null(rel.con)) rel.con <- tmp else rel.con <- merge(rel.con, tmp, by='var', sort=FALSE) tmp <- cbind(rep(i,nrow(train.data)), train.data[,c('IDDD','case')], predict.gbm(gbm.train, train.data, n.trees= gbm.train$n.trees, type="response")) colnames(tmp) <- c('round', 'IDDD', 'case', 'pred') if(is.null(preds.gbm.train)) preds.gbm.train <- tmp else preds.gbm.train <- rbind(preds.gbm.train, tmp) tmp <- cbind(rep(i,nrow(test.data)), test.data[,c('IDDD','case')], predict.gbm(gbm.train, test.data, n.trees= gbm.train$n.trees, type="response")) colnames(tmp) <- c('round', 'IDDD', 'case', 'pred') if(is.null(preds.gbm.test)) preds.gbm.test <- tmp else preds.gbm.test <- rbind(preds.gbm.test, tmp) tmp <- cbind(rep(i,nrow(sfts)), sfts[,c('IDDD','case')], predict.gbm(gbm.train, sfts, n.trees= gbm.train$n.trees, type="response")) colnames(tmp) <- c('round', 'IDDD', 'case', 'pred') if(is.null(preds.gbm.all)) preds.gbm.all <- tmp else preds.gbm.all <- rbind(preds.gbm.all, tmp) resp.curve.tmp <- NULL for(j in 1:n.var) { tmp <- plot(gbm.train, i.var = j, continuous.resolution = 150, return.grid = TRUE, type = "link") tmp[,2] <- tmp[,2] - mean(tmp[,2]) colnames(tmp)[2] <- paste('y', j, sep='') if (j==1) resp.curve.tmp <- cbind(rep(i, nrow(tmp)), 1:nrow(tmp), tmp) else resp.curve.tmp <- cbind(resp.curve.tmp, tmp) } colnames(resp.curve.tmp)[1:2] <- c('round', 'point') resp.curve <- rbind(resp.curve, resp.curve.tmp) } # raw output, which contains the results of all iterations write.csv(rel.con, file =paste(path.root, "preds/rel_con.csv", sep='')) # relative contributions write.csv(preds.gbm.train, file =paste(path.root, "preds/train.csv", sep='')) # predictions for training data write.csv(preds.gbm.test, file =paste(path.root, "preds/test.csv", sep='')) # predictions for test data write.csv(preds.gbm.all, file =paste(path.root, "preds/all.csv", sep='')) # predictions for all data write.csv(resp.curve, file =paste(path.root, "preds/resp_curve.csv", sep='')) # response curves # output average relative contributions, predicted probabilities, and response curves average.rel.con <- data.frame(var_name=rel.con$var, avg_rel_con=apply(rel.con[,-1], 1, mean)) average.preds.train <- aggregate(cbind(case,pred)~IDDD, data=preds.gbm.train, mean) average.preds.test <- aggregate(cbind(case,pred)~IDDD, data=preds.gbm.test, mean) average.preds.all <- aggregate(cbind(case,pred)~IDDD, data=preds.gbm.all, mean) write.csv(average.rel.con, file =paste(path.root, "preds/average_rel_con.csv", sep='')) write.csv(average.preds.train, file =paste(path.root, "preds/average_preds_train.csv", sep='')) write.csv(average.preds.test, file =paste(path.root, "preds/average_preds_test.csv", sep='')) write.csv(average.preds.all, file =paste(path.root, "preds/average_preds_all.csv", sep='')) save(model.lst, rel.con, preds.gbm.train, preds.gbm.test,preds.gbm.all, resp.curve, average.rel.con, average.preds.train, average.preds.test, average.preds.all, file=paste(path.root, "preds/BRT_Results.Rdata", sep=''))