## 
##  S1 Text
##
#############################################################################
##
##
## SNPs analyses for the platelet responses using the proposed pipeline
##
## Example of selected platelet response is Pselectin release in response to ADP (PA) for finding its most significant associated SNPs by conseensus of the methods
## 
## 

## Loading the data
## Load the SNP data for Pselectin release in response to ADP (PA)
load_snp_pa <- read.csv("bloodomics_pa.csv", header=TRUE)

## Remove first column of rownames
snp_pa_new <- load_snp_pa[,2:ncol(load_snp_pa)]


## Loading packages and libraries
require(ridge)
require(randomForest)
require(leaps)
require(MASS)
require(glmnet)

## STEP ONE
## Use randomForest (RF) to reduce p (total number of SNPs) to k SNPs
## RF will be run as a regression model (mtry = p/3 as proposed by the RF creator)
## Importance of variables parameter should be set TRUE and number of trees, ntree = 3000; Though the model might be run iteratively from ntree=500, 1000, 2000 to monitor its performance, and significance ## of the intermediate methods.

snp.model.pa.rf_3000 <- randomForest(PselectinADP ~ ., snp_pa_new, importance = T, ntree=3000, mtry=(ncol(snp_pa_new)-1)/3)
print(snp.model.pa.rf_3000)

## Print the model to observe mean squared error values
plot(1:3000, snp.model.pa.rf_3000$mse, col="red", type="l", xlab="Number of trees", 
     ylab="Test MSE", main="MSE VS NUMBER OF TREE FOR PA MODEL ALL SNPs")

## Plot variable importance for the model with all p SNPs.
varImpPlot(snp.model.pa.rf_3000, sort=TRUE, n.var=min(40, nrow(snp.model.pa.rf1_3000$importance)), 
           scale=FALSE, type=1, main="Important SNPs for PA model")


## Evaluating model performance using the same training data
plot(predict(snp.model.pa.rf2_3000, newdata=snp_pa_new), snp_pa_new$PselectinADP, xlab="predicted PA", 
     ylab="actual PA", main="PA RF performance using all SNPs with 3000 trees")

## Evaluating model performance using out-of-box (OOB) data
plot(predict(snp.model.pa.rf2_3000), snp_pa_new$PselectinADP, xlab="predicted PA", ylab="actual PA",
     main="PA RF prediction performance using all SNPs with 3000 trees")


# Computing and extractiing important k (40) snps based on the MSE ranks
imp_snps_rf3000_pa <- importance(snp.model.pa.rf2_3000, type=1, scale=FALSE)

best40_pa3000 <- rownames(imp_snps_rf3000_pa)[order(imp_snps_rf3000_pa[, "%IncMSE"],decreasing = T)[1:40]]
cat(best40_pa3000)


# New dataset with k SNPs
snps40.pa3000 <- snp_pa_new[, c(best40_pa3000, "PselectinADP")]

## Re-run RF using selected k SNPs to observe the increased %variance of the model comapring to RF model with p SNPs
snps40.model.pa.rf_3000<- randomForest(PselectinADP ~ ., snps40.pa3000, importance = T, ntree=3000, mtry=(ncol(snps40.pa3000)-1)/3)
cat(snps40.model.pa.rf2_3000)

##Plot to visualise the performance of the RF model using k SNPs
plot(1:3000, snps40.model.pa.rf_3000$mse, col="red", type="l", xlab="Number of trees", 
     ylab="Test MSE", main="MSE VS NUMBER OF TREES FOR PA MODEL USING 40 SNPs")

## You can compare the performance of both RF models using the plots below	 
par(mfrow=c(2,2))
plot(predict(snp.model.pa.rf2_3000, newdata=snp_pa_new), snp_pa_new$PselectinADP, xlab="predicted PA", 
     ylab="actual PA", main="PA RF MODEL PERFORMANCE USING ALL SNPs")

plot(predict(snp.model.pa.rf2_3000), snp_pa_new$PselectinADP, xlab="predicted PA", ylab="actual PA",
     main="PA RF MODEL PERFORMANCE USING ALL SNPs")

plot(predict(snps40.model.pa.rf2_3000, newdata=snps40.pa3000), snps40.pa3000$PselectinADP, xlab="predicted PA", 
     ylab="actual PA", main="PA RF TRAINING PERFORMANCE USING 40 SNPs")

plot(predict(snps40.model.pa.rf2_3000), snps40.pa3000$PselectinADP, xlab="predicted PA", ylab="actual PA",
     main="PA RF PREDICT PERFORMANCE USING 40 SNPs")


##
## REGRESSION METHODS ENSEMBLE
##
 
# Run stepwise regression for k SNPs using 'leaps' package
subs_pa_rf3000 <- regsubsets(PselectinADP ~ ., data = snps40.pa3000, method = "forward")
summary(subs_pa_rf3000) # This shows potential significant SNPs before testing

# Identify exact SNP's significance from the stepwise model using F-Test 
# 
# lmfit_subs_pa_rf3000 <- lm(PselectinADP ~ SNPs(subs_pa_rf3000), data = snps40.pa3000)
# summary(lmfit_subs_pa_rf3000)


## Run ridge regression on k SNPs
## Applyting 'ridge' regression using Ridge package
mod_ridge_pa_3000 <- linearRidge(PselectinADP ~ ., data = snps40.pa3000)
summary(mod_ridge_pa_3000)

## Running LASSO on k SNPs
## Applying lasso using 'glmnet' package
geno_pa_rf3000 <- as.matrix(snps40.pa3000[,1:40])
pheno_pa_rf3000 <- snps40.pa3000[,41]
lasso_40_pa_rf3000 <- glmnet(geno_pa_rf3000, pheno_pa_rf3000)

## Using crosss validation to identify good candidate SNPs for partial F-Test to find the significant SNPs
cv_snps40_pa_rf3000 <- cv.glmnet(geno_pa_rf3000, pheno_pa_rf3000, alpha=1, nfolds = 10)
lambda_40_pa_rf3000 <- cv_snps40_pa_rf3000$lambda.min; # For smallest lambda
res_40_pa_rf3000 <- predict(lasso_40_pa_rf3000, s=lambda_40_pa_rf3000, type="coefficients")
cat(res_40_pa_rf3000) # Print out candidate SNPs for partial F-Test

## Run stepwise the partial F-test on snps from the lasso model
# lasso_fit_snps_pa_3000 <- lm(PselectinADP ~ SNPs(res_40_pa_rf3000), snps40.pa3000)
# summary(lasso_fit_snps_pa_3000)

##
## Boruta method with k SNPs

require(Boruta)

Bor.model.snps40.pa3000 <- Boruta(PselectinADP ~ ., maxRun = 100, data=snps40.pa3000)
cat(Bor.model.snps40.pa3000)

## Plot the Boruta model
par(mar=c(7.2,4.1,2.0,2.1))
plots(Bor.model.snps40.pa3000)

## Retrieve the confirmed important and Tentative (likely important) SNPs
Bor_snps_pa_imp3000 <- getSelectedAttributes(Bor.model.snps40.pa3000, withTentative = F)
Bor_snps_pa3000 <- getSelectedAttributes(Bor.model.snps40.pa3000, withTentative = T)
cat(Bor_snps_pa_imp3000) # Confirmed important (significant) SNPs
cat(Bor_snps_pa3000) # Confirmed important and tentative

## Compare all results froom different methods to identify the most significant SNPs by consensus
# summary(lmfit_subs_pa_rf3000) # stepwise model (Wald-test)
summary(mod_ridge_pa_3000) # Ridge regression (Wald-Test)
# summary(lasso_fit_snps_pa_3000) # Lasso using Partial F-test
cat(Bor_snps_pa_imp3000) # Boruta method