#############################################################################
##
##  S1 Text
##
##  SNP analyses for the platelet responses using the proposed pipeline.
##  The example platelet response used here is P-selectin release in
##  response to ADP (PA); the aim is to find its most significantly
##  associated SNPs by consensus of the methods.
##
#############################################################################

## Loading the data
## Load the SNP data for P-selectin release in response to ADP (PA)
load_snp_pa <- read.csv("bloodomics_pa.csv", header = TRUE)

## Remove first column of row names
snp_pa_new <- load_snp_pa[, 2:ncol(load_snp_pa)]

## Loading packages and libraries
require(ridge)
require(randomForest)
require(leaps)
require(MASS)
require(glmnet)

## STEP ONE
## Use randomForest (RF) to reduce p (total number of SNPs) to k SNPs.
## RF is run as a regression model (mtry = p/3 as proposed by the RF creator).
## The variable-importance parameter should be set to TRUE and the number of
## trees to ntree = 3000; the model may also be run iteratively with
## ntree = 500, 1000, 2000 to monitor its performance and the significance
## of the intermediate methods.
snp.model.pa.rf_3000 <- randomForest(PselectinADP ~ ., snp_pa_new, importance = TRUE,
                                     ntree = 3000, mtry = (ncol(snp_pa_new) - 1) / 3)
print(snp.model.pa.rf_3000)   ## Print the model to observe mean squared error values
plot(1:3000, snp.model.pa.rf_3000$mse, col = "red", type = "l",
     xlab = "Number of trees", ylab = "Test MSE",
     main = "MSE VS NUMBER OF TREES FOR PA MODEL ALL SNPs")

## Plot variable importance for the model with all p SNPs
varImpPlot(snp.model.pa.rf_3000, sort = TRUE,
           n.var = min(40, nrow(snp.model.pa.rf_3000$importance)),
           scale = FALSE, type = 1, main = "Important SNPs for PA model")

## Evaluating model performance using the same training data
plot(predict(snp.model.pa.rf_3000, newdata = snp_pa_new), snp_pa_new$PselectinADP,
     xlab = "predicted PA", ylab = "actual PA",
     main = "PA RF performance using all SNPs with 3000 trees")

## Evaluating model performance using out-of-bag (OOB) data
plot(predict(snp.model.pa.rf_3000), snp_pa_new$PselectinADP,
     xlab = "predicted PA", ylab = "actual PA",
     main = "PA RF prediction performance using all SNPs with 3000 trees")

## Computing and extracting the important k (40) SNPs based on the %IncMSE ranks
imp_snps_rf3000_pa <- importance(snp.model.pa.rf_3000, type = 1, scale = FALSE)
best40_pa3000 <- rownames(imp_snps_rf3000_pa)[order(imp_snps_rf3000_pa[, "%IncMSE"],
                                                    decreasing = TRUE)[1:40]]
cat(best40_pa3000)

## New dataset with the k SNPs
snps40.pa3000 <- snp_pa_new[, c(best40_pa3000, "PselectinADP")]

## Re-run RF using the selected k SNPs to observe the increased %variance of the
## model compared with the RF model using all p SNPs
snps40.model.pa.rf_3000 <- randomForest(PselectinADP ~ ., snps40.pa3000, importance = TRUE,
                                        ntree = 3000, mtry = (ncol(snps40.pa3000) - 1) / 3)
print(snps40.model.pa.rf_3000)

## Plot to visualise the performance of the RF model using k SNPs
plot(1:3000, snps40.model.pa.rf_3000$mse, col = "red", type = "l",
     xlab = "Number of trees", ylab = "Test MSE",
     main = "MSE VS NUMBER OF TREES FOR PA MODEL USING 40 SNPs")

## You can compare the performance of both RF models using the plots below
par(mfrow = c(2, 2))
plot(predict(snp.model.pa.rf_3000, newdata = snp_pa_new), snp_pa_new$PselectinADP,
     xlab = "predicted PA", ylab = "actual PA",
     main = "PA RF TRAINING PERFORMANCE USING ALL SNPs")
plot(predict(snp.model.pa.rf_3000), snp_pa_new$PselectinADP,
     xlab = "predicted PA", ylab = "actual PA",
     main = "PA RF PREDICT PERFORMANCE USING ALL SNPs")
plot(predict(snps40.model.pa.rf_3000, newdata = snps40.pa3000), snps40.pa3000$PselectinADP,
     xlab = "predicted PA", ylab = "actual PA",
     main = "PA RF TRAINING PERFORMANCE USING 40 SNPs")
plot(predict(snps40.model.pa.rf_3000), snps40.pa3000$PselectinADP,
     xlab = "predicted PA", ylab = "actual PA",
     main = "PA RF PREDICT PERFORMANCE USING 40 SNPs")
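
## (Optional sketch, not part of the original pipeline) One way to quantify the gain
## from the RF-based reduction is to compare the out-of-bag %variance explained of the
## two forests directly; 'rsq' is the running pseudo R-squared that randomForest
## returns for regression models, so its last element corresponds to the full
## 3000-tree forest.
cat("All p SNPs  :", round(tail(snp.model.pa.rf_3000$rsq, 1) * 100, 2), "% variance explained\n")
cat("Top 40 SNPs :", round(tail(snps40.model.pa.rf_3000$rsq, 1) * 100, 2), "% variance explained\n")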
main="PA RF TRAINING PERFORMANCE USING 40 SNPs") plot(predict(snps40.model.pa.rf2_3000), snps40.pa3000$PselectinADP, xlab="predicted PA", ylab="actual PA", main="PA RF PREDICT PERFORMANCE USING 40 SNPs") ## ## REGRESSION METHODS ENSEMBLE ## # Run stepwise regression for k SNPs using 'leaps' package subs_pa_rf3000 <- regsubsets(PselectinADP ~ ., data = snps40.pa3000, method = "forward") summary(subs_pa_rf3000) # This shows potential significant SNPs before testing # Identify exact SNP's significance from the stepwise model using F-Test # # lmfit_subs_pa_rf3000 <- lm(PselectinADP ~ SNPs(subs_pa_rf3000), data = snps40.pa3000) # summary(lmfit_subs_pa_rf3000) ## Run ridge regression on k SNPs ## Applyting 'ridge' regression using Ridge package mod_ridge_pa_3000 <- linearRidge(PselectinADP ~ ., data = snps40.pa3000) summary(mod_ridge_pa_3000) ## Running LASSO on k SNPs ## Applying lasso using 'glmnet' package geno_pa_rf3000 <- as.matrix(snps40.pa3000[,1:40]) pheno_pa_rf3000 <- snps40.pa3000[,41] lasso_40_pa_rf3000 <- glmnet(geno_pa_rf3000, pheno_pa_rf3000) ## Using crosss validation to identify good candidate SNPs for partial F-Test to find the significant SNPs cv_snps40_pa_rf3000 <- cv.glmnet(geno_pa_rf3000, pheno_pa_rf3000, alpha=1, nfolds = 10) lambda_40_pa_rf3000 <- cv_snps40_pa_rf3000$lambda.min; # For smallest lambda res_40_pa_rf3000 <- predict(lasso_40_pa_rf3000, s=lambda_40_pa_rf3000, type="coefficients") cat(res_40_pa_rf3000) # Print out candidate SNPs for partial F-Test ## Run stepwise the partial F-test on snps from the lasso model # lasso_fit_snps_pa_3000 <- lm(PselectinADP ~ SNPs(res_40_pa_rf3000), snps40.pa3000) # summary(lasso_fit_snps_pa_3000) ## ## Boruta method with k SNPs require(Boruta) Bor.model.snps40.pa3000 <- Boruta(PselectinADP ~ ., maxRun = 100, data=snps40.pa3000) cat(Bor.model.snps40.pa3000) ## Plot the Boruta model par(mar=c(7.2,4.1,2.0,2.1)) plots(Bor.model.snps40.pa3000) ## Retrieve the confirmed important and Tentative (likely important) SNPs Bor_snps_pa_imp3000 <- getSelectedAttributes(Bor.model.snps40.pa3000, withTentative = F) Bor_snps_pa3000 <- getSelectedAttributes(Bor.model.snps40.pa3000, withTentative = T) cat(Bor_snps_pa_imp3000) # Confirmed important (significant) SNPs cat(Bor_snps_pa3000) # Confirmed important and tentative ## Compare all results froom different methods to identify the most significant SNPs by consensus # summary(lmfit_subs_pa_rf3000) # stepwise model (Wald-test) summary(mod_ridge_pa_3000) # Ridge regression (Wald-Test) # summary(lasso_fit_snps_pa_3000) # Lasso using Partial F-test cat(Bor_snps_pa_imp3000) # Boruta method