--- title: "Calculate GP metrics" author: "Zhanyou Xu" date: "JUne 9, 2020" output: word_document --- # predict.lm() in a loop. warning: prediction from a rank-deficient fit may be misleading # reasons and solution link: https://stackoverflow.com/questions/26558631/predict-lm-in-a-loop-warning-prediction-from-a-rank-deficient-fit-may-be-mis ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) library(psych); library(e1071); library(caret) ``` # Read in the data ```{r} filePath_name_IDC = "~/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/re_submitted supplementaldata and R codes/S03.csv" d1000 = read.csv(filePath_name_IDC) d1000[1:3, 1:5] d1000_class= d1000[, grep("true_decision|Marker", names(d1000))] dim(d1000_class) d1000_class[1:3, 1:15] ``` # AUC using rrBLUP ```{r, warning=0} library(rrBLUP) d1000 = read.csv(filePath_name_IDC) d1000[1:3, 1:5] #d1000_Manual[1:3, 1:10] set.seed(300); shuffled=d1000[ sample(1000,1000),] shuffled$true_decision = as.numeric(shuffled$true_decision) shuffled_X= shuffled[, grep("marker", names(shuffled), ignore.case = T)] #dim(shuffled) N_folder=10 accuracyList_rrBLUP_Raw=rep(0, N_folder) IDC_score_all_1000 =as.numeric(as.character( shuffled$true_decision)) rrBLUP.model_all = mixed.solve(IDC_score_all_1000, Z=as.matrix(shuffled_X), K=NULL, SE = FALSE, return.Hinv=FALSE) marker_effects_all_rrBLUP = rrBLUP.model_all$u # effects for 1200 markers (beta_all_rrBLUP=rrBLUP.model_all$beta) marker_effects_all_rrBLUP_matrix= as.matrix(marker_effects_all_rrBLUP) pred_by_rrBLUP_all_rrBLUPs= as.matrix(shuffled_X) %*% marker_effects_all_rrBLUP_matrix pred_by_rrBLUP_all_rrBLUPs=data.frame(pred_by_rrBLUP_all_rrBLUPs) pred_by_rrBLUP_all_rrBLUPs$lineIDs = rownames(pred_by_rrBLUP_all_rrBLUPs) pred_by_rrBLUP_all_rrBLUPs$IDC.Scores= shuffled$true_decision pred_by_rrBLUP_all_rrBLUPs$resids = pred_by_rrBLUP_all_rrBLUPs$IDC.Scores - pred_by_rrBLUP_all_rrBLUPs$pred_by_rrBLUP_all_rrBLUPs names(pred_by_rrBLUP_all_rrBLUPs) model_GLM_for_rrBLUP = glm(IDC.Scores~ resids , data =pred_by_rrBLUP_all_rrBLUPs, family = "binomial" ) pred_all_rrblup = model_GLM_for_rrBLUP$fitted.values library(ROCR) pred_forROC_rrBLUP_all = prediction(pred_all_rrblup, shuffled$true_decision) evale_rrBLUP_all = performance(pred_forROC_rrBLUP_all, "acc") plot(evale_rrBLUP_all) # calculate the AUC AUC_rrBLUP_all = performance(pred_forROC_rrBLUP_all, "auc") AUC_rrBLUP_all = unlist(slot(AUC_rrBLUP_all, "y.values")) AUC_rrBLUP_all= round(AUC_rrBLUP_all, 4) AUC_rrBLUP_all ROC_rrBLUP_all = performance(pred_forROC_rrBLUP_all, "tpr", "fpr", xlab = "FPR", ylab = "TPR", main = paste0("rrBLUP: ", AUC_rrBLUP_all)) par(cex.axis=1.8, cex.lab = 1.5, cex.axis=1.5) plot(ROC_rrBLUP_all, colorize = T, lwd= 5, main =paste0("rrBLUP: ", AUC_rrBLUP_all), cex.main=2, xlab="FPR", ylab = "TPR") abline(a=0, b=1, lwd=2) ``` # Naive Bayes with 10-folder cross-validation ```{r} N_folder=10 set.seed(100) set.seed(300);shuffled=d1000_class[ sample(1000,1000),] shuffled$IDC_class = as.factor(shuffled$true_decision) shuffled$true_decision= as.factor(as.character(shuffled$true_decision)) #shuffled=IDC_5007_Data_Partial dim(shuffled) head(shuffled)[1:3, 1:10] naiveBayes.model_allLines=naiveBayes(IDC_class~., data = shuffled, laplace = 1) pred_NB_all_probs=predict(naiveBayes.model_allLines, shuffled_X, type = "raw" ) pred_NB_all_probs = as.data.frame(pred_NB_all_probs); names(pred_NB_all_probs) =c("R", "S") pred_for_ROC_NB_all = ROCR::prediction(pred_NB_all_probs$S, shuffled$IDC_class) acc_NB_all = 
# Naive Bayes with 10-fold cross-validation

```{r}
N_folder = 10
set.seed(300); shuffled = d1000_class[sample(1000, 1000), ]
shuffled$IDC_class = as.factor(shuffled$true_decision)
# Drop the original label column so it is not reused as a predictor
shuffled = subset(shuffled, select = -true_decision)
#shuffled=IDC_5007_Data_Partial
dim(shuffled)
head(shuffled)[1:3, 1:10]

# Fit Naive Bayes on all lines, then score the class probabilities
naiveBayes.model_allLines = naiveBayes(IDC_class ~ ., data = shuffled, laplace = 1)
pred_NB_all_probs = predict(naiveBayes.model_allLines, shuffled_X, type = "raw")
pred_NB_all_probs = as.data.frame(pred_NB_all_probs)
names(pred_NB_all_probs) = c("R", "S")

pred_for_ROC_NB_all = ROCR::prediction(pred_NB_all_probs$S, shuffled$IDC_class)
acc_NB_all = performance(pred_for_ROC_NB_all, "acc")
plot(acc_NB_all)

ROC_NB_all = performance(pred_for_ROC_NB_all, "tpr", "fpr")
plot(ROC_NB_all); abline(a = 0, b = 1)

AUC_NB_all_data = performance(pred_for_ROC_NB_all, "auc")
AUC_NB_all_data = unlist(slot(AUC_NB_all_data, "y.values"))
AUC_NB_all_data = round(AUC_NB_all_data, 4)
AUC_NB_all_data

plot(ROC_NB_all, colorize = T, main = paste0("Naive Bayes: ", AUC_NB_all_data))
abline(a = 0, b = 1)

# Side-by-side comparison of the rrBLUP and Naive Bayes ROC curves
par(mfrow = c(1, 2), cex.axis = 1.5, cex.lab = 1.5)
plot(ROC_rrBLUP_all, colorize = T, lwd = 5,
     main = paste0("rrBLUP: ", AUC_rrBLUP_all), cex.main = 1.5,
     xlab = "FPR", ylab = "TPR"); abline(a = 0, b = 1, lwd = 2)
plot(ROC_NB_all, colorize = T, lwd = 5,
     main = paste0("Naive Bayes: ", AUC_NB_all_data), cex.main = 1.5,
     xlab = "", ylab = "TPR"); abline(a = 0, b = 1, lwd = 2)
```

# Logistic regression with 10-fold cross-validation

```{r}
accuracyList = rep(0, N_folder)
d3_total = data.frame()
d4_total = data.frame()

for (i in 1:N_folder) {
  # These indices indicate the interval of the test set
  if (i < N_folder) {
    indices <- (((i-1) * round((1/N_folder) * nrow(shuffled))) + 1):(i * round((1/N_folder) * nrow(shuffled)))
  } else {
    indices <- (((i-1) * round((1/N_folder) * nrow(shuffled))) + 1):nrow(shuffled)
  }

  # Exclude the test interval from the train set and fit the model
  train <- shuffled[-indices, ]
  logistic.model = glm(IDC_class ~ ., data = train, family = "binomial",
                       control = list(maxit = 100))
  print(logistic.model$rank)

  # The fit can be rank-deficient; drop predictors with NA coefficients and
  # refit (see the Stack Overflow link at the top of this document)
  list_of_nas_Logistic_temp = logistic.model$coefficients[is.na(logistic.model$coefficients)]
  list_of_nas_Logistic_temp = as.data.frame(list_of_nas_Logistic_temp)
  list_of_nas_Logistic_temp = rownames(list_of_nas_Logistic_temp)
  train = train[, !colnames(train) %in% list_of_nas_Logistic_temp]
  dim(train)
  logistic.model = glm(IDC_class ~ ., data = train, family = "binomial",
                       control = list(maxit = 100))
  #plot(svm.model1, train, SY2444AQ~SY2463AQ)
  #print(dim(train))

  # Include them in the test set
  test <- shuffled[indices, ]; dim(test)
  test = test[, !colnames(test) %in% list_of_nas_Logistic_temp]; dim(test)
  test_X = subset(test, select = -IDC_class)
  pred_by_logistic = predict(logistic.model, test_X, type = "response")  # "response" returns probabilities

  # Draw the ROC curve and calculate the AUC
  pred_for_ROC_Logistic = ROCR::prediction(pred_by_logistic, test$IDC_class)
  evaluation_Logistic = performance(pred_for_ROC_Logistic, "acc")
  plot(evaluation_Logistic)
  evaluation_Logistic_for_ROC = performance(pred_for_ROC_Logistic, "tpr", "fpr")
  plot(evaluation_Logistic_for_ROC, colorize = T, xlab = "FPR", ylab = "TPR")
  abline(a = 0, b = 1)

  # Calculate the area under the curve
  auc_Logistic_temp = performance(pred_for_ROC_Logistic, "auc")
  auc_Logistic_temp = unlist(slot(auc_Logistic_temp, "y.values"))
  auc_Logistic_temp = round(auc_Logistic_temp, 4)
  print(paste0("The AUC of logistic regression is: ", auc_Logistic_temp))

  # Classify with a 0.5 probability cutoff and summarize the fold
  predict.class = ifelse(pred_by_logistic > 0.5, 1, 0)
  predict.class = as.factor(as.character(predict.class))
  sum2 = confusionMatrix(predict.class, as.factor(as.character(test$IDC_class)))
  sum2_d3 = data.frame(sum2[[3]])
  sum2_d4 = data.frame(sum2[[4]])
  if (i == 1) {
    sum2_d3_total = sum2_d3
    sum2_d4_total = sum2_d4
  } else {
    sum2_d3_total = cbind(sum2_d3_total, sum2_d3)
    sum2_d4_total = cbind(sum2_d4_total, sum2_d4)
  }

  accuracy = mean(test$IDC_class == predict.class)
  accuracyList[i] = accuracy
  print(accuracy)
}

print(accuracyList)
print(mean(accuracyList))
plot(test$IDC_class, predict.class)

sum2_d3_total
sum2_d4_total
write.csv(sum2_d3_total, "d3_total_by_LogisticRegression.csv")
write.csv(sum2_d4_total, "d4_total_by_LogisticRegression.csv")
```
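The fold boundaries above are computed by hand from row intervals. As a minimal sketch of an alternative (not the original approach), caret's `createFolds()` builds the same kind of split while stratifying on the class label; `fold_list` is an illustrative name, and caret is already loaded in the setup chunk:

```{r}
# Stratified fold assignment: each list element holds the row indices of one test fold
set.seed(300)
fold_list = caret::createFolds(shuffled$IDC_class, k = N_folder)
str(fold_list[1:2])
# Inside a loop: test = shuffled[fold_list[[i]], ]; train = shuffled[-fold_list[[i]], ]
```

Because `createFolds()` stratifies on the label, each fold keeps roughly the original class balance, which the plain interval split does not guarantee.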
# Run K-nearest neighbors (KNN) classification

```{r}
library(class)

N_folder = 10
set.seed(300); shuffled = d1000_class[sample(1000, 1000), ]
shuffled$IDC_class = as.factor(shuffled$true_decision)
dim(shuffled)

accuracyList = rep(0, N_folder)
d3_total_KNN = data.frame()
d4_total_KNN = data.frame()

# knn() requires numeric inputs, so train and test on the marker columns only
shuffled_X = shuffled[, grep("marker", names(shuffled), ignore.case = T)]
KNN.model_all.Data = knn(train = shuffled_X, test = shuffled_X,
                         cl = shuffled$IDC_class, k = 20, prob = T)

# The "prob" attribute is the vote share of the *winning* class;
# convert it to the probability of the second (positive) level before scoring
probability_KNN_all.Data = attributes(KNN.model_all.Data)$prob
probability_KNN_all.Data = ifelse(KNN.model_all.Data == levels(shuffled$IDC_class)[2],
                                  probability_KNN_all.Data,
                                  1 - probability_KNN_all.Data)

pred_KNN_for_ROC_all.Data = ROCR::prediction(probability_KNN_all.Data, shuffled$IDC_class)
ROC_KNN_all.Data = performance(pred_KNN_for_ROC_all.Data, "tpr", "fpr")
auc_KNN_all.Data = performance(pred_KNN_for_ROC_all.Data, "auc")
auc_KNN_all.Data = unlist(slot(auc_KNN_all.Data, "y.values"))
auc_KNN_all.Data = round(auc_KNN_all.Data, 4)
```

# Using xgboost: Extreme Gradient Boosting (tree) library

## Step I: find the best parameters first

```{r}
library(xgboost); library(caret); library(RCurl); library(Metrics)

d1000_class[1:3, 1:10]
N_folder = 10
set.seed(100)
shuffled = d1000_class[sample(1000, 1000), ]
shuffled$IDC_class = as.factor(shuffled$true_decision)
dim(shuffled)

accuracyList = rep(0, N_folder)
d3_total_xgboost_GBM = data.frame()
d4_total_xgboost_GBM = data.frame()

for (i in 1:2) {
  # These indices indicate the interval of the test set
  if (i < N_folder) {
    indices <- (((i-1) * round((1/N_folder) * nrow(shuffled))) + 1):(i * round((1/N_folder) * nrow(shuffled)))
  } else {
    indices <- (((i-1) * round((1/N_folder) * nrow(shuffled))) + 1):nrow(shuffled)
  }

  # Exclude the test interval from the train set; keep it as the test set
  train <- shuffled[-indices, ]
  test <- shuffled[indices, ]
  train_X = as.matrix(train[, grep("marker", names(train), ignore.case = T)])
  test_X = as.matrix(test[, grep("marker", names(test), ignore.case = T)])

  # Fit the boosted-tree model; nrounds here is a placeholder value to tune,
  # and the label is assumed to be coded 0/1
  xgb.model = xgboost(data = train_X,
                      label = as.numeric(as.character(train$IDC_class)),
                      nrounds = 100, objective = "binary:logistic", verbose = 0)
  pred_by_xgb = predict(xgb.model, test_X)

  # Classify with a 0.5 probability cutoff and summarize the fold
  ppp = ifelse(pred_by_xgb > 0.5, 1, 0)
  ppp = as.factor(as.character(ppp))
  sum1 = confusionMatrix(ppp, as.factor(as.character(test$IDC_class)))
  d3 = data.frame(sum1[[3]])
  d4 = data.frame(sum1[[4]])
  if (i == 1) {
    d3_total_xgboost_GBM = d3
    d4_total_xgboost_GBM = d4
  } else {
    d3_total_xgboost_GBM = cbind(d3_total_xgboost_GBM, d3)
    d4_total_xgboost_GBM = cbind(d4_total_xgboost_GBM, d4)
  }

  accuracy = d3[1, 1]
  accuracyList[i] = accuracy
  print(accuracy)
}

print(accuracyList)
print(mean(accuracyList))

# Align the column names so the two summary tables can be stacked
names(d3_total_xgboost_GBM) = names(d4_total_xgboost_GBM)
Total.Results = rbind(d3_total_xgboost_GBM, d4_total_xgboost_GBM)
(file_name_to_save_GBM = paste0("Results_from_GBM_by_xgboost_MS2_",
                                gsub(" |:", "_", Sys.time()), ".csv"))
write.csv(Total.Results, file_name_to_save_GBM)
```

# Using Random forest classification

```{r}
library(xgboost); library(caret); library(RCurl); library(Metrics); library(randomForest)

d1000_class[1:3, 1:10]
N_folder = 10
# repeat 1: seed = 100; repeat 2: seed = 500; repeat 3: seed = 300
set.seed(300); shuffled = d1000_class[sample(1000, 1000), ]
shuffled$IDC_class = as.factor(shuffled$true_decision)
# Drop the original label column so it is not reused as a predictor
shuffled = subset(shuffled, select = -true_decision)
dim(shuffled)

accuracyList = rep(0, N_folder)
d3_total_RF_C = data.frame()
d4_total_RF_C = data.frame()

for (i in 1:N_folder) {
  # These indices indicate the interval of the test set
  if (i < N_folder) {
    indices <- (((i-1) * round((1/N_folder) * nrow(shuffled))) + 1):(i * round((1/N_folder) * nrow(shuffled)))
  } else {
    indices <- (((i-1) * round((1/N_folder) * nrow(shuffled))) + 1):nrow(shuffled)
  }

  # Train on the remaining lines; predict the held-out fold
  train <- shuffled[-indices, ]
  test <- shuffled[indices, ]
  rf.model = randomForest(IDC_class ~ ., data = train)
  pred_by_RF = predict(rf.model, test)

  sum3 = confusionMatrix(pred_by_RF, test$IDC_class)
  if (i == 1) {
    d3_total_RF_C = data.frame(sum3[[3]])
    d4_total_RF_C = data.frame(sum3[[4]])
  } else {
    d3_total_RF_C = cbind(d3_total_RF_C, data.frame(sum3[[3]]))
    d4_total_RF_C = cbind(d4_total_RF_C, data.frame(sum3[[4]]))
  }

  accuracyList[i] = mean(pred_by_RF == test$IDC_class)
  print(accuracyList[i])
}

print(accuracyList)
print(mean(accuracyList))
```
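The other classifiers above report an ROC curve and AUC as well as accuracy. A minimal sketch of the same metrics for the random forest, assuming the `rf.model` and `test` objects left over from the final fold of the loop above, and that the second probability column corresponds to the positive class (`prob_RF`, `pred_RF`, and `auc_RF` are illustrative names, not part of the original analysis):

```{r}
# Class probabilities from the last fold's forest, scored with ROCR
prob_RF = predict(rf.model, test, type = "prob")[, 2]  # P(second class level)
pred_RF = ROCR::prediction(prob_RF, test$IDC_class)
auc_RF = round(unlist(slot(performance(pred_RF, "auc"), "y.values")), 4)
plot(performance(pred_RF, "tpr", "fpr"), colorize = T,
     main = paste0("Random forest: ", auc_RF))
abline(a = 0, b = 1)
```

To score all ten folds rather than the last one, the same three lines could be run inside the loop and the per-fold AUCs collected alongside `accuracyList`.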