---
title: "Calculate GP metrics"
author: "Zhanyou Xu"
date: "JUne 9, 2020"
output: word_document
---
# predict.lm() in a loop. warning: prediction from a rank-deficient fit may be misleading
# reasons and solution link: https://stackoverflow.com/questions/26558631/predict-lm-in-a-loop-warning-prediction-from-a-rank-deficient-fit-may-be-mis

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(psych); library(e1071); library(caret)
```
# Read in the data
```{r}

filePath_name_IDC = "~/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/re_submitted supplementaldata and R codes/S03.csv"
d1000 = read.csv(filePath_name_IDC)
d1000[1:3, 1:5]
d1000_class= d1000[, grep("true_decision|Marker", names(d1000))]
dim(d1000_class)
d1000_class[1:3, 1:15]

```
# AUC using rrBLUP

```{r, warning=0}
library(rrBLUP)
d1000 = read.csv(filePath_name_IDC)
d1000[1:3, 1:5]
#d1000_Manual[1:3, 1:10]

set.seed(300); shuffled=d1000[ sample(1000,1000),]
shuffled$true_decision = as.numeric(shuffled$true_decision)
shuffled_X= shuffled[, grep("marker", names(shuffled), ignore.case = T)]
#dim(shuffled)
N_folder=10
accuracyList_rrBLUP_Raw=rep(0, N_folder)
IDC_score_all_1000 =as.numeric(as.character( shuffled$true_decision))

rrBLUP.model_all = mixed.solve(IDC_score_all_1000, Z=as.matrix(shuffled_X), K=NULL, SE = FALSE, return.Hinv=FALSE)

marker_effects_all_rrBLUP = rrBLUP.model_all$u # effects for 1200 markers 
(beta_all_rrBLUP=rrBLUP.model_all$beta)
marker_effects_all_rrBLUP_matrix= as.matrix(marker_effects_all_rrBLUP)
pred_by_rrBLUP_all_rrBLUPs= as.matrix(shuffled_X) %*% marker_effects_all_rrBLUP_matrix 
pred_by_rrBLUP_all_rrBLUPs=data.frame(pred_by_rrBLUP_all_rrBLUPs)
pred_by_rrBLUP_all_rrBLUPs$lineIDs = rownames(pred_by_rrBLUP_all_rrBLUPs)
pred_by_rrBLUP_all_rrBLUPs$IDC.Scores= shuffled$true_decision


pred_by_rrBLUP_all_rrBLUPs$resids = pred_by_rrBLUP_all_rrBLUPs$IDC.Scores - pred_by_rrBLUP_all_rrBLUPs$pred_by_rrBLUP_all_rrBLUPs
names(pred_by_rrBLUP_all_rrBLUPs)
model_GLM_for_rrBLUP = glm(IDC.Scores~ resids , data =pred_by_rrBLUP_all_rrBLUPs, family = "binomial" )
pred_all_rrblup = model_GLM_for_rrBLUP$fitted.values

library(ROCR)
pred_forROC_rrBLUP_all = prediction(pred_all_rrblup, shuffled$true_decision)
evale_rrBLUP_all = performance(pred_forROC_rrBLUP_all, "acc")
plot(evale_rrBLUP_all)
# calculate the AUC
AUC_rrBLUP_all = performance(pred_forROC_rrBLUP_all, "auc")
AUC_rrBLUP_all = unlist(slot(AUC_rrBLUP_all, "y.values"))
AUC_rrBLUP_all= round(AUC_rrBLUP_all, 4)
AUC_rrBLUP_all

ROC_rrBLUP_all = performance(pred_forROC_rrBLUP_all, "tpr", "fpr",
                             xlab = "FPR", ylab = "TPR", main = paste0("rrBLUP: ", AUC_rrBLUP_all))

par(cex.axis=1.8, cex.lab = 1.5, cex.axis=1.5)
plot(ROC_rrBLUP_all, colorize = T, lwd= 5, main =paste0("rrBLUP: ", AUC_rrBLUP_all), cex.main=2,
     xlab="FPR", ylab = "TPR")
abline(a=0, b=1, lwd=2)





```



# Naive Bayes with 10-folder cross-validation

```{r}

N_folder=10
set.seed(100)

set.seed(300);shuffled=d1000_class[ sample(1000,1000),]
shuffled$IDC_class = as.factor(shuffled$true_decision)
shuffled$true_decision= as.factor(as.character(shuffled$true_decision))
#shuffled=IDC_5007_Data_Partial
dim(shuffled)
head(shuffled)[1:3, 1:10]
naiveBayes.model_allLines=naiveBayes(IDC_class~., data = shuffled, laplace = 1)
  pred_NB_all_probs=predict(naiveBayes.model_allLines, shuffled_X, type = "raw" )
pred_NB_all_probs = as.data.frame(pred_NB_all_probs); names(pred_NB_all_probs) =c("R", "S")
pred_for_ROC_NB_all = ROCR::prediction(pred_NB_all_probs$S, shuffled$IDC_class)
acc_NB_all = performance(pred_for_ROC_NB_all, "acc")
plot(acc_NB_all)
ROC_NB_all = performance(pred_for_ROC_NB_all, "tpr", "fpr") 
plot(ROC_NB_all); abline(a=0, b=1)

AUC_NB_all_data = performance(pred_for_ROC_NB_all, "auc")
AUC_NB_all_data = unlist(slot(AUC_NB_all_data, "y.values"))
AUC_NB_all_data = round(AUC_NB_all_data, 4)
AUC_NB_all_data
ROC_NB_all = performance(pred_for_ROC_NB_all, "tpr", "fpr") 
plot(ROC_NB_all, colorize = T, main=paste0( "Naive Bayes: ", AUC_NB_all_data)); abline(a=0, b=1)

par(mfrow=c(1,2) ,cex.axis=1.8, cex.lab = 1.5, cex.axis=1.5)
plot(ROC_rrBLUP_all, colorize = T, lwd= 5, main =paste0("rrBLUP: ", AUC_rrBLUP_all), cex.main=1.5,
     xlab="FPR", ylab = "TPR");  abline(a=0, b=1, lwd=2)
plot(ROC_NB_all, colorize = T, lwd=5,main=paste0( "Naive Bayes: ", AUC_NB_all_data), cex.main =1.5, xlab = "", ylab="TPR"); abline(a=0, b=1, lwd=2)



accuracyList=rep(0, N_folder)
d3_total=data.frame()
d4_total=data.frame()
for (i in 1:N_folder) {
  # These indices indicate the interval of the test set
  if (i<N_folder) {
  indices <- (((i-1) * round((1/N_folder)*nrow(shuffled))) + 1):((i*round((1/N_folder) * nrow(shuffled))))
  } else{
    indices <- (((i-1) * round((1/N_folder)*nrow(shuffled))) + 1):nrow(shuffled)
  }
  # Exclude them from the train set
  train <- shuffled[-indices,]
 #table(train$IDC_cat)
  # naiveBayes.model1 = naiveBayes(subset(train, select = -IDC_class),
  #                                train$IDC_class, laplace = 1)
  naiveBayes.model1=naiveBayes(IDC_class~., data = train, laplace = 1)
  #plot(svm.model1, train, SY2444AQ~SY2463AQ)
  #print(dim(train))
  # Include them in the test set
  test <- shuffled[indices,]; dim(test)
  test_X=subset(test, select = -IDC_class); dim(test_X)
  test_NB= subset(test, select = -IDC_class); dim(test_NB)
  pp=predict(naiveBayes.model1, test_X, type = "class" )
  
  # plot the ROC and AUC
  pp_probs=predict(naiveBayes.model1, subset(test, select = -IDC_class) , type= "raw")
  pp_probs_max =pp_probs[,2]
  pred_for_roc_NB =ROCR::prediction(pp_probs_max, test$IDC_class) 
    evalue1_NB  = performance(pred_for_roc_NB, "acc")
  plot(evalue1_NB)
  
  roc_NB_temp =  performance(pred_for_roc_NB, "tpr", "fpr", xlab ="FPR", ylab= "TPR")
  plot(roc_NB_temp, colorize =T, main = "ROC from NB", lwd=4, cex=3);   abline(a=0, b=1)
  
# Calculate the Area Under Curve
auc_NB_temp = performance(pred_for_roc_NB, "auc")
auc_NB_temp= unlist(slot(auc_NB_temp, "y.values"))
auc_NB_temp = round(auc_NB_temp, 4)
print(paste0("The AUC is for Naive Bayes: ", i, " and ", auc_NB_temp))





  
  table(pp, test$IDC_class)
  sum1=confusionMatrix(pp, test$IDC_class)
    d3=data.frame(sum1[[3]])
    d4=data.frame( sum1[[4]] )

  if (i==1){
   d3_tatal =d3
   d4_total=d4
  }else{
    d3_tatal = cbind(d3_tatal, d3)
    d4_total = cbind(d4_total, d4)
  }
   accuracy=mean(test$IDC_class ==pp)
  accuracyList[i]=accuracy
  print(paste0("The number of Folder i is: ", i, " and the accuracy is: ", accuracy))

}
print(accuracyList)
print(mean(accuracyList))
d3_tatal
d4_total
# write.csv(d3_tatal, "d3_total_by_NBC.csv")
# write.csv(d4_total, "d4_total_by_NBC.csv")
```


# Classfification with Logistic regression
```{r}
library(ISLR); library(caret)

N_folder=10
names(d1000_class)[1] ="IDC_class"
d1000_class$IDC_class = as.factor(as.character(d1000_class$IDC_class))
set.seed(300);shuffled=d1000_class[ sample(1000,1000),]
#shuffled= shuffled[, 1:300]

#shuffled$IDC_class = as.factor(shuffled$IDC_class)

#Calculate the overall AUC from Logistic regression

logistic.model_all.Data = glm(IDC_class~., data=shuffled[1:900,], family = "binomial", control = list(maxit = 50))
# pred_probability_all.data_Logistic = predict(logistic.model_all.Data, shuffled_X, type = "response")
sum(is.na(logistic.model_all.Data$coefficients))
# there is many NAs for the markers whcih are not estimatable, need to remove
list_of_NAs_logistic=logistic.model_all.Data$coefficients[is.na(logistic.model_all.Data$coefficients)]
list_of_NAs_logistic = row.names(data.frame(list_of_NAs_logistic))
# remove the markers that are not estimatable and saved the data as below
shuffled_updated = shuffled[, !colnames(shuffled)%in%list_of_NAs_logistic]
shuffled_updated_X = shuffled_updated[,grep("Marker", names(shuffled_updated), ignore.case = T)]
dim(shuffled_updated_X)
# re fit the glm model and repredict
table(shuffled_updated$IDC_class)
#shuffled_updated$IDC_class = as.factor(as.character(shuffled_updated$IDC_class))
logistic.model_all.Data = glm(IDC_class~., data=shuffled_updated[1:900,], family = "binomial", control = list(maxit = 50))
length(logistic.model_all.Data$coefficients)
sum(is.na(logistic.model_all.Data$coefficients))
pred_probability_all.data_Logistic = predict(logistic.model_all.Data, shuffled_updated_X[901:1000,], type = "response")
sum(is.na(logistic.model_all.Data$coefficients))

pred_for_ROC_Logistic_all.Data = ROCR::prediction(pred_probability_all.data_Logistic, shuffled[901:1000,]$IDC_class)
ROC_Logistic_all.data = performance(pred_for_ROC_Logistic_all.Data, "tpr", "fpr")
plot(performance(pred_for_ROC_Logistic_all.Data, "acc"))


auc_Logistic_900 = performance(pred_for_ROC_Logistic_all.Data, "auc")
auc_Logistic_900 = unlist(slot(auc_Logistic_900, "y.values"))
auc_Logistic_900 = round(auc_Logistic_900, 4)
auc_Logistic_900

accuracyList=rep(0, N_folder)
sum2_d3_total=data.frame()
sum2_d4_total=data.frame()
for (i in 1:N_folder) {
  # These indices indicate the interval of the test set
  if (i<N_folder) {
  indices <- (((i-1) * round((1/N_folder)*nrow(shuffled))) + 1):((i*round((1/N_folder) * nrow(shuffled))))
  } else{
    indices <- (((i-1) * round((1/N_folder)*nrow(shuffled))) + 1):nrow(shuffled)
  }
  # Exclude them from the train set
  train <- shuffled[-indices,]
 #table(train$IDC_cat)
  dim(train)
  logistic.model = glm(IDC_class~., data=train, family = "binomial", control = list(maxit = 100))

  length(logistic.model$coefficients) > logistic.model$rank
  
  list_of_nas_Logistic_temp=logistic.model$coefficients[is.na(logistic.model$coefficients)]
  list_of_nas_Logistic_temp= as.data.frame(list_of_nas_Logistic_temp)
  list_of_nas_Logistic_temp =rownames(list_of_nas_Logistic_temp)
  train= train[,!colnames(train)%in%list_of_nas_Logistic_temp]
  dim(train)
  logistic.model = glm(IDC_class~., data=train, family = "binomial", control = list(maxit = 100))

  #plot(svm.model1, train, SY2444AQ~SY2463AQ)
  #print(dim(train))
  # Include them in the test set
  test <- shuffled[indices,]; dim(test)
  test= test[,!colnames(test)%in%list_of_nas_Logistic_temp]; dim(test)
  test_X=subset(test, select = -IDC_class)
  pred_by_logistic=predict(logistic.model, test_X, type = "response") # "response" return probability
  
  # draw ROC and calculate the AUC
  pred_for_ROC_Logistic =ROCR::prediction(pred_by_logistic, test$IDC_class)
  evaluation_Logistic = performance(pred_for_ROC_Logistic, "acc")
  plot(evaluation_Logistic)
  evaluation_Logistic_for_ROC = performance(pred_for_ROC_Logistic, "tpr", "fpr",
                                            xlab="FPR", ylab="TPR")
  
plot(evaluation_Logistic_for_ROC, colorize=T)
  abline(a=0, b=1)

    # Calculate the Area Under Curve
auc_Logistic_temp = performance(pred_for_ROC_Logistic, "auc")
auc_Logistic_temp = unlist(slot(auc_Logistic_temp, "y.values"))
auc_Logistic_temp= round(auc_Logistic_temp,4)
auc_Logistic_temp
print(paste0("The AUC is of Logistic regression: ", auc_Logistic_temp))
 
  
  
  predict.class=ifelse(pred_by_logistic>0.5, 1,0)
  predict.class= as.factor(as.character(predict.class))
  sum2= confusionMatrix(as.factor(as.character( test$IDC_class)), predict.class)
    sum2_d3=data.frame(sum2[[3]])
    sum2_d4=data.frame( sum2[[4]] )

  if (i==1){
   sum2_d3_total =sum2_d3
   sum2_d4_total=sum2_d4
  }else{
    sum2_d3_total = cbind(sum2_d3_total, sum2_d3)
    sum2_d4_total = cbind(sum2_d4_total, sum2_d4)
  }
   accuracy=mean(test$IDC_class ==pp)
  accuracyList[i]=accuracy
  print(accuracy)

}
print(accuracyList)
print(mean(accuracyList))
plot(test$IDC_class, pp)
sum2_d3_total
sum2_d4_total
write.csv(sum2_d3_total, "d3_total_by_LogisticRegression.csv")
write.csv(sum2_d4_total, "d4_total_by_LogisticRegression.csv")


```


# Run K-nearest neighbor KNN clustering

```{r}
library(class)
N_folder=10
set.seed(300); shuffled=d1000_class[ sample(1000,1000),]
shuffled$IDC_class = as.factor(shuffled$IDC_class)
dim(shuffled)
accuracyList=rep(0, N_folder)
d3_total_KNN=data.frame()
d4_total_KNN=data.frame()

KNN.model_all.Data = knn(train=shuffled, test=shuffled, k=20, cl=shuffled$IDC_class, prob = T)

  probability_KNN_all.Data= attributes(KNN.model_all.Data)$prob

pred_KNN_for_ROC_all.Data = ROCR::prediction(probability_KNN_all.Data, shuffled$IDC_class)
ROC_KNN_all.Data = performance(pred_KNN_for_ROC_all.Data, "tpr", "fpr")

auc_KNN_all.Data = performance(pred_KNN_for_ROC_all.Data, "auc")  
auc_KNN_all.Data = unlist(slot(auc_KNN_all.Data, "y.values"))  
auc_KNN_all.Data= round(auc_KNN_all.Data, 4)

```


# Using xgboost: Extreme Gradient Boosting (tree) library
## Step I: to find the best parameters first

```{r}
library(xgboost); library(caret); library(RCurl); library(Metrics)
d1000_class[1:3, 1:10]
N_folder=10
set.seed(100)

shuffled=d1000_class[ sample(1000,1000),]
shuffled$IDC_class = as.factor(shuffled$IDC_class)
dim(shuffled)
accuracyList=rep(0, N_folder)
d3_total_xgboost_GBM=data.frame()
d4_total_xgboost_GBM=data.frame()
for (i in 1:2) {
  # These indices indicate the interval of the test set
  if (i<N_folder) {
  indices <- (((i-1) * round((1/N_folder)*nrow(shuffled))) + 1):((i*round((1/N_folder) * nrow(shuffled))))
  } else{
    indices <- (((i-1) * round((1/N_folder)*nrow(shuffled))) + 1):nrow(shuffled)
  }
  # Exclude them from the train set
  train <- shuffled[-indices,]
 #table(train$IDC_cat)
  
  test <- shuffled[indices,]; dim(test)

train1=apply(train, 2, as.numeric)
test1= apply(test, 2, as.numeric)
train1[1:3, 1:6]
train1[,1] =as.factor(train1[,1])
samllError=100
for (depth in seq(1,10,1)){
  for ( round1 in seq(1,10,1)){
    totalError = c()
  GBM_xgboost.model = xgboost(data = as.matrix(train1[,2:ncol(train1)]),
                              label = train1[,1],
                              max.depth=depth, nrounds = round1,
                              objective ="reg:squarederror", verbose = 0)
  pred_gbm_cxgboost = predict(GBM_xgboost.model, as.matrix(test1[, 2:ncol(test1)]), outputmargin = T)
   error1 = rmse(as.numeric(test1[,1]), as.numeric(pred_gbm_cxgboost))
   totalError = c(totalError, error1)
   
   if(mean(totalError)<samllError){
     samllError = mean(totalError)
     print(paste(depth, round1, samllError))
   }
  }
}

}

# the best depth is 10, and the round is 1, we use these paramters in step 2


```
# Using xgboost: Extreme Gradient Boosting (tree) library
## Step 2: 10 folder cross-validation by GBM

```{r}
library(xgboost); library(caret); library(RCurl); library(Metrics)
d1000_class[1:3, 1:10]
N_folder=10
set.seed(500) # repeat 1 seed =100, repeat 2, seed=500

set.seed(300); shuffled=d1000_class[ sample(1000,1000),]
shuffled$IDC_class = as.numeric(as.character(shuffled$IDC_class))
dim(shuffled)

  GBM_xgboost.model.all.Data = xgboost(data = as.matrix(shuffled[,2:ncol(shuffled)]),
            label = shuffled[,1],max.depth=10, nrounds = 1,
            objective ="binary:logistic", verbose = 0)# use "binary:logistic" to return probability instead of labls
  
  pred_gbm_cxgboost = predict(GBM_xgboost.model, as.matrix(shuffled[, 2:ncol(shuffled)]), outputmargin = F)
  
pred_for_ROC_GBM_all.Data = ROCR::prediction(pred_gbm_cxgboost, shuffled$IDC_class)
ROC_GBM_all.Data = performance(pred_for_ROC_GBM_all.Data, "tpr", "fpr")
plot(ROC_GBM_all.Data, colorize=T)
auc_GBM_all.Data = performance(pred_for_ROC_GBM_all.Data, "auc")
auc_GBM_all.Data = unlist(slot(auc_GBM_all.Data, "y.values"))
auc_GBM_all.Data = round(auc_GBM_all.Data, 4)



accuracyList=rep(0, N_folder)
d3_total_xgboost_GBM=data.frame()
d4_total_xgboost_GBM=data.frame()
for (i in 1:N_folder) {
  # These indices indicate the interval of the test set
  if (i<N_folder) {
  indices <- (((i-1) * round((1/N_folder)*nrow(shuffled))) + 1):((i*round((1/N_folder) * nrow(shuffled))))
  } else{
    indices <- (((i-1) * round((1/N_folder)*nrow(shuffled))) + 1):nrow(shuffled)
  }
  # Exclude them from the train set
  train <- shuffled[-indices,]
 #table(train$IDC_cat)
  
  test <- shuffled[indices,]; dim(test)

train1=apply(train, 2, as.numeric)
test1= apply(test, 2, as.numeric)
train1[1:3, 1:6]
#train1[,1] =as.factor(train1[,1])
  GBM_xgboost.model = xgboost(data = as.matrix(train1[,2:ncol(train1)]),
                              label = train1[,1],
                              max.depth=10, nrounds = 1,
                              objective ="binary:logistic", verbose = 0)# use "binary:logistic" to return probability instead of labls
  
  pred_gbm_cxgboost = predict(GBM_xgboost.model, as.matrix(test1[, 2:ncol(test1)]), outputmargin = F)
  
  
  
  # below is draw the ROC and calculate the AUC
  library(ROCR)
  pred_for_ROC_GBM = ROCR::prediction(pred_gbm_cxgboost, test1[,1])
  evaluation_GBM_for_ROC = performance(pred_for_ROC_GBM, "acc")
  plot(evaluation_GBM_for_ROC)
  ROC_GBM_1 = performance(pred_for_ROC_GBM, "tpr", "fpr", xlab="FPR", ylab="TPT")
  plot(ROC_GBM_1, colorize = T)
  abline(a=0, b=1)
  
  auc_GBM_1= performance(pred_for_ROC_GBM, "auc")
  auc_GBM_1= round(unlist(slot(auc_GBM_1, "y.values")) ,4)
print(paste0("The AUC is of GBM: ", auc_GBM_1))

  
   error1 = rmse(as.numeric(test1[,1]), as.numeric(pred_gbm_cxgboost))
 min(pred_gbm_cxgboost)
max(pred_gbm_cxgboost)
ppp=ifelse(pred_gbm_cxgboost>0.5, 1, 0)
ppp = as.factor(as.character(ppp))
  sum1=confusionMatrix(ppp, as.factor(as.character( test$IDC_class)))
    d3=data.frame(sum1[[3]])
    d4=data.frame( sum1[[4]] )

  if (i==1){
   d3_total_xgboost_GBM =d3
   d4_total_xgboost_GBM=d4
  }else{
    d3_total_xgboost_GBM = cbind(d3_total_xgboost_GBM, d3)
    d4_total_xgboost_GBM = cbind(d4_total_xgboost_GBM, d4)
  }
   accuracy=d3[1,1]
  accuracyList[i]=accuracy
  print(accuracy)

  }

print(accuracyList)
print(mean(accuracyList))
names(d3_total_xgboost_GBM) =names(d4_total_xgboost_GBM)
Total.Results = rbind(d3_total_xgboost_GBM, d4_total_xgboost_GBM)
(file_name_to_save_GBM=paste0("Results_from_GBM_by_xgboost_MS2_", gsub(" |:", "_", Sys.time()), ".csv"))
write.csv(Total.Results, file_name_to_save_GBM)
```


# Using Random forest classification


```{r}
library(xgboost); library(caret); library(RCurl); library(Metrics);library(randomForest)
d1000_class[1:3, 1:10]
N_folder=10
set.seed(500) # repeat 1 seed =100, repeat 2, seed=500, repeat 3, seed =300

set.seed(300) ; shuffled=d1000_class[ sample(1000,1000),]
shuffled$IDC_class = as.factor(shuffled$IDC_class)
dim(shuffled)
accuracyList=rep(0, N_folder)
d3_total_RF_C=data.frame()
d4_total_RF_C=data.frame()
for (i in 1:N_folder) {
  # These indices indicate the interval of the test set
  if (i<N_folder) {
  indices <- (((i-1) * round((1/N_folder)*nrow(shuffled))) + 1):((i*round((1/N_folder) * nrow(shuffled))))
  } else{
    indices <- (((i-1) * round((1/N_folder)*nrow(shuffled))) + 1):nrow(shuffled)
  }
  # Exclude them from the train set
  train <- shuffled[-indices,]
 #table(train$IDC_cat)
  
  test <- shuffled[indices,]; dim(test)

train1=apply(train, 2, as.numeric)
test1= apply(test, 2, as.numeric)
train1[1:3, 1:6]

#train1[,1] =as.factor(train1[,1])
  RF_C_model = randomForest(train$IDC_class~., train, ntree=100)
                              
  pred_by_RF_C = predict(RF_C_model, test)
  

  sum1=confusionMatrix(pred_by_RF_C, test$IDC_class)
    d3=data.frame(sum1[[3]])
    d4=data.frame( sum1[[4]] )

  if (i==1){
   d3_total_RF_C =d3
   d4_total_RF_C=d4
  }else{
    d3_total_RF_C = cbind(d3_total_RF_C, d3)
    d4_total_RF_C = cbind(d4_total_RF_C, d4)
  }
   accuracy=d3[1,1]
  accuracyList[i]=accuracy
  print(accuracy)

  }

print(accuracyList)
print(mean(accuracyList))
names(d3_total_RF_C) =names(d4_total_RF_C)
Total.Results_RF_C = rbind(d3_total_RF_C, d4_total_RF_C)
(file_name_to_save_RF=paste0("Results_from_RF_by_MS2_", gsub(" |:", "_", Sys.time()), ".csv"))

 write.csv(Total.Results_RF_C, file_name_to_save_RF)
 getwd()
```

# generate ROC and AUC from RF with all the data

```{r}
library(randomForest); library(ROCR)
shuffled$IDC_class = as.factor(as.character(shuffled$IDC_class))
RF_model_with_all.data = randomForest(shuffled$IDC_class~., shuffled, ntree = 500)

  pred_by_RF_all.data = predict(RF_model_with_all.data, shuffled, type = "prob")[,2]
# confusionMatrix(shuffled$IDC_class, pred_by_RF_all.data)

 pred_for_ROC_RF_all.Data =ROCR::prediction(pred_by_RF_all.data, shuffled$IDC_class)
# ROC_RF_all.Data = performance(pred_for_ROC_RF_all.Data, "tpr", "fpr", xlab ="FPR", ylab="TPR", cex=3)
 AUC_curve_RF.all.Data= performance(pred_for_ROC_RF_all.Data, "tpr", "fpr")
 # calculate the AUC of ROC curve
auc_RF_all.Data=performance(pred_for_ROC_RF_all.Data, "auc")
auc_RF_all.Data= unlist(slot(auc_RF_all.Data, "y.values"))
 auc_RF_all.Data = round(auc_RF_all.Data, 4); auc_RF_all.Data
print(paste0("The AUC of RF is: ", auc_RF_all.Data))



  
  # # Calculate the AUC
  # AUC_ANN_all.Data = performance(pred_for_ROC_RF_all.Data, "auc")
  # AUC_ANN_all.Data = unlist(slot(AUC_ANN_all.Data, "y.values"))
  # AUC_ANN_all.Data = round(AUC_ANN_all.Data,4)
  # print(paste0("The AUC of ANN is: ", AUC_ANN_all.Data))
  

```



# Now run ANN with backpropagation from all the data
```{r}
library(neuralnet)

# now change the the data to manually edited data without MISSING data points

IDC1000_Partial = d1000_class[, grep("class|marker", names(d1000_class), ignore.case = T)] 
list.markers = names(IDC1000_Partial)[2:ncol(IDC1000_Partial)]
formular_01 = as.formula(paste("IDC_class~", paste(list.markers[!list.markers%in%"IDC_class"], collapse = "+")))

N_folder=10
accuracyList_ANN=rep(0, N_folder)
d3_ANN_df_total=data.frame()
d4_ANN_df_total=data.frame()
# set.seed(1003) # repeat 1 seed=2002, rpeat2 seed =2003
set.seed(300) ; shuffled= IDC1000_Partial[sample(nrow(IDC1000_Partial), nrow(IDC1000_Partial)),]

# Calculate the AUC from ANN from all the data
list.markers = names(shuffled)[2:ncol(shuffled)]
formular_ANN_all.Data = as.formula(paste("IDC_class~", paste(list.markers, collapse = "+")))

ann.IDC.model_all.Data = neuralnet(formular_ANN_all.Data, data=shuffled, hidden = 2, learningrate = 0.01,algorithm = "backprop", err.fct = "ce", linear.output = FALSE, stepmax=1e6)
pp_values_all.Data= predict(ann.IDC.model_all.Data, shuffled[, 2:ncol(shuffled)])
Pred_class_ANN_temp=as.data.frame( apply(pp_values_all.Data, 1,which.max)-1)
names(Pred_class_ANN_temp)= "Pred_class_ANN"
pp_values_all.Data$Pred_class_ANN =Pred_class_ANN_temp$Pred_class_ANN
pred_from_ANN_for_ROC_all.Data = ROCR::prediction(pp_values_all.Data$Pred_class_ANN, shuffled$IDC_class) 
# evaluation_ANN_all.Data = performance(pred_from_ANN_for_ROC_all.Data, "acc")
# plot(evaluation_ANN_all.Data)
ROC_ANN_all.Data = performance(pred_from_ANN_for_ROC_all.Data, "tpr", "fpr",
                           xlab = "FPR",
                           ylab="TPR", cex=3)
  
  # Calculate the AUC
  AUC_ANN_all.Data = performance(pred_from_ANN_for_ROC_all.Data, "auc")
  AUC_ANN_all.Data = unlist(slot(AUC_ANN_all.Data, "y.values"))
  AUC_ANN_all.Data = round(AUC_ANN_all.Data,4)
  print(paste0("The AUC of ANN is: ", AUC_ANN_all.Data))
  

  
```

# Support Vector Machine (SVM)
```{r}
N_folder=10
set.seed(300)

set.seed(300);d1000_class= d1000_class[sample(1000, 1000),] # re-shuffle the data to make sure they are randomly ordered

shuffled=d1000_class[, grep("IDC_class|marker", names(d1000_class), ignore.case = T)]
shuffled[1:3, 1:5]; dim(shuffled)
#shuffled=IDC_5007_Data_Partial
dim(shuffled)
accuracyList=rep(0, N_folder)
d3_SVM_df = data.frame()
d4_SVM_df = data.frame()
library(e1071); library(ROCR)

svm.model.all.data = svm(IDC_class~., data=shuffled, kernal ="linear", cost =0.1)
SVM_probs_all.Data  = predict(svm.model.all.data, shuffled, type="prob")
SVM_probs_all.Data_list= as.numeric(as.character(SVM_probs_all.Data))


pred_all_SVM_for_ROC = ROCR::prediction(SVM_probs_all.Data_list, shuffled$IDC_class)
ROC_SVM_all.Data = performance(pred_all_SVM_for_ROC, "tpr", "fpr")
AUC_SVM_all.Data =  performance(pred_all_SVM_for_ROC, "auc")
AUC_SVM_all.Data= unlist(slot(AUC_SVM_all.Data, "y.values"))
AUC_SVM_all.Data = round(AUC_SVM_all.Data, 4)


```








# Penalized Logistic Regression (PLR)
```{r}
N_folder=10
set.seed(300)
IDC1000=d1000_class
set.seed(300);IDC1000= IDC1000[sample(1000, 1000),] # re-shuffle the data to make sure they are randomly ordered

shuffled=IDC1000[, grep("IDC_class|marker", names(IDC1000), ignore.case = T)]
shuffled[1:3, 1:5]; dim(shuffled)
#shuffled=IDC_5007_Data_Partial
dim(shuffled)
accuracyList_PLR=rep(0, N_folder)
d3_SVM_df_PLR = data.frame()
d4_SVM_df_PLR = data.frame()
library(e1071); library(ROCR); library(glmnet)



  X1_PLR = model.matrix(IDC_class ~., shuffled[1:ncol(shuffled)])[,2:ncol(shuffled)]
  y1_PLR= shuffled$IDC_class

  # alpha: the elasticnet mixing parameter. Allowed values include:
  # “1”: for lasso regression
  # “0”: for ridge regression
  # a value between 0 and 1 (say 0.3) for elastic net regression.
  
 set.seed(10172020); cv.PLR= cv.glmnet(X1_PLR, y1_PLR, alpha=0, family="binomial")
 set.seed(10172020); PLR.model.all.data = PLR.model. = glmnet(X1_PLR, y1_PLR, alpha=0, family = "binomial", lambda= cv.PLR$lambda.min)
 
   shuffled_matrix= model.matrix(IDC_class ~., shuffled[1:ncol(shuffled)])[,2:ncol(shuffled)]

PLR_probs_all.Data  = predict(PLR.model.all.data, shuffled_matrix, type="response")

pred_all_PLR_for_ROC = ROCR::prediction(PLR_probs_all.Data, shuffled$IDC_class)
ROC_PLR_all.Data = performance(pred_all_PLR_for_ROC, "tpr", "fpr")
AUC_PLR_all.Data =  performance(pred_all_PLR_for_ROC, "auc")
AUC_PLR_all.Data= unlist(slot(AUC_PLR_all.Data, "y.values"))
(AUC_PLR_all.Data = round(AUC_PLR_all.Data, 4))


```

# Bayesian Generalized Linear Regression for Oridnal data with BGLR R package to draw AUC

```{r}
if(!require("BGLR")){install.packages("BGLR", dependencies = T)};library(BGLR)
if(!require("matrixcalc")){install.packages("matrixcalc", dependencies = T)};library(matrixcalc)


# read in the IDC data
idcGenoData= read.csv("~/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/re_submitted supplementaldata and R codes/S03.csv")
idcGenoData[1:3, 1:10]
rownames(idcGenoData)= idcGenoData$LINCD
idcPhenoData= read.csv("~/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/re_submitted supplementaldata and R codes/S02.csv")
head(idcPhenoData)
# there is only 1 line with "0" score value, changed 1 to 1: [L2040	1	L0824	0]
idcPhenoData$obs_IDC_score[idcPhenoData$obs_IDC_score==0]=1
table(idcPhenoData$obs_IDC_score)


# rm(list=ls())  # remove everything from memory in the working environment 
# load("C:/Users/ARSMNSPP5XUX201/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/BGLR_IDC/MarkersFinal.RData")

# extract only markers from IDC
(position_First_marker= grep("Marker", names(idcGenoData))[1])
(position_last_marker= tail( grep("Marker", names(idcGenoData)), n=1))

idcGenoData_SNP_only = idcGenoData[, position_First_marker:position_last_marker]
# assign the genotype data to matrix "X"
X= idcGenoData_SNP_only
M=as.matrix(X)
# load("C:/Users/ARSMNSPP5XUX201/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/BGLR_IDC/PhenoGLS.RDATa")



# to change the phenotype data frame to idcPhenoData to DataOrd

idcPhenoData= idcPhenoData[,grep("location|REPNO|Line|obs_IDC_score", names(idcPhenoData))]
names(idcPhenoData) =c("Loc",	"Rep",	"Stock",	"rating")
DataOrd = idcPhenoData


head(DataOrd)
length(unique(DataOrd$Stock))# 1000 genotypes
X[1:3, 1:10]

DataOrd[order(DataOrd$Stock), ]
y=DataOrd[,4]
XD=DataOrd[,1:4]

#Calculating the marker-derived genomic relationship matrix (GRM)
M<-scale(M,center=TRUE,scale=TRUE)
G<-tcrossprod(M)/ncol(M)

# incidence matrix and covariance for main eff. of environments.
ZE<-model.matrix(~factor(XD$Loc)-1)  
KE=tcrossprod(ZE);
# incidence matrix and covariance for main eff. of lines.
XD$Stock<-factor(x=XD$Stock, levels=rownames(M), ordered=TRUE)

ZL<-model.matrix(~XD$Stock-1)
KL=tcrossprod(ZL);

# Genetic covariance structure of genetic effects in the full data
KG= ZL%*%G%*%t(ZL);

# Epistasis additive × additive covariance structure in the full data
GA=hadamard.prod(G,G)
KGG= ZL%*%GA%*%t(ZL);

# G×E covariance structure for the full data set
KGE=hadamard.prod(KG,KE)
diag(KGE)=diag(KGE)
KGE=KGE/mean(diag(KGE))

# GG×E Epistasis additive × additive structure for the full data set
KGGE=hadamard.prod(KGG,KE)
diag(KGGE)=diag(KGGE)
KGGE=KGGE/mean(diag(KGGE))
          

ETA9<-list( ENV=list(X=ZE, model='FIXED'),  LINE=list(K=KL, model='RKHS',df0=5),
            G=list(K=KG, model='RKHS',df0=5),  GG=list(K=KGG, model='RKHS',df0=5),	
            GE=list(K=KGE, model='RKHS',df0=5),   GGE=list(K=KGGE, model='RKHS',df0=5))
XD$class=ifelse(XD$rating<5, 1,0)
fm9_for_AUC_binary<-BGLR(y=XD$class,response_type='ordinal', ETA=ETA9, saveAt='M9_', nIter=5000, burnIn=200)


summary(fm9_for_AUC_binary)


library(ROCR)

pred_probs_BGLR= ROCR::prediction(fm9_for_AUC_binary$probs[,2], XD$class)
ROC_BGLR_IDC_binary= performance(pred_probs_BGLR, "tpr", "fpr")
plot(ROC_BGLR_IDC_binary)
ROC_PLR_all.Data = performance(pred_all_PLR_for_ROC, "tpr", "fpr")
AUC_BGLR_all.Data =  performance(pred_probs_BGLR, "auc")
AUC_BGLR_all.Data= unlist(slot(AUC_BGLR_all.Data, "y.values"))
(AUC_BGLR_all.Data = round(AUC_BGLR_all.Data, 4))



```







# show the 9 ROC and AUC
```{r, fig.height=13, fig.width=15}
par(mfrow=c(3,4) ,cex.axis=2.5, cex.lab = 2.5, cex.main = 2.5, mar = c(5, 6.5, 3.5, 3.5))
plot(ROC_KNN_all.Data, colorize=T, lwd=5, main =paste0("KNN: ",auc_KNN_all.Data), cex.main=2.5, xlab="FPR", ylab="TPR" ); abline(a=0, b=1, lwd =2)

plot(ROC_rrBLUP_all, colorize = T, lwd= 5, main =paste0("rrBLUP: ", AUC_rrBLUP_all), cex.axis=2.5,
     xlab="FPR", ylab = "TPR");  abline(a=0, b=1, lwd=2)
plot(ROC_Logistic_all.data, colorize = T, lwd=5, main= paste0("LR: ", auc_Logistic_900), cex.main=2.5, xlab="FPR", ylab="TPR"); abline(a=0, b=1)
plot(ROC_NB_all, colorize = T, lwd=5,main=paste0( "NB: ", AUC_NB_all_data), cex.main =2.5, xlab = "", ylab="TPR"); abline(a=0, b=1, lwd=2)

# BGLR

plot(ROC_PLR_all.Data, colorize =T, lwd=5, main= paste0("BGLR: ", AUC_BGLR_all.Data),cex.main= 2.5, xlab="FPR", ylab="TPR" ); abline(a=0, b=1, lwd=2)

plot(ROC_SVM_all.Data, colorize = T, lwd =5, main = paste0("SVM: ", AUC_SVM_all.Data), cex.main= 2.5, xlab="FPR", ylab="TPR"); abline(a=0, b=1, lwd=2)

# PLR
plot(ROC_PLR_all.Data, colorize =T, lwd=5, main= paste0("Panelized LR: ", AUC_PLR_all.Data),cex.main= 2.5, xlab="FPR", ylab="TPR" ); abline(a=0, b=1, lwd=2)

plot(ROC_GBM_all.Data, colorize=T, lwd=5, main = paste0("GBM: ", auc_GBM_all.Data), cex.main =2.5, xlab="FPR", ylab="TPR"); abline(a=0, b=1, lwd = 2)
plot(ROC_ANN_all.Data, colorize =T, lwd=5, main= paste0("ANN: ", AUC_ANN_all.Data),cex.main= 2.5, xlab="FPR", ylab="TPR" ); abline(a=0, b=1, lwd=2)

# RF
plot(AUC_curve_RF.all.Data, colorize =T, lwd=5, main= paste0("RF: ", auc_RF_all.Data),cex.main= 2.5, xlab="FPR", ylab="TPR" ); abline(a=0, b=1, lwd=2)




```
```{r}
# save the whole workspace
save.image(file = "draw_ROC_with_10model_BGLR.Rdata")

```


Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.