# conduct 10 fold cross validation with BGLR 


if(!require("BGLR")){install.packages("BGLR", dependencies = T)};library(BGLR)
if(!require("matrixcalc")){install.packages("matrixcalc", dependencies = T)};library(matrixcalc)
# Three ways of producing predictions in validation data using BGLR
# link: https://github.com/gdlc/BGLR-R/blob/master/inst/md/Validation.md

# read in the IDC data
idcGenoData= read.csv("~/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/re_submitted supplementaldata and R codes/S03.csv")
idcGenoData[1:3, 1:10]
rownames(idcGenoData)= idcGenoData$Line
idcPhenoData= read.csv("~/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/re_submitted supplementaldata and R codes/S02.csv")
head(idcPhenoData)

# select the columns and change columns names 
idcPhenoData= idcPhenoData[,grep("location|REPNO|Line|obs_IDC", names(idcPhenoData))]
names(idcPhenoData) =c("Loc",	"Rep",	"Stock",	"rating")

# there is only 1 line with "0" score value, changed 1 to 1: [L2040	1	L0824	0]
idcPhenoData$rating[idcPhenoData$rating==0]=1
table(idcPhenoData$rating)
dim(idcPhenoData)

# rm(list=ls())  # remove everything from memory in the working environment 
# reload the R data without rerun the R code since it takes more than 20 houra to run the codes
# load("C:/Users/ARSMNSPP5XUX201/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/BGLR_IDC/MarkersFinal.RData")

# extract only markers from IDC
(position_1st_snp= grep("Marker", names(idcGenoData))[1])
(position_last_snp= ncol(idcGenoData))
idcGenoData_SNP_only = idcGenoData[, position_1st_snp:position_last_snp]
# assign the genotype data to matrix "X"
X= idcGenoData_SNP_only
M=as.matrix(X)
# load("C:/Users/ARSMNSPP5XUX201/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/BGLR_IDC/PhenoGLS.RDATa")



# to change the phenotype data frame to idcPhenoData to DataOrd
DataOrd = idcPhenoData
head(DataOrd)
length(unique(DataOrd$Stock))# 1000 genotypes
X[1:3, 1:10]

DataOrd[order(DataOrd$Stock), ]
y=DataOrd[,4]
XD=DataOrd[,1:4]

#Calculating the marker-derived genomic relationship matrix (GRM)
M<-scale(M,center=TRUE,scale=TRUE)
G<-tcrossprod(M)/ncol(M)

# incidence matrix and covariance for main eff. of environments.
ZE<-model.matrix(~factor(XD$Loc)-1)  
KE=tcrossprod(ZE);
# incidence matrix and covariance for main eff. of lines.
XD$Stock<-factor(x=XD$Stock, levels=rownames(M), ordered=TRUE)
ZL<-model.matrix(~XD$Stock-1)
KL=tcrossprod(ZL);

# Genetic covariance structure of genetic effects in the full data
KG= ZL%*%G%*%t(ZL);

# Epistasis additive × additive covariance structure in the full data
GA=hadamard.prod(G,G)
KGG= ZL%*%GA%*%t(ZL);

# G×E covariance structure for the full data set
KGE=hadamard.prod(KG,KE)
diag(KGE)=diag(KGE)
KGE=KGE/mean(diag(KGE))

# GG×E Epistasis additive × additive structure for the full data set
KGGE=hadamard.prod(KGG,KE)
diag(KGGE)=diag(KGGE)
KGGE=KGGE/mean(diag(KGGE))

         


# using modeling 9, the best model basdon Osval A. Montesinos-López
#  Cross-validation using a loop
(n= nrow(idcPhenoData))
# need to set.seeds to make sure we can repeat the 10 folds
set.seed(01062020); folds=sample(1:10,size=n,replace=T)
table(folds)
nTST=400
tst=sample(1:n,size=nTST)
yHatCV=rep(NA,n)
y=idcPhenoData$rating

# create a data frame to the 10-fold CV results
CV_IDC_10folds_metrics= data.frame()
# create the vector to store the Brier Scores from the 10-folds CV

Pearson_cor_accuracy= rep(0,10)
for(k in 1:max(folds)){
  tst=which(folds==k)
  yNA=y
  yNA[tst]=NA
ETA9<-list( ENV=list(X=ZE, model='FIXED'),  LINE=list(K=KL, model='RKHS',df0=5),
            G=list(K=KG, model='RKHS',df0=5),  GG=list(K=KGG, model='RKHS',df0=5),	
            GE=list(K=KGE, model='RKHS',df0=5),   GGE=list(K=KGGE, model='RKHS',df0=5))
set.seed(01062020+ k); fm9<-BGLR(y=yNA,response_type='ordinal', ETA=ETA9, saveAt='M9_', nIter=1000, burnIn=500)

# summary(fm9)


# predict(fm9)

# tradition prediction accuracy based on Pearson correlation 
Pearson_cor_accuracy[k]=cor(idcPhenoData$rating, fm9$yHat) # 0.7233046 (with 0), 0.6496149 (without 0)



probs_IDC=data.frame(fm9$probs[tst,])


# probs_IDC$preds=apply(probs_IDC, 1, which.max)

#   based on Cheng Hao suggestion, calculate the score for each line sum(prob * expected class)

probs_IDC$preds=apply(probs_IDC, 1, function(x){round( sum(1*x[1]+ 2*x[2]+ 3*x[3]+ 4*x[4]+ 5*x[5]+ 6*x[6]+ 7*x[7]+ 8*x[8]+ 9*x[9]), 4)})


dim(probs_IDC)
names(probs_IDC)=c(1:9, "preds")
probs_IDC[1:8, 1:10]
probs_IDC$Truth= idcPhenoData[tst,]$rating
probs_IDC[1:8,]

library(caret)
# idcPhenoData$rating= as.factor(as.character(idcPhenoData$rating))
# probs_IDC$pred= as.factor(as.character(probs_IDC$pred))

# confusionMatrix(idcPhenoData$rating, probs_IDC$pred)
bayers_ordinal_scores=rep(0, nrow(probs_IDC))

# 1. calculate the Brier Score defiend by Jose
# probs_IDC$Pred_IDC= bayers_ordinal_scores
head(probs_IDC)




# 2. calculate the decision metrics after converted by < 4 as 1, > 4 as 0
probs_IDC$pred_class= ifelse(probs_IDC$preds<4.0001, 1, 0)
head(probs_IDC)
probs_IDC$oberserved_class= ifelse(probs_IDC$Truth<5, 1, 0)
probs_IDC$oberserved_class= as.factor(as.character(probs_IDC$oberserved_class))
probs_IDC$pred_class= as.factor(as.character(probs_IDC$pred_class))
sum_all=confusionMatrix(probs_IDC$oberserved_class, probs_IDC$pred_class)
(sensitivity_BGLR_IDC=round( sum_all[[4]][1], 4))
(specificity_BGLR_IDC=round(sum_all[[4]][2], 4))
(precision_BGLR_IDC=round(sum_all[[4]][5], 4))
(decicison_accuracy=round(sum_all[[4]][11],4))

print(paste0("The sensitivity is:",sensitivity_BGLR_IDC)) 
print(paste0("The specificity is:",specificity_BGLR_IDC)) 
print(paste0("The prediction Precision is:",precision_BGLR_IDC)) 
print(paste0("The decicison accuracy is:",decicison_accuracy)) 
# save.image(file = "BGLR_IDC_MS2_model_9.Rdata")
print(paste0("you have finished: ",k, " out of 10 folds")) 
df_metrics_tem=data.frame( sum_all[[4]])

if(k==1){
  CV_IDC_10folds_metrics= df_metrics_tem
  
}else{
  CV_IDC_10folds_metrics= cbind(CV_IDC_10folds_metrics, df_metrics_tem)
}


}

dim(CV_IDC_10folds_metrics)
CV_IDC_10folds_metrics= rbind(CV_IDC_10folds_metrics, cor_accuracy=Pearson_cor_accuracy)

names(CV_IDC_10folds_metrics)=c("fold1","fold2","fold3","fold4","fold5","fold6","fold7","fold8","fold9","fold10")
# save the results to Cheng hao's way with file name "Cross_Vallidation_IDC_10folds_metrics_iteration6000_chengHaoway.csv"

CV_IDC_10folds_metrics_5= CV_IDC_10folds_metrics[rownames(CV_IDC_10folds_metrics)%in%c("Sensitivity", "Specificity", "Precision", "Balanced Accuracy", "cor_accuracy"),]
CV_IDC_10folds_metrics_5$means= apply(CV_IDC_10folds_metrics_5,1, mean)
CV_IDC_10folds_metrics_5$sds= apply(CV_IDC_10folds_metrics_5,1, sd)
(CV_IDC_10folds_metrics_5=round(CV_IDC_10folds_metrics_5, 4))
(file_names_results_BGLR_MS2= paste0("BGLR_Cross_Vallidation_IDC_10folds_metrics_iteration1000_", gsub(" |:","_",as.character( Sys.time())), ".csv"))

write.csv(CV_IDC_10folds_metrics_5, file_names_results_BGLR_MS2)
# save.image(file = "BGLR_IDC_ordinal_MS2_model_to_9_CV.Rdata")
getwd()



# It's a good idea to save your workspace image when your work sessions are long.
# This can be done at any time using the function save.image()
# When you close R/RStudio, it asks if you want to save your workspace. 
# If you say yes, the next time you start R that workspace will be loaded. That saved file will be named .RData as well.
# save.image(file = "my_work_space.RData")
# To restore your workspace, type this:load("my_work_space.RData")
# Save and restore one single R object: saveRDS(object, file), my_data <- readRDS(file)
# Save and restore multiple R objects: save(data1, data2, file = "my_data.RData"), load("my_data.RData")
# Save and restore your entire workspace: save.image(file = "my_work_space.RData"), load("my_work_space.RData")