# conduct 10 fold cross validation with BGLR if(!require("BGLR")){install.packages("BGLR", dependencies = T)};library(BGLR) if(!require("matrixcalc")){install.packages("matrixcalc", dependencies = T)};library(matrixcalc) # Three ways of producing predictions in validation data using BGLR # link: https://github.com/gdlc/BGLR-R/blob/master/inst/md/Validation.md # read in the IDC data idcGenoData= read.csv("~/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/re_submitted supplementaldata and R codes/S03.csv") idcGenoData[1:3, 1:10] rownames(idcGenoData)= idcGenoData$Line idcPhenoData= read.csv("~/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/re_submitted supplementaldata and R codes/S02.csv") head(idcPhenoData) # select the columns and change columns names idcPhenoData= idcPhenoData[,grep("location|REPNO|Line|obs_IDC", names(idcPhenoData))] names(idcPhenoData) =c("Loc", "Rep", "Stock", "rating") # there is only 1 line with "0" score value, changed 1 to 1: [L2040 1 L0824 0] idcPhenoData$rating[idcPhenoData$rating==0]=1 table(idcPhenoData$rating) dim(idcPhenoData) # rm(list=ls()) # remove everything from memory in the working environment # reload the R data without rerun the R code since it takes more than 20 houra to run the codes # load("C:/Users/ARSMNSPP5XUX201/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/BGLR_IDC/MarkersFinal.RData") # extract only markers from IDC (position_1st_snp= grep("Marker", names(idcGenoData))[1]) (position_last_snp= ncol(idcGenoData)) idcGenoData_SNP_only = idcGenoData[, position_1st_snp:position_last_snp] # assign the genotype data to matrix "X" X= idcGenoData_SNP_only M=as.matrix(X) # load("C:/Users/ARSMNSPP5XUX201/Box/Manuscripts/Soy/2_paper_ISU/For submission/for PLOS One submission/MS 2/BGLR_IDC/PhenoGLS.RDATa") # to change the phenotype data frame to idcPhenoData to DataOrd DataOrd = idcPhenoData head(DataOrd) length(unique(DataOrd$Stock))# 1000 genotypes X[1:3, 1:10] DataOrd[order(DataOrd$Stock), ] y=DataOrd[,4] XD=DataOrd[,1:4] #Calculating the marker-derived genomic relationship matrix (GRM) M<-scale(M,center=TRUE,scale=TRUE) G<-tcrossprod(M)/ncol(M) # incidence matrix and covariance for main eff. of environments. ZE<-model.matrix(~factor(XD$Loc)-1) KE=tcrossprod(ZE); # incidence matrix and covariance for main eff. of lines. XD$Stock<-factor(x=XD$Stock, levels=rownames(M), ordered=TRUE) ZL<-model.matrix(~XD$Stock-1) KL=tcrossprod(ZL); # Genetic covariance structure of genetic effects in the full data KG= ZL%*%G%*%t(ZL); # Epistasis additive × additive covariance structure in the full data GA=hadamard.prod(G,G) KGG= ZL%*%GA%*%t(ZL); # G×E covariance structure for the full data set KGE=hadamard.prod(KG,KE) diag(KGE)=diag(KGE) KGE=KGE/mean(diag(KGE)) # GG×E Epistasis additive × additive structure for the full data set KGGE=hadamard.prod(KGG,KE) diag(KGGE)=diag(KGGE) KGGE=KGGE/mean(diag(KGGE)) # using modeling 9, the best model basdon Osval A. Montesinos-López # Cross-validation using a loop (n= nrow(idcPhenoData)) # need to set.seeds to make sure we can repeat the 10 folds set.seed(01062020); folds=sample(1:10,size=n,replace=T) table(folds) nTST=400 tst=sample(1:n,size=nTST) yHatCV=rep(NA,n) y=idcPhenoData$rating # create a data frame to the 10-fold CV results CV_IDC_10folds_metrics= data.frame() # create the vector to store the Brier Scores from the 10-folds CV Pearson_cor_accuracy= rep(0,10) for(k in 1:max(folds)){ tst=which(folds==k) yNA=y yNA[tst]=NA ETA9<-list( ENV=list(X=ZE, model='FIXED'), LINE=list(K=KL, model='RKHS',df0=5), G=list(K=KG, model='RKHS',df0=5), GG=list(K=KGG, model='RKHS',df0=5), GE=list(K=KGE, model='RKHS',df0=5), GGE=list(K=KGGE, model='RKHS',df0=5)) set.seed(01062020+ k); fm9<-BGLR(y=yNA,response_type='ordinal', ETA=ETA9, saveAt='M9_', nIter=1000, burnIn=500) # summary(fm9) # predict(fm9) # tradition prediction accuracy based on Pearson correlation Pearson_cor_accuracy[k]=cor(idcPhenoData$rating, fm9$yHat) # 0.7233046 (with 0), 0.6496149 (without 0) probs_IDC=data.frame(fm9$probs[tst,]) # probs_IDC$preds=apply(probs_IDC, 1, which.max) # based on Cheng Hao suggestion, calculate the score for each line sum(prob * expected class) probs_IDC$preds=apply(probs_IDC, 1, function(x){round( sum(1*x[1]+ 2*x[2]+ 3*x[3]+ 4*x[4]+ 5*x[5]+ 6*x[6]+ 7*x[7]+ 8*x[8]+ 9*x[9]), 4)}) dim(probs_IDC) names(probs_IDC)=c(1:9, "preds") probs_IDC[1:8, 1:10] probs_IDC$Truth= idcPhenoData[tst,]$rating probs_IDC[1:8,] library(caret) # idcPhenoData$rating= as.factor(as.character(idcPhenoData$rating)) # probs_IDC$pred= as.factor(as.character(probs_IDC$pred)) # confusionMatrix(idcPhenoData$rating, probs_IDC$pred) bayers_ordinal_scores=rep(0, nrow(probs_IDC)) # 1. calculate the Brier Score defiend by Jose # probs_IDC$Pred_IDC= bayers_ordinal_scores head(probs_IDC) # 2. calculate the decision metrics after converted by < 4 as 1, > 4 as 0 probs_IDC$pred_class= ifelse(probs_IDC$preds<4.0001, 1, 0) head(probs_IDC) probs_IDC$oberserved_class= ifelse(probs_IDC$Truth<5, 1, 0) probs_IDC$oberserved_class= as.factor(as.character(probs_IDC$oberserved_class)) probs_IDC$pred_class= as.factor(as.character(probs_IDC$pred_class)) sum_all=confusionMatrix(probs_IDC$oberserved_class, probs_IDC$pred_class) (sensitivity_BGLR_IDC=round( sum_all[[4]][1], 4)) (specificity_BGLR_IDC=round(sum_all[[4]][2], 4)) (precision_BGLR_IDC=round(sum_all[[4]][5], 4)) (decicison_accuracy=round(sum_all[[4]][11],4)) print(paste0("The sensitivity is:",sensitivity_BGLR_IDC)) print(paste0("The specificity is:",specificity_BGLR_IDC)) print(paste0("The prediction Precision is:",precision_BGLR_IDC)) print(paste0("The decicison accuracy is:",decicison_accuracy)) # save.image(file = "BGLR_IDC_MS2_model_9.Rdata") print(paste0("you have finished: ",k, " out of 10 folds")) df_metrics_tem=data.frame( sum_all[[4]]) if(k==1){ CV_IDC_10folds_metrics= df_metrics_tem }else{ CV_IDC_10folds_metrics= cbind(CV_IDC_10folds_metrics, df_metrics_tem) } } dim(CV_IDC_10folds_metrics) CV_IDC_10folds_metrics= rbind(CV_IDC_10folds_metrics, cor_accuracy=Pearson_cor_accuracy) names(CV_IDC_10folds_metrics)=c("fold1","fold2","fold3","fold4","fold5","fold6","fold7","fold8","fold9","fold10") # save the results to Cheng hao's way with file name "Cross_Vallidation_IDC_10folds_metrics_iteration6000_chengHaoway.csv" CV_IDC_10folds_metrics_5= CV_IDC_10folds_metrics[rownames(CV_IDC_10folds_metrics)%in%c("Sensitivity", "Specificity", "Precision", "Balanced Accuracy", "cor_accuracy"),] CV_IDC_10folds_metrics_5$means= apply(CV_IDC_10folds_metrics_5,1, mean) CV_IDC_10folds_metrics_5$sds= apply(CV_IDC_10folds_metrics_5,1, sd) (CV_IDC_10folds_metrics_5=round(CV_IDC_10folds_metrics_5, 4)) (file_names_results_BGLR_MS2= paste0("BGLR_Cross_Vallidation_IDC_10folds_metrics_iteration1000_", gsub(" |:","_",as.character( Sys.time())), ".csv")) write.csv(CV_IDC_10folds_metrics_5, file_names_results_BGLR_MS2) # save.image(file = "BGLR_IDC_ordinal_MS2_model_to_9_CV.Rdata") getwd() # It's a good idea to save your workspace image when your work sessions are long. # This can be done at any time using the function save.image() # When you close R/RStudio, it asks if you want to save your workspace. # If you say yes, the next time you start R that workspace will be loaded. That saved file will be named .RData as well. # save.image(file = "my_work_space.RData") # To restore your workspace, type this:load("my_work_space.RData") # Save and restore one single R object: saveRDS(object, file), my_data <- readRDS(file) # Save and restore multiple R objects: save(data1, data2, file = "my_data.RData"), load("my_data.RData") # Save and restore your entire workspace: save.image(file = "my_work_space.RData"), load("my_work_space.RData")