#Online Resource 2

#Improving the baking quality of bread wheat by genomic selection in early generations

#Sebastian Michel1, Christian Kummer2, Martin Gallee2, Jakob Hellinger1, Christian Ametz3, Batuhan Akgöl4, Doru Epure5, Franziska Löschenberger3, Hermann Buerstmayr1


#1Department for Agrobiotechnology (IFA-Tulln), Institute for Biotechnology in Plant Production, University of Natural Resources and Life Sciences, Vienna (BOKU), Konrad-Lorenz-Str. 20, 3430 Tulln, Austria
#2Versuchsanstalt für Getreideverarbeitung, Österreichische Mühlenvereinigung e.V., Prinz-Eugen-Straße 14/1/4,1040 Vienna, Austria
#3Saatzucht Donau GesmbH. & CoKG, Saatzuchtstrasse 11, 2301 Probstdorf, Austria
#4ProGen Seed A.S., Büyükdalyan Mah. 2. Küme evler Sok.,No: 49 31001 Antakya/ Hatay, Turkey
#5Probstdorfer Saatzucht Romania SRL, Str. Siriului Nr.20, sect.1 Bucuresti, Romania

#Corresponding author: S. Michel; e-mail: sebastian.michel@boku.ac.at
#####################################################################################################################
#Analysis of an example dataset using a forward prediction approach for the farinograph dough stability

#Date: 03/09/2017
#Author: Sebastian Michel
#For questions/suggestions/comments contact: sebastian.michel@boku.ac.at


#The example dataset contains:
#-->191 lines for training models (called basis population in the publication)
#-->70 lines for validating models (one of the independent validation populations mentioned in the publication)

#-->Phenotypic data for both the protein content (protein) and a dough rheological trait, in this case the
#   farinograph stability (fa.stab); please notice that the phenotypic values were standardized for
#   both traits and the training + validation sets respectively for the purpose of illustration

#-->Markerdata of 7687 GBS markers (M1,M2...) as well as a SDS-PAGE marker for the Glu-D1 locus
#-->Missing Markerdata was imputed with EM algorithm implemented in the A.mat function of rrBLUP (Endelman 2011)
#   and markers were coded as (1,0,-1) for homozygous major,heterozygous, and homozygous minor
#-->The Glu-D1 marker was coded as follows: "1" = '5+10' allele,"-1" = '2+12' allele, and "0" = heterozygous



#The analysis will follow the order in the manuscript:
#1) Single-trait prediction using a GBLUP model
#2) Single-trait prediction using a WBLUP model with Glu-D1 as fixed effect

#3) Multi-trait prediction using a Multi-variate model
#4) Multi-trait prediction using the model-based index
#-->We will assume prior information of the protein content in the multi-trait prediction;
#   results with no prior protein content information are provided in parentheses

#5) A combination of the WBLUP model with multi-trait prediction using the model-based index
#-->All models were implemented with the sommer package version 2.9 (Covarrubias-Pazaran 2016)

library(sommer)

#######################
#-->Read-in the dataset
#-->The folder holding the example dataset is kept in a variable instead of calling setwd(), so the
#   working directory of the R session is left untouched; adapt DATA.DIR to the local copy of the data
DATA.DIR <- "/Quality in wheat/R code/Example dataset"
DATA <- read.table(file.path(DATA.DIR,"GS_Baking_Quali.csv"),sep=";",dec=".",header=TRUE)
#-->Number of lines per level of the 'set' column
tapply(DATA$GEN,DATA$set,length)
#-->Structure of the first ten columns
str(DATA[,1:10])


###############################################
#1) Single-trait prediction using a GBLUP model
#-->Split the lines into the validation and the training set
DATA.VALID <- droplevels(DATA[DATA$set %in% "validation",])
DATA.TRAIN <- droplevels(DATA[DATA$set %in% "training",])

#-->Create the genomic relationship matrix from the marker columns (all columns after the fifth)
#-->min.MAF is an argument of A.mat(); placed inside the as.matrix() call it is silently ignored
#   and no minor allele frequency filter is applied
DATA.MODEL <- rbind(DATA.TRAIN,DATA.VALID)
G.MATRIX <- A.mat(as.matrix(DATA.MODEL[,-c(1:5)]),min.MAF=0.05)
rownames(G.MATRIX) <- colnames(G.MATRIX) <- DATA.MODEL$GEN


#-->Remove the phenotypic records from the validation set
DATA.MODEL$fa.stab[DATA.MODEL$GEN %in% DATA.VALID$GEN] <- NA


#-->Fit the GBLUP model and extract the random genetic effects (GEBVs) of all lines
MODEL <- mmer2(fixed=fa.stab~1,random=~g(GEN),G=list(GEN=G.MATRIX),data=DATA.MODEL,method="NR")
GEBVs <- data.frame(GEN=rownames(MODEL$u.hat[[1]]),gebv.stab=as.numeric(MODEL$u.hat[[1]]))

#-->Merge the observed phenotypes of the validation lines with their GEBVs
VALIDATION <- droplevels(merge(DATA.VALID[,c("GEN","fa.stab")],GEBVs,by="GEN"))

#-->Prediction accuracy: correlation between observed and predicted values in the validation set
#-->r=0.485 was obtained while min.MAF was not yet handed to A.mat(); the value may change slightly
cor.test(VALIDATION$fa.stab,VALIDATION$gebv.stab)



###############################################
#2) Single-trait prediction using a WBLUP model with Glu-D1 as fixed effect
#-->Split the lines into the validation and the training set
DATA.VALID <- droplevels(DATA[DATA$set %in% "validation",])
DATA.TRAIN <- droplevels(DATA[DATA$set %in% "training",])

#-->Create the genomic relationship matrix from the marker columns (all columns after the fifth)
#-->min.MAF is an argument of A.mat(); placed inside the as.matrix() call it is silently ignored
#   and no minor allele frequency filter is applied
DATA.MODEL <- rbind(DATA.TRAIN,DATA.VALID)
G.MATRIX <- A.mat(as.matrix(DATA.MODEL[,-c(1:5)]),min.MAF=0.05)
rownames(G.MATRIX) <- colnames(G.MATRIX) <- DATA.MODEL$GEN


#-->Remove the phenotypic records from the validation set
DATA.MODEL$fa.stab[DATA.MODEL$GEN %in% DATA.VALID$GEN] <- NA


#-->Fit the WBLUP model with Glu-D1 as fixed effect
#   (the fixed Glu-D1 effect has to be added to the random genetic effects)
MODEL <- mmer2(fixed=fa.stab~GLUD1,random=~g(GEN),G=list(GEN=G.MATRIX),data=DATA.MODEL,method="NR")

#-->The second fixed-effect estimate (after the intercept) is the Glu-D1 effect; it is recycled for all lines
GEBVs <- data.frame(GEN=rownames(MODEL$u.hat[[1]]),gebv.stab=as.numeric(MODEL$u.hat[[1]]),b.glu=MODEL$beta.hat[2])

#-->Attach the Glu-D1 marker score of each line and build the WBLUP prediction
GEBVs <- droplevels(merge(GEBVs,DATA.MODEL[,c("GEN","GLUD1")],by="GEN"))
GEBVs$wblup.stab <- GEBVs$gebv.stab + GEBVs$b.glu*GEBVs$GLUD1

#-->Merge the observed phenotypes of the validation lines with their predictions
VALIDATION <- droplevels(merge(DATA.VALID[,c("GEN","fa.stab")],GEBVs,by="GEN"))

#-->r=0.533 was obtained while min.MAF was not yet handed to A.mat(); the value may change slightly
cor.test(VALIDATION$fa.stab,VALIDATION$wblup.stab)



###############################################
#3) Multi-trait prediction using a multi-variate model
#   assuming prior information of the protein content
#-->Split the lines into the validation and the training set
DATA.VALID <- droplevels(DATA[DATA$set %in% "validation",])
DATA.TRAIN <- droplevels(DATA[DATA$set %in% "training",])

#-->Create the genomic relationship matrix from the marker columns (all columns after the fifth)
#-->min.MAF is an argument of A.mat(); placed inside the as.matrix() call it is silently ignored
#   and no minor allele frequency filter is applied
DATA.MODEL <- rbind(DATA.TRAIN,DATA.VALID)
G.MATRIX <- A.mat(as.matrix(DATA.MODEL[,-c(1:5)]),min.MAF=0.05)
rownames(G.MATRIX) <- colnames(G.MATRIX) <- DATA.MODEL$GEN
dim(G.MATRIX)

#-->Remove the phenotypic records from the validation set
#-->Uncomment the second line to also mask the protein content (scenario without prior protein information)
DATA.MODEL$fa.stab[DATA.MODEL$GEN %in% DATA.VALID$GEN] <- NA
#DATA.MODEL$protein[DATA.MODEL$GEN %in% DATA.VALID$GEN] <- NA


#-->Fit the MV-GBLUP model with both traits as responses
MODEL <- mmer2(fixed=cbind(fa.stab,protein)~1,random=~g(GEN),G=list(GEN=G.MATRIX),MVM=TRUE,data=DATA.MODEL,method="NR")

#-->The first column of the random effects belongs to the first response (fa.stab)
GEBVs <- data.frame(GEN=names(MODEL$u.hat[[1]][,1]),gebv.stab=MODEL$u.hat[[1]][,1])

#-->Merge the observed phenotypes of the validation lines with their GEBVs
VALIDATION <- droplevels(merge(DATA.VALID[,c("GEN","fa.stab")],GEBVs,by="GEN"))

#-->r=0.517 (without prior protein information r=0.392) was obtained while min.MAF was not yet
#   handed to A.mat(); the values may change slightly
cor.test(VALIDATION$fa.stab,VALIDATION$gebv.stab)



###############################################
#4) Multi-trait prediction using the model-based index
#   assuming prior information of the protein content
#-->Split the lines into the validation and the training set
DATA.VALID <- droplevels(DATA[DATA$set %in% "validation",])
DATA.TRAIN <- droplevels(DATA[DATA$set %in% "training",])

#-->Create the genomic relationship matrix from the marker columns (all columns after the fifth)
#-->min.MAF is an argument of A.mat(); placed inside the as.matrix() call it is silently ignored
#   and no minor allele frequency filter is applied
DATA.MODEL <- rbind(DATA.TRAIN,DATA.VALID)
G.MATRIX <- A.mat(as.matrix(DATA.MODEL[,-c(1:5)]),min.MAF=0.05)
rownames(G.MATRIX) <- colnames(G.MATRIX) <- DATA.MODEL$GEN
dim(G.MATRIX)

#-->Remove the phenotypic records from the validation set
#-->Uncomment the second line to also mask the protein content (scenario without prior protein information)
DATA.MODEL$fa.stab[DATA.MODEL$GEN %in% DATA.VALID$GEN] <- NA
#DATA.MODEL$protein[DATA.MODEL$GEN %in% DATA.VALID$GEN] <- NA


#-->Compute the GEBVs for the protein content and merge them to the first five (non-marker) columns
MODEL <- mmer2(fixed=protein~1,random=~g(GEN),G=list(GEN=G.MATRIX),data=DATA.MODEL,method="NR")
GEBVs.PROT <- data.frame(GEN=rownames(MODEL$u.hat[[1]]),gebv.protein=as.numeric(MODEL$u.hat[[1]]))
DATA.MODEL <- droplevels(merge(DATA.MODEL[,1:5],GEBVs.PROT,by="GEN"))


#-->Fit the GBLUP model including the GEBVs of the protein content as fixed effect
MODEL <- mmer2(fixed=fa.stab~gebv.protein,random=~g(GEN),G=list(GEN=G.MATRIX),data=DATA.MODEL,method="NR")

#-->The second fixed-effect estimate (after the intercept) is the regression on the protein GEBVs
GEBVs <- data.frame(GEN=rownames(MODEL$u.hat[[1]]),gebv.stab=as.numeric(MODEL$u.hat[[1]]),b.prot=MODEL$beta.hat[2])
GEBVs <- droplevels(merge(GEBVs,GEBVs.PROT,by="GEN"))

#-->Index = random genetic effect for the stability + fixed regression on the protein GEBV
GEBVs$index <- GEBVs$gebv.stab + GEBVs$b.prot*GEBVs$gebv.protein
VALIDATION <- droplevels(merge(DATA.VALID[,c("GEN","fa.stab")],GEBVs,by="GEN"))

#-->r=0.505 (without prior protein information r=0.492) was obtained while min.MAF was not yet
#   handed to A.mat(); the values may change slightly
cor.test(VALIDATION$fa.stab,VALIDATION$index)
#-->Slightly less accurate than the multi-variate mixed model in this case, but computationally less demanding
#-->A combination with the WBLUP model is additionally more straightforward with this method
#-->The method was furthermore more stable when no prior protein information was available


###############################################
#5) A combination of the WBLUP model with multi-trait prediction using the model-based index
#-->Split the lines into the validation and the training set
DATA.VALID <- droplevels(DATA[DATA$set %in% "validation",])
DATA.TRAIN <- droplevels(DATA[DATA$set %in% "training",])

#-->Create the genomic relationship matrix from the marker columns (all columns after the fifth)
#-->min.MAF is an argument of A.mat(); placed inside the as.matrix() call it is silently ignored
#   and no minor allele frequency filter is applied
DATA.MODEL <- rbind(DATA.TRAIN,DATA.VALID)
G.MATRIX <- A.mat(as.matrix(DATA.MODEL[,-c(1:5)]),min.MAF=0.05)
rownames(G.MATRIX) <- colnames(G.MATRIX) <- DATA.MODEL$GEN
dim(G.MATRIX)

#-->Remove the phenotypic records from the validation set
#-->Uncomment the second line to also mask the protein content (scenario without prior protein information)
DATA.MODEL$fa.stab[DATA.MODEL$GEN %in% DATA.VALID$GEN] <- NA
#DATA.MODEL$protein[DATA.MODEL$GEN %in% DATA.VALID$GEN] <- NA


#-->Compute the GEBVs for the protein content and merge them to the first five (non-marker) columns
MODEL <- mmer2(fixed=protein~1,random=~g(GEN),G=list(GEN=G.MATRIX),data=DATA.MODEL,method="NR")
GEBVs.PROT <- data.frame(GEN=rownames(MODEL$u.hat[[1]]),gebv.protein=as.numeric(MODEL$u.hat[[1]]))
DATA.MODEL <- droplevels(merge(DATA.MODEL[,1:5],GEBVs.PROT,by="GEN"))


#-->Fit the model including both the GEBVs of the protein content and the Glu-D1 marker as fixed effect
MODEL <- mmer2(fixed=fa.stab~gebv.protein + GLUD1,random=~g(GEN),G=list(GEN=G.MATRIX),data=DATA.MODEL,method="NR")

#-->Fixed-effect estimates in formula order: [1] intercept, [2] protein GEBV, [3] Glu-D1
GEBVs <- data.frame(GEN=rownames(MODEL$u.hat[[1]]),gebv.stab=as.numeric(MODEL$u.hat[[1]]),b.prot=MODEL$beta.hat[2],b.glu=MODEL$beta.hat[3])
GEBVs <- droplevels(merge(GEBVs,GEBVs.PROT,by="GEN"))
GEBVs <- droplevels(merge(GEBVs,DATA.MODEL[,c("GEN","GLUD1")],by="GEN"))

#-->Index = random genetic effect + fixed regression on the protein GEBV + fixed Glu-D1 effect
GEBVs$index <- GEBVs$gebv.stab + GEBVs$b.prot*GEBVs$gebv.protein + GEBVs$b.glu*GEBVs$GLUD1

VALIDATION <- droplevels(merge(DATA.VALID[,c("GEN","fa.stab")],GEBVs,by="GEN"))

#-->r=0.547 (without prior protein information r=0.536) was obtained while min.MAF was not yet
#   handed to A.mat(); the values may change slightly
cor.test(VALIDATION$fa.stab,VALIDATION$index)

#-->Relative gain in accuracy of the full model (r = 0.547) over the baseline accuracy (r = 0.485), in percent
local({
  r.baseline <- 0.485
  r.full <- 0.547
  100*(r.full-r.baseline)/r.baseline
})
#-->The full model thus gives an advantage of ~12.8% in accuracy in this example