# Online Resource 2
# Improving the baking quality of bread wheat by genomic selection in early generations
# Sebastian Michel1, Christian Kummer2, Martin Gallee2, Jakob Hellinger1, Christian Ametz3,
# Batuhan Akgöl4, Doru Epure5, Franziska Löschenberger3, Hermann Buerstmayr1
# 1Department for Agrobiotechnology (IFA-Tulln), Institute for Biotechnology in Plant Production,
#  University of Natural Resources and Life Sciences, Vienna (BOKU), Konrad-Lorenz-Str. 20, 3430 Tulln, Austria
# 2Versuchsanstalt für Getreideverarbeitung, Österreichische Mühlenvereinigung e.V.,
#  Prinz-Eugen-Straße 14/1/4,1040 Vienna, Austria
# 3Saatzucht Donau GesmbH. & CoKG, Saatzuchtstrasse 11, 2301 Probstdorf, Austria
# 4ProGen Seed A.S., Büyükdalyan Mah. 2. Küme evler Sok.,No: 49 31001 Antakya/ Hatay, Turkey
# 5Probstdorfer Saatzucht Romania SRL, Str. Siriului Nr.20, sect.1 Bucuresti, Romania
# Corresponding author: S. Michel; e-mail: sebastian.michel@boku.ac.at
#####################################################################################################################
# Analysis of an example dataset using a forward prediction approach for the farinograph dough stability
# Date: 03/09/2017
# Author: Sebastian Michel
# For questions/suggestions/comments contact: sebastian.michel@boku.ac.at
#
# The example dataset contains:
# -->191 lines for training models (called basis population in the publication)
# -->70 lines for validating models (one of the independent validation populations mentioned in the publication)
# -->Phenotypic data for both the protein content (protein) and a dough rheological trait, in this case the
#    farinograph stability (fa.stab); please notice that the phenotypic values were standardized for
#    both traits and the training + validation sets respectively for the purpose of illustration
# -->Markerdata of 7687 GBS markers (M1,M2...) as well as a SDS-PAGE marker for the Glu-D1 locus
# -->Missing Markerdata was imputed with EM algorithm implemented in the A.mat function of rrBLUP (Endelman 2011)
#    and markers were coded as (1,0,-1) for homozygous major, heterozygous, and homozygous minor
# -->The Glu-D1 marker was coded as follows: "1" = '5+10' allele, "-1" = '2+12' allele, and "0" = heterozygous
#
# The analysis will follow the order in the manuscript:
# 1) Single-trait prediction using a GBLUP model
# 2) Single-trait prediction using a WBLUP model with Glu-D1 as fixed effect
# 3) Multi-trait prediction using a multi-variate model
# 4) Multi-trait prediction using the model-based index
#    -->We will assume prior information of the protein content in the multi-trait prediction;
#       results with no prior protein content information are provided in parentheses
# 5) A combination of the WBLUP model with multi-trait prediction using the model-based index
#
# -->All models were implemented with the sommer package version 2.9 (Covarrubias-Pazaran 2016)
#
# NOTE on the reference accuracies quoted below (r=...): they stem from the original
# version of this script, in which 'min.MAF=0.05' was accidentally passed to as.matrix()
# instead of A.mat() and was therefore silently ignored. The filter is now applied as
# intended, so the correlations obtained here may deviate slightly from those values.

library(sommer)

#######################
# -->Settings

# Set to FALSE to reproduce the "without prior protein information" scenario, i.e. the
# protein records of the validation lines are masked in sections 3) to 5) as well
PRIOR.PROTEIN <- TRUE

# Number of leading non-marker columns; every column after these is passed to A.mat()
# as marker data (column 1 = GEN, column 2 = fa.stab, column 5 = GLUD1; columns 3-4
# presumably hold protein and set -- verify against the csv header)
N.INFO.COLS <- 5

#######################
# -->Read-in the dataset (adjust the path to the folder holding the example dataset)
setwd("/Quality in wheat/R code/Example dataset")
DATA <- read.table("GS_Baking_Quali.csv", sep = ";", dec = ".", header = TRUE)
tapply(DATA$GEN, DATA$set, length)
str(DATA[, 1:10])

#######################
# -->Objects shared by all five analyses (previously rebuilt identically in every section)
DATA.VALID <- droplevels(DATA[DATA$set %in% "validation", ])
DATA.TRAIN <- droplevels(DATA[DATA$set %in% "training", ])
DATA.ALL <- rbind(DATA.TRAIN, DATA.VALID)

# -->Create the genomic relationship matrix
#    (min.MAF is an argument of A.mat(), not of as.matrix())
G.MATRIX <- A.mat(as.matrix(DATA.ALL[, -seq_len(N.INFO.COLS)]), min.MAF = 0.05)
rownames(G.MATRIX) <- colnames(G.MATRIX) <- DATA.ALL$GEN
dim(G.MATRIX)

# Traits whose validation records are hidden in the multi-trait analyses
MASKED.TRAITS <- if (PRIOR.PROTEIN) "fa.stab" else c("fa.stab", "protein")

# Return a copy of 'data' in which the given trait columns are set to NA for all
# lines of the validation set, so that they cannot inform the model fit.
mask_validation <- function(data, traits) {
  is.valid <- data$GEN %in% DATA.VALID$GEN
  data[is.valid, traits] <- NA
  data
}

# Print the correlation test between observed and predicted values of the validation
# lines and return the correlation coefficient (the prediction accuracy).
validation_accuracy <- function(observed, predicted) {
  test <- cor.test(observed, predicted)
  print(test)
  unname(test$estimate)
}

###############################################
# 1) Single-trait prediction using a GBLUP model

# -->Remove the phenotypic records from the validation set
DATA.MODEL <- mask_validation(DATA.ALL, "fa.stab")

# -->Fit the GBLUP model
MODEL <- mmer2(fixed = fa.stab ~ 1, random = ~g(GEN), G = list(GEN = G.MATRIX),
               data = DATA.MODEL, method = "NR")
GEBVs <- data.frame(GEN = rownames(MODEL$u.hat[[1]]),
                    gebv.stab = as.numeric(MODEL$u.hat[[1]]))

VALIDATION <- droplevels(merge(DATA.VALID[, c("GEN", "fa.stab")], GEBVs, by = "GEN"))
r.gblup <- validation_accuracy(VALIDATION$fa.stab, VALIDATION$gebv.stab) # reference: r=0.485

###############################################
# 2) Single-trait prediction using a WBLUP model with Glu-D1 as fixed effect

# -->Remove the phenotypic records from the validation set
DATA.MODEL <- mask_validation(DATA.ALL, "fa.stab")

# -->Fit the WBLUP model with Glu-D1 as fixed effect
#    (the fixed Glu-D1 effect has to be added to the random genetic effects)
MODEL <- mmer2(fixed = fa.stab ~ GLUD1, random = ~g(GEN), G = list(GEN = G.MATRIX),
               data = DATA.MODEL, method = "NR")
GEBVs <- data.frame(GEN = rownames(MODEL$u.hat[[1]]),
                    gebv.stab = as.numeric(MODEL$u.hat[[1]]),
                    b.glu = MODEL$beta.hat[2])
GEBVs <- droplevels(merge(GEBVs, DATA.MODEL[, c("GEN", "GLUD1")], by = "GEN"))
GEBVs$wblup.stab <- GEBVs$gebv.stab + GEBVs$b.glu * GEBVs$GLUD1

VALIDATION <- droplevels(merge(DATA.VALID[, c("GEN", "fa.stab")], GEBVs, by = "GEN"))
r.wblup <- validation_accuracy(VALIDATION$fa.stab, VALIDATION$wblup.stab) # reference: r=0.533

###############################################
# 3) Multi-trait prediction using a multi-variate model
#    assuming prior information of the protein content (see PRIOR.PROTEIN)

# -->Remove the phenotypic records from the validation set
DATA.MODEL <- mask_validation(DATA.ALL, MASKED.TRAITS)

# -->Fit the MV-GBLUP model
MODEL <- mmer2(fixed = cbind(fa.stab, protein) ~ 1, random = ~g(GEN),
               G = list(GEN = G.MATRIX), MVM = TRUE, data = DATA.MODEL, method = "NR")
GEBVs <- data.frame(GEN = names(MODEL$u.hat[[1]][, 1]),
                    gebv.stab = MODEL$u.hat[[1]][, 1])

VALIDATION <- droplevels(merge(DATA.VALID[, c("GEN", "fa.stab")], GEBVs, by = "GEN"))
# reference: r=0.517 (without prior protein information r=0.392)
r.mv <- validation_accuracy(VALIDATION$fa.stab, VALIDATION$gebv.stab)

###############################################
# 4) Multi-trait prediction using the model-based index
#    assuming prior information of the protein content (see PRIOR.PROTEIN)

# -->Remove the phenotypic records from the validation set
DATA.MODEL <- mask_validation(DATA.ALL, MASKED.TRAITS)

# -->Compute the GEBVs for the protein content and merge them to the training data
MODEL <- mmer2(fixed = protein ~ 1, random = ~g(GEN), G = list(GEN = G.MATRIX),
               data = DATA.MODEL, method = "NR")
GEBVs.PROT <- data.frame(GEN = rownames(MODEL$u.hat[[1]]),
                         gebv.protein = as.numeric(MODEL$u.hat[[1]]))
DATA.MODEL <- droplevels(merge(DATA.MODEL[, seq_len(N.INFO.COLS)], GEBVs.PROT, by = "GEN"))

# -->Fit the GBLUP model including the GEBVs of the protein content as fixed effect
MODEL <- mmer2(fixed = fa.stab ~ gebv.protein, random = ~g(GEN), G = list(GEN = G.MATRIX),
               data = DATA.MODEL, method = "NR")
GEBVs <- data.frame(GEN = rownames(MODEL$u.hat[[1]]),
                    gebv.stab = as.numeric(MODEL$u.hat[[1]]),
                    b.prot = MODEL$beta.hat[2])
GEBVs <- droplevels(merge(GEBVs, GEBVs.PROT, by = "GEN"))
GEBVs$index <- GEBVs$gebv.stab + GEBVs$b.prot * GEBVs$gebv.protein

VALIDATION <- droplevels(merge(DATA.VALID[, c("GEN", "fa.stab")], GEBVs, by = "GEN"))
# reference: r=0.505 (without prior protein information r=0.492)
r.index <- validation_accuracy(VALIDATION$fa.stab, VALIDATION$index)

# -->Slightly less accurate than the multi-variate mixed model in this case, but computationally less demanding
# -->A combination with the WBLUP model is additionally more straightforward with this method
# -->The method was furthermore more stable when no prior protein information was available

###############################################
# 5) A combination of the WBLUP model with multi-trait prediction using the model-based index

# -->Remove the phenotypic records from the validation set
DATA.MODEL <- mask_validation(DATA.ALL, MASKED.TRAITS)

# -->Compute the GEBVs for the protein content and merge them to the training data
MODEL <- mmer2(fixed = protein ~ 1, random = ~g(GEN), G = list(GEN = G.MATRIX),
               data = DATA.MODEL, method = "NR")
GEBVs.PROT <- data.frame(GEN = rownames(MODEL$u.hat[[1]]),
                         gebv.protein = as.numeric(MODEL$u.hat[[1]]))
DATA.MODEL <- droplevels(merge(DATA.MODEL[, seq_len(N.INFO.COLS)], GEBVs.PROT, by = "GEN"))

# -->Fit the model including both the GEBVs of the protein content and the Glu-D1 marker as fixed effect
MODEL <- mmer2(fixed = fa.stab ~ gebv.protein + GLUD1, random = ~g(GEN),
               G = list(GEN = G.MATRIX), data = DATA.MODEL, method = "NR")
GEBVs <- data.frame(GEN = rownames(MODEL$u.hat[[1]]),
                    gebv.stab = as.numeric(MODEL$u.hat[[1]]),
                    b.prot = MODEL$beta.hat[2],
                    b.glu = MODEL$beta.hat[3])
GEBVs <- droplevels(merge(GEBVs, GEBVs.PROT, by = "GEN"))
GEBVs <- droplevels(merge(GEBVs, DATA.MODEL[, c("GEN", "GLUD1")], by = "GEN"))
GEBVs$index <- GEBVs$gebv.stab + GEBVs$b.prot * GEBVs$gebv.protein + GEBVs$b.glu * GEBVs$GLUD1

VALIDATION <- droplevels(merge(DATA.VALID[, c("GEN", "fa.stab")], GEBVs, by = "GEN"))
# reference: r=0.547 (without prior protein information r=0.536)
r.full <- validation_accuracy(VALIDATION$fa.stab, VALIDATION$index)

# -->Comparison with the baseline accuracy of the GBLUP model (reference: r = 0.485)
#    computed from the correlations obtained above instead of hard-coded numbers
100 * (r.full - r.gblup) / r.gblup
# -->With the reference values, 100*(0.547-0.485)/0.485, the full model thus gives
#    an advantage of ~12.8% in accuracy in this example