###################################################################### # S3 File: R script for GBVs prediction into a cross validation scheme #--------------------------------------------------------------------- # Paper: "Tournaments between markers as a strategy to enhance genomic # predictions" published in Plos One #--------------------------------------------------------------------- # This R script uses the SNPs selected by Tournaments to fit predictive # models using Bayesian Lasso, into a cross validation scheme. ###################################################################### #------------------------------------------------------------ #Removing objects #------------------------------------------------------------ rm(list=ls(all=TRUE)) #------------------------------------------------------------ #Libraries #------------------------------------------------------------ library(BPrimm) library(parallel) ncores <- detectCores() options(cores=ncores) getOption('cores') #------------------------------------------------------------ #Loading data #------------------------------------------------------------ load("genotypes.RData") load("phenotypes.RData") #------------------------------------------------------------ #Loading Tournament Results #------------------------------------------------------------ load(paste("tournaments.rda")) #--------------------------------------------------------------------- #Rank of SNPs #--------------------------------------------------------------------- rank <- c(TournamentReults[[6]],rev(TournamentReults[[5]])) #----------------------------------------------------------------------- #Dividing the samples into groups for the Cross Validation #----------------------------------------------------------------------- tgval <- 48 #Tamanho dos grupos da validacao cruzada ngval <- nrow(genotypes)/tgval #Numero de grupos da validacao cruzada samp_ind <- sample(1:nrow(genotypes)) #Aleatoriza os individuos gval_aux <- rep(1:ngval,each=tgval) gval <- lapply(1:ngval,function(i) sort(samp_ind[gval_aux==i])) #Grupos #Adjusting models with up to 50,000 SNPs #Can be fitted models with up to 530,566 SNPs nfinal <- c(100,250,500,1000,2500,5000,10000,25000,50000) #----------------------------------------------------------------------- #Combinations of Cross Validation Groups and Number of Selected SNPs #----------------------------------------------------------------------- ncomb <- length(nfinal)*ngval i <- rep(1:ngval,length(nfinal)) j <- rep(nfinal,each=ngval) cbind(i,j,1:ncomb) #----------------------------------------------------------------------- #Function to adjust the Bayesian LASSO (model for prediction of GBVs) #in a cross-validation scheme #----------------------------------------------------------------------- BL_func <- function(comb){ #----------------------------------------------------------------------- #Dividing Genotypic matrix (X) and phenotype vector (y) #for training set and testing set samples #----------------------------------------------------------------------- testing <- as.numeric(unlist(gval[i[comb]])) training <- sort(as.numeric(unlist(gval[-i[comb]]))) XTr <- genotypes[training,] XTe <- genotypes[testing,] yTr <- phenotypes[training] yTe <- phenotypes[testing] y <- yTr-mean(yTr) mu <- mean(yTr) #----------------------------------------------------------------------- #SNPs with larger estimates modules #----------------------------------------------------------------------- maiores <- rank[1:j[comb]] #----------------------------------------------------------------------- #Bayesian Lasso #----------------------------------------------------------------------- BL <- Bayes.Lasso(y, XTr[,maiores], b=NULL, n.burn = 2000, n.thin = 20, n.iter = 100, s = 0.01, r = 0.01, b.update.order = 1, method=1) BLb <- BL$b b <- apply(BLb,1,mean) #----------------------------------------------------------------------- #Predicted phenotypes (yHat): for the training set and for the testing set #----------------------------------------------------------------------- yHatTr <- mu+XTr[,maiores]%*%b yHatTe <- mu+XTe[,maiores]%*%b #----------------------------------------------------------------------- #Correlations between yhat and y in the training set and in the testing set #----------------------------------------------------------------------- corTr <- cor(yHatTr,yTr) corTe <- cor(yHatTe,yTe) #----------------------------------------------------------------------- # Results (correlations) #----------------------------------------------------------------------- result <- round(c(mselec=length(maiores),vc=i[comb],corTr=corTr,corTe=corTe),4) } correlacoes <- mclapply(1:ncomb,BL_func) save(correlacoes,file=paste("Correlations_CrossValidation.rda")) cor1 <- matrix(unlist(correlacoes),ncol=4,byrow=T) colnames(cor1) <- names(correlacoes[[1]]) cor2 <- cbind(tapply(cor1[,1],cor1[,1],mean),tapply(cor1[,3],cor1[,1],mean),tapply(cor1[,4],cor1[,1],mean)) cor2 sd2 <- cbind(tapply(cor1[,1],cor1[,1],mean),tapply(cor1[,3],cor1[,1],sd),tapply(cor1[,4],cor1[,1],sd)) sd2 png("Correlations.png",width=800,height=520) par(mai=c(1,1,1,1)) plot(cor2[,2],ylim=c(0.5,1.2),axes=F,pch=19,cex=1.5,xlab="Number of SNPs",ylab=expression(cor(hat(y),y)), main=expression("Correlations between predicted phenotypes and observed phenotypes")) axis(1,1:nrow(cor2),as.numeric(cor2[,1]),cex=0.6) axis(2,seq(0,1,0.1)) lines(cor2[,2]) points(cor2[,3],pch=19,col="red",cex=1.5) lines(cor2[,3],col="red") for(i in 1:nrow(cor2)) segments(i,(cor2[,2][i]-sd2[,2][i]),i,(cor2[,2][i]+sd2[,2][i]),lwd=2) for(i in 1:nrow(cor2)) segments(i-0.05,(cor2[,2][i]-sd2[,2][i]),i+0.05,(cor2[,2][i]-sd2[,2][i]),lwd=2) for(i in 1:nrow(cor2)) segments(i-0.05,(cor2[,2][i]+sd2[,2][i]),i+0.05,(cor2[,2][i]+sd2[,2][i]),lwd=2) for(i in 1:nrow(cor2)) segments(i,(cor2[,3][i]-sd2[,3][i]),i,(cor2[,3][i]+sd2[,3][i]),lwd=2,col="red") for(i in 1:nrow(cor2)) segments(i-0.05,(cor2[,3][i]-sd2[,3][i]),i+0.05,(cor2[,3][i]-sd2[,3][i]),lwd=2,col="red") for(i in 1:nrow(cor2)) segments(i-0.05,(cor2[,3][i]+sd2[,3][i]),i+0.05,(cor2[,3][i]+sd2[,3][i]),lwd=2,col="red") legend(7,0.6,c("Training set","Testing set"),col=c("black","red"),pch=19,lwd=2) dev.off()