######################################################################
# S3 File: R script for GBVs prediction into a cross validation scheme
#---------------------------------------------------------------------
# Paper: "Tournaments between markers as a strategy to enhance genomic 
# predictions" published in Plos One
#---------------------------------------------------------------------
# This R script uses the SNPs selected by Tournaments to fit predictive 
# models using Bayesian Lasso, into a cross validation scheme.
######################################################################



#------------------------------------------------------------
#Removing objects
#------------------------------------------------------------
rm(list=ls(all=TRUE))

#------------------------------------------------------------
#Libraries
#------------------------------------------------------------
library(BPrimm)
library(parallel)
ncores <- detectCores()
options(cores=ncores)
getOption('cores')

#------------------------------------------------------------
#Loading data
#------------------------------------------------------------
load("genotypes.RData")
load("phenotypes.RData")

#------------------------------------------------------------
#Loading Tournament Results
#------------------------------------------------------------
load(paste("tournaments.rda"))

#---------------------------------------------------------------------
#Rank of SNPs
#---------------------------------------------------------------------
rank <- c(TournamentReults[[6]],rev(TournamentReults[[5]]))


#-----------------------------------------------------------------------
#Dividing the samples into groups for the Cross Validation
#-----------------------------------------------------------------------
tgval <- 48 #Tamanho dos grupos da validacao cruzada
ngval <- nrow(genotypes)/tgval #Numero de grupos da validacao cruzada
samp_ind <- sample(1:nrow(genotypes)) #Aleatoriza os individuos
gval_aux <- rep(1:ngval,each=tgval)
gval <- lapply(1:ngval,function(i) sort(samp_ind[gval_aux==i])) #Grupos


#Adjusting models with up to 50,000 SNPs
#Can be fitted models with up to 530,566 SNPs
nfinal <- c(100,250,500,1000,2500,5000,10000,25000,50000)

#-----------------------------------------------------------------------
#Combinations of Cross Validation Groups and Number of Selected SNPs
#-----------------------------------------------------------------------
ncomb <- length(nfinal)*ngval
i <- rep(1:ngval,length(nfinal))
j <- rep(nfinal,each=ngval)
cbind(i,j,1:ncomb)

#-----------------------------------------------------------------------
#Function to adjust the Bayesian LASSO (model for prediction of GBVs)
#in a cross-validation scheme
#-----------------------------------------------------------------------
BL_func <- function(comb){
  
  #-----------------------------------------------------------------------
  #Dividing Genotypic matrix (X) and phenotype vector (y) 
  #for training set and testing set samples
  #-----------------------------------------------------------------------
  testing <- as.numeric(unlist(gval[i[comb]]))
  training <- sort(as.numeric(unlist(gval[-i[comb]])))
  XTr <- genotypes[training,]
  XTe <- genotypes[testing,]
  yTr <- phenotypes[training]
  yTe <- phenotypes[testing]
  y <- yTr-mean(yTr)
  mu <- mean(yTr)
  
  #-----------------------------------------------------------------------
  #SNPs with larger estimates modules
  #-----------------------------------------------------------------------
  maiores <- rank[1:j[comb]]
  
  #-----------------------------------------------------------------------
  #Bayesian Lasso
  #-----------------------------------------------------------------------
  BL <- Bayes.Lasso(y, XTr[,maiores], b=NULL, n.burn = 2000, n.thin = 20, n.iter = 100, s = 0.01,
                    r = 0.01, b.update.order = 1, method=1)
  BLb <- BL$b
  b <- apply(BLb,1,mean)
  
  #-----------------------------------------------------------------------
  #Predicted phenotypes (yHat): for the training set and for the testing set
  #-----------------------------------------------------------------------
  yHatTr <- mu+XTr[,maiores]%*%b
  yHatTe <- mu+XTe[,maiores]%*%b
  
  #-----------------------------------------------------------------------
  #Correlations between yhat and y in the training set and in the testing set
  #-----------------------------------------------------------------------  
  corTr <- cor(yHatTr,yTr)
  corTe <- cor(yHatTe,yTe)
  
  #-----------------------------------------------------------------------
  # Results (correlations)
  #----------------------------------------------------------------------- 
  result <- round(c(mselec=length(maiores),vc=i[comb],corTr=corTr,corTe=corTe),4)
  
}

correlacoes <- mclapply(1:ncomb,BL_func)
save(correlacoes,file=paste("Correlations_CrossValidation.rda"))

cor1 <- matrix(unlist(correlacoes),ncol=4,byrow=T)
colnames(cor1) <- names(correlacoes[[1]])
cor2 <- cbind(tapply(cor1[,1],cor1[,1],mean),tapply(cor1[,3],cor1[,1],mean),tapply(cor1[,4],cor1[,1],mean))
cor2
sd2 <- cbind(tapply(cor1[,1],cor1[,1],mean),tapply(cor1[,3],cor1[,1],sd),tapply(cor1[,4],cor1[,1],sd))
sd2

png("Correlations.png",width=800,height=520)
par(mai=c(1,1,1,1))
plot(cor2[,2],ylim=c(0.5,1.2),axes=F,pch=19,cex=1.5,xlab="Number of SNPs",ylab=expression(cor(hat(y),y)), main=expression("Correlations between predicted phenotypes and observed phenotypes"))
axis(1,1:nrow(cor2),as.numeric(cor2[,1]),cex=0.6)
axis(2,seq(0,1,0.1))
lines(cor2[,2])
points(cor2[,3],pch=19,col="red",cex=1.5)
lines(cor2[,3],col="red")
for(i in 1:nrow(cor2)) segments(i,(cor2[,2][i]-sd2[,2][i]),i,(cor2[,2][i]+sd2[,2][i]),lwd=2)
for(i in 1:nrow(cor2)) segments(i-0.05,(cor2[,2][i]-sd2[,2][i]),i+0.05,(cor2[,2][i]-sd2[,2][i]),lwd=2)
for(i in 1:nrow(cor2)) segments(i-0.05,(cor2[,2][i]+sd2[,2][i]),i+0.05,(cor2[,2][i]+sd2[,2][i]),lwd=2)
for(i in 1:nrow(cor2)) segments(i,(cor2[,3][i]-sd2[,3][i]),i,(cor2[,3][i]+sd2[,3][i]),lwd=2,col="red")
for(i in 1:nrow(cor2)) segments(i-0.05,(cor2[,3][i]-sd2[,3][i]),i+0.05,(cor2[,3][i]-sd2[,3][i]),lwd=2,col="red")
for(i in 1:nrow(cor2)) segments(i-0.05,(cor2[,3][i]+sd2[,3][i]),i+0.05,(cor2[,3][i]+sd2[,3][i]),lwd=2,col="red")
legend(7,0.6,c("Training set","Testing set"),col=c("black","red"),pch=19,lwd=2)
dev.off()