#!/usr/local/bin/Rscript
#Use this scrpt to run rrBLUP, random forest, MLR, RKHS, or Bayes LASSO

cmdLineArgs = commandArgs(TRUE)

library(rrBLUP)
library(randomForest)
library(BLR)

if (length(cmdLineArgs)==0) {
   cat("ERROR:Not enough arguments given.\n")
   #cat("Plase provide a PREDICTOR NAME and file name.\n")
   cat("Usage: scriptName.R predictorName phenoFile genoFile trainingFilesDir validationFilesDir")
   cat("methods accepted are one of the following: rrBLUP, rrBLUP_Gauss, randomForest, bayesLasso, MLR 
\n")
   cat("Example: scriptName.R rrBLUP /mnt/CAC1-glusterfs/FLOATING_STORE/jes462/GS/Corrected_training_files_2_2014/fold_set_A/1_VP_2012_DS_TP_Everything_Before/1_train_w_vp_season/Pheno_all_2012_DS.csv /mnt/CAC1-glusterfs/FLOATING_STORE/jes462/GS/MET_crfilt_.90_outliers_removed_for_RRBlup_line_corrected.bin /mnt/CAC1-glusterfs/FLOATING_STORE/jes462/GS/Corrected_training_files_2_2014/fold_set_A/1_VP_2012_DS_TP_Everything_Before/1_train_w_vp_season/TP /mnt/CAC1-glusterfs/FLOATING_STORE/jes462/GS/Corrected_training_files_2_2014/fold_set_A/1_VP_2012_DS_TP_Everything_Before/1_train_w_vp_season/VP \n")
   quit()
}

predictorName <- cmdLineArgs[1] #statistical method one of: rrBLUP, rrBLUP_Gauss, randomForest, bayesLasso, MLR
phenoFile <- cmdLineArgs[2] #file containing all validation phenotypes
genoFile <- cmdLineArgs[3] #genotype file in rrBLUP format, r binary or .csv
trainingFilesDir <- cmdLineArgs[4] #directory containing training phenotypes for each fold
validationFilesDir <- cmdLineArgs[5] #directory containing validation phenotypes for each fold


runCrossVal= function(geno,predictor,train_files_dir,foldfiles,vpfiles,val_files_dir){
	nfoldfiles= length(foldfiles)
	
	for (i in 1:nfoldfiles){
		#Read in *corrected* training pheno
		phenoTrain= as.matrix(read.table(paste(train_files_dir,foldfiles[i],sep="/"), header=TRUE, row.names=1, sep=","))
		phenoTrain=na.omit(phenoTrain)
		
		phenoVal= as.matrix(read.table(paste(val_files_dir,vpfiles[i],sep="/"), header=TRUE, row.names=1, sep="," ))
			
		genoTrain= geno[row.names(phenoTrain),]
		genoPred= geno[row.names(phenoVal),]
		pred= predictor(phenoTrain, genoTrain, genoPred)
		#print(cor(pred,phenoVal))
		
		if(i == 1){
			cvPred=pred}
		
		else{cvPred= append(cvPred,pred)}			
	}
	Out=as.matrix(cvPred)
	return(Out)	
}

#Variation on runCrossVal for BLR package
runCrossValforBL= function(geno,predictor,train_files_dir,foldfiles,vpfiles,val_files_dir){
	nfoldfiles= length(foldfiles)
	
	for (i in 1:nfoldfiles){
		phenoTrain= as.matrix(read.table(paste(train_files_dir,foldfiles[i],sep="/"), header=TRUE, row.names=1, sep=","))
		genoTrain= geno[row.names(phenoTrain),]
		
		phenoVal= as.matrix(read.table(paste(val_files_dir,vpfiles[i],sep="/"), header=TRUE, row.names=1, sep="," ))
		genoPred= geno[row.names(phenoVal),]
		pred= predictor(phenoTrain, genoTrain, genoPred)
		
		if(i == 1){
			cvPred=pred}
		
		else{cvPred= rbind(cvPred,pred)}			
	}
	Out=as.matrix(cvPred)
	return(Out)	
}

runMLR= function(Pheno,geno,foldfiles, train_files_dir, vpfiles, val_files_dir){
	
	nfoldfiles= length(foldfiles)
	N=nrow(geno)
	p=ncol(geno)
	set.seed(1235)

	
	for (i in 1:nfoldfiles){
		phenoTrain= as.matrix(read.table(paste(train_files_dir,foldfiles[i],sep="/"), header=TRUE, row.names=1, sep=","))
		phenoTrain=na.omit(phenoTrain)
		genoTrain= geno[row.names(phenoTrain),]
		
		phenoVal= as.matrix(read.table(paste(val_files_dir,vpfiles[i],sep="/"), header=TRUE, row.names=1, sep="," ))
		#phenoVal= as.matrix(phenoVal[!rownames(phenoVal) %in% "", ])
		genoPred= geno[row.names(phenoVal),]
		PhenoPred= Pheno[row.names(phenoVal),]
				
		pValues=numeric()
		for (s in 1:p){
			fm=lm(phenoTrain~genoTrain[,s])
			D= dim(summary(fm)$coef)[1]
			if (D > 1){
				pValues[s]= summary(fm)$coef[2,4]}
				}
		
		#variable selection
		myRanking=order(pValues)
		sqCor= numeric()
		
		#determine the optimal number of markers/index for fold
		if(p < 100){
			maxIndex= p
		}
		else{maxIndex= 100}
		 
		for (s in 1:maxIndex){
			tmpIndex= myRanking[1:s]
			fm= lm(phenoTrain~genoTrain[,tmpIndex])
			bHat= coef(fm)[-1]
			bHat=ifelse(is.na(bHat),0,bHat)
			yHat=as.matrix(genoTrain[,tmpIndex,drop=FALSE])%*%bHat
			sqCor[s]= cor(phenoTrain,yHat)^2
			}
			
		Max=which.max(sqCor)
		fIndex= myRanking[1:Max]
		fm=lm(phenoTrain~genoTrain[,fIndex])
		bHat= coef(fm)[-1]
		bHat=ifelse(is.na(bHat),0,bHat)
		yHat=as.matrix(genoPred[,fIndex])%*%bHat
	
		if(i==1){
			cvPred=yHat}
		else{cvPred=rbind(cvPred,yHat)}
			}
	cvPred= cvPred[match(row.names(cvPred),row.names(Pheno)),1,drop=FALSE]
	saveAcc= cor(cvPred,Pheno)
	print(saveAcc)
	return(saveAcc)
}


runCrossValCorr=function(Out, Pheno){

	matched= Out[match(row.names(Pheno),row.names(Out)),]
	saveAcc= cor(matched,Pheno)
	print(saveAcc)
	return(saveAcc)
}


predictor_rrBLUP <- function(phenoTrain, genoTrain, genoPred){
	return(kinship.BLUP(phenoTrain, genoTrain, genoPred)$g.pred)
}
predictor_rrBLUP_Gauss <- function(phenoTrain, genoTrain, genoPred){
	source('/mnt/CAC1-glusterfs/FLOATING_STORE/jes462/scripts/kinship.BLUP3.R',chdir = TRUE)
	return(kinship.BLUP3(phenoTrain, genoTrain, genoPred, K.method="GAUSS",theta=2.5)$g.pred)
}
predictor_randomForest <- function(phenoTrain, genoTrain, genoPred){
	return(randomForest(genoTrain, phenoTrain, xtest=genoPred)$test$predicted)
}

predictor_BayesLasso <- function(phenoTrain, genoTrain, genoPred){
	ms.out= mixed.solve(phenoTrain, K=A.mat(genoTrain))
	varA <- ms.out$Vu
	varE <- ms.out$Ve
	print(paste("Estimated add gen var:", varA))
	
	genVar <- varA
	errVar <- varE
	varEdf <- 3 # degrees of freedom for degree of belief in error variance
	adjustX <- mean(rowSums(genoTrain^2))
	lambda <- sqrt(2 * errVar / genVar * adjustX)
	prior <- list(
		varE=list(S=errVar * (varEdf + 2), df=varEdf),
		lambda=list(type="fixed", value=lambda)
	)
	mrkEff <- BLR(y=phenoTrain, XF=array(1, c(length(phenoTrain), 1)), XL=genoTrain, nIter=6000, burnIn=1000, thin=10, prior=prior)$bL
	return(genoPred %*% mrkEff)
}


#this works as a selector to switch the appropiate predictor function given as a parameter
getPredictorFunction <- function(predictorName)
{
	return (switch(predictorName,
		 'rrBLUP'=predictor_rrBLUP, 
		 'rrBLUP_Gauss'=predictor_rrBLUP_Gauss, 
		 'randomForest'=predictor_randomForest, 
		 'bayesLasso'=predictor_BayesLasso, 
		 'NOT_DEFINED'))
}

getCorssValidatorFunction <- function(predictorName)
{
	return (switch(predictorName,
		'rrBLUP'=runCrossVal,
		'rrBLUP_Gauss'=runCrossVal,
		'randomForest'=runCrossVal, 
		'bayesLasso'=runCrossValforBL, 
		'NOT_DEFINED'))
}

#this function checks if a string has a specific substring at the end.

stringEndsWith <- function(sourceString, targetEndingString)
{
	endingString = substr(sourceString, nchar(sourceString)-nchar(targetEndingString)+1,nchar(sourceString))
	return( endingString==targetEndingString)
}

train_files_dir <- trainingFilesDir
val_files_dir <- validationFilesDir

foldfiles= list.files(train_files_dir)
vpfiles= list.files(val_files_dir)

if (stringEndsWith(genoFile, ".bin")){
	#this will load genoFile into the geno matrix which was previously saved as .
	# to create the bin file do this:
	#  binFile = sub('.csv','.bin',genoFile)
	#  save(geno,file=binFile)
	#  
	load(genoFile)
}else{
	#print(genoFile)
	geno= as.matrix(read.table(genoFile,header=TRUE, row.names=1, sep=","))
}

if (stringEndsWith(phenoFile, ".bin")){
	#this will load penoFile into the Pheno matrix which was previously saved as .bin
	# to create the bin file do this:
	#  binFile = sub('.csv','.bin',phenoFile)
	#  save(Pheno,file=binFile)
	#
	#print(paste("Loading Phenotypes from BIN", phenoFile)) 
	load(phenoFile)
}else{
	#print(paste("Loading Phenotypes from TEXT(CSV)", phenoFile)) 
	#print(phenoFile)
	Pheno= as.matrix(read.table(phenoFile, header=TRUE, row.names=1, sep=","))
}


#predictorName can be one of the following:
# rrBLUP 
# rrBLUP_Gauss
# randomForest
# bayesLasso
#

#Example of a RUN
#set predictor name to rrBLUP
#predictorName = 'rrBLUP'

#the correct prediction functions will be loaded
#This table describes which function is run for a given predictorName parameter
# PredictionMethod   PredictorFunction          CrossValidationFunc
# rrBLUP             predictor_rrBLUP           runCrossVal
# rrBLUP_Gauss       predictor_rrBLUP_Gauss     runCrossVal
# randomForest       predictor_randomForest     runCrossVal
# bayesLasso         predictor_BayesLasso       runCrossValforBL
# MLR                    NA                     runMLR
# others             NOT_DEFINED                NOT_DEFINED

## Use the code below to run a single cross validation experiment outside of any loop

if (predictorName == 'MLR'){
	#RUN THE MLRCOR
	MLRcor= runMLR(Pheno=Pheno,geno=geno,foldfiles= foldfiles, train_files_dir=train_files_dir,val_files_dir=val_files_dir,vpfiles=vpfiles)

}else{
	predictorFunc = getPredictorFunction(predictorName)
	crossValFunc = getCorssValidatorFunction(predictorName)

	#execution of the functions
	crossval_Pred_Generic = crossValFunc(geno=geno, predictor=predictorFunc, foldfiles=foldfiles,train_files_dir=train_files_dir,vpfiles=vpfiles,val_files_dir=val_files_dir)
	crossval_cor_Generic = runCrossValCorr(Out= crossval_Pred_Generic, Pheno=Pheno)

}