#!/usr/local/bin/Rscript #Use this scrpt to run rrBLUP, random forest, MLR, RKHS, or Bayes LASSO cmdLineArgs = commandArgs(TRUE) library(rrBLUP) library(randomForest) library(BLR) if (length(cmdLineArgs)==0) { cat("ERROR:Not enough arguments given.\n") #cat("Plase provide a PREDICTOR NAME and file name.\n") cat("Usage: scriptName.R predictorName phenoFile genoFile trainingFilesDir validationFilesDir") cat("methods accepted are one of the following: rrBLUP, rrBLUP_Gauss, randomForest, bayesLasso, MLR \n") cat("Example: scriptName.R rrBLUP /mnt/CAC1-glusterfs/FLOATING_STORE/jes462/GS/Corrected_training_files_2_2014/fold_set_A/1_VP_2012_DS_TP_Everything_Before/1_train_w_vp_season/Pheno_all_2012_DS.csv /mnt/CAC1-glusterfs/FLOATING_STORE/jes462/GS/MET_crfilt_.90_outliers_removed_for_RRBlup_line_corrected.bin /mnt/CAC1-glusterfs/FLOATING_STORE/jes462/GS/Corrected_training_files_2_2014/fold_set_A/1_VP_2012_DS_TP_Everything_Before/1_train_w_vp_season/TP /mnt/CAC1-glusterfs/FLOATING_STORE/jes462/GS/Corrected_training_files_2_2014/fold_set_A/1_VP_2012_DS_TP_Everything_Before/1_train_w_vp_season/VP \n") quit() } predictorName <- cmdLineArgs[1] #statistical method one of: rrBLUP, rrBLUP_Gauss, randomForest, bayesLasso, MLR phenoFile <- cmdLineArgs[2] #file containing all validation phenotypes genoFile <- cmdLineArgs[3] #genotype file in rrBLUP format, r binary or .csv trainingFilesDir <- cmdLineArgs[4] #directory containing training phenotypes for each fold validationFilesDir <- cmdLineArgs[5] #directory containing validation phenotypes for each fold runCrossVal= function(geno,predictor,train_files_dir,foldfiles,vpfiles,val_files_dir){ nfoldfiles= length(foldfiles) for (i in 1:nfoldfiles){ #Read in *corrected* training pheno phenoTrain= as.matrix(read.table(paste(train_files_dir,foldfiles[i],sep="/"), header=TRUE, row.names=1, sep=",")) phenoTrain=na.omit(phenoTrain) phenoVal= as.matrix(read.table(paste(val_files_dir,vpfiles[i],sep="/"), header=TRUE, row.names=1, sep="," )) genoTrain= geno[row.names(phenoTrain),] genoPred= geno[row.names(phenoVal),] pred= predictor(phenoTrain, genoTrain, genoPred) #print(cor(pred,phenoVal)) if(i == 1){ cvPred=pred} else{cvPred= append(cvPred,pred)} } Out=as.matrix(cvPred) return(Out) } #Variation on runCrossVal for BLR package runCrossValforBL= function(geno,predictor,train_files_dir,foldfiles,vpfiles,val_files_dir){ nfoldfiles= length(foldfiles) for (i in 1:nfoldfiles){ phenoTrain= as.matrix(read.table(paste(train_files_dir,foldfiles[i],sep="/"), header=TRUE, row.names=1, sep=",")) genoTrain= geno[row.names(phenoTrain),] phenoVal= as.matrix(read.table(paste(val_files_dir,vpfiles[i],sep="/"), header=TRUE, row.names=1, sep="," )) genoPred= geno[row.names(phenoVal),] pred= predictor(phenoTrain, genoTrain, genoPred) if(i == 1){ cvPred=pred} else{cvPred= rbind(cvPred,pred)} } Out=as.matrix(cvPred) return(Out) } runMLR= function(Pheno,geno,foldfiles, train_files_dir, vpfiles, val_files_dir){ nfoldfiles= length(foldfiles) N=nrow(geno) p=ncol(geno) set.seed(1235) for (i in 1:nfoldfiles){ phenoTrain= as.matrix(read.table(paste(train_files_dir,foldfiles[i],sep="/"), header=TRUE, row.names=1, sep=",")) phenoTrain=na.omit(phenoTrain) genoTrain= geno[row.names(phenoTrain),] phenoVal= as.matrix(read.table(paste(val_files_dir,vpfiles[i],sep="/"), header=TRUE, row.names=1, sep="," )) #phenoVal= as.matrix(phenoVal[!rownames(phenoVal) %in% "", ]) genoPred= geno[row.names(phenoVal),] PhenoPred= Pheno[row.names(phenoVal),] pValues=numeric() for (s in 1:p){ fm=lm(phenoTrain~genoTrain[,s]) D= dim(summary(fm)$coef)[1] if (D > 1){ pValues[s]= summary(fm)$coef[2,4]} } #variable selection myRanking=order(pValues) sqCor= numeric() #determine the optimal number of markers/index for fold if(p < 100){ maxIndex= p } else{maxIndex= 100} for (s in 1:maxIndex){ tmpIndex= myRanking[1:s] fm= lm(phenoTrain~genoTrain[,tmpIndex]) bHat= coef(fm)[-1] bHat=ifelse(is.na(bHat),0,bHat) yHat=as.matrix(genoTrain[,tmpIndex,drop=FALSE])%*%bHat sqCor[s]= cor(phenoTrain,yHat)^2 } Max=which.max(sqCor) fIndex= myRanking[1:Max] fm=lm(phenoTrain~genoTrain[,fIndex]) bHat= coef(fm)[-1] bHat=ifelse(is.na(bHat),0,bHat) yHat=as.matrix(genoPred[,fIndex])%*%bHat if(i==1){ cvPred=yHat} else{cvPred=rbind(cvPred,yHat)} } cvPred= cvPred[match(row.names(cvPred),row.names(Pheno)),1,drop=FALSE] saveAcc= cor(cvPred,Pheno) print(saveAcc) return(saveAcc) } runCrossValCorr=function(Out, Pheno){ matched= Out[match(row.names(Pheno),row.names(Out)),] saveAcc= cor(matched,Pheno) print(saveAcc) return(saveAcc) } predictor_rrBLUP <- function(phenoTrain, genoTrain, genoPred){ return(kinship.BLUP(phenoTrain, genoTrain, genoPred)$g.pred) } predictor_rrBLUP_Gauss <- function(phenoTrain, genoTrain, genoPred){ source('/mnt/CAC1-glusterfs/FLOATING_STORE/jes462/scripts/kinship.BLUP3.R',chdir = TRUE) return(kinship.BLUP3(phenoTrain, genoTrain, genoPred, K.method="GAUSS",theta=2.5)$g.pred) } predictor_randomForest <- function(phenoTrain, genoTrain, genoPred){ return(randomForest(genoTrain, phenoTrain, xtest=genoPred)$test$predicted) } predictor_BayesLasso <- function(phenoTrain, genoTrain, genoPred){ ms.out= mixed.solve(phenoTrain, K=A.mat(genoTrain)) varA <- ms.out$Vu varE <- ms.out$Ve print(paste("Estimated add gen var:", varA)) genVar <- varA errVar <- varE varEdf <- 3 # degrees of freedom for degree of belief in error variance adjustX <- mean(rowSums(genoTrain^2)) lambda <- sqrt(2 * errVar / genVar * adjustX) prior <- list( varE=list(S=errVar * (varEdf + 2), df=varEdf), lambda=list(type="fixed", value=lambda) ) mrkEff <- BLR(y=phenoTrain, XF=array(1, c(length(phenoTrain), 1)), XL=genoTrain, nIter=6000, burnIn=1000, thin=10, prior=prior)$bL return(genoPred %*% mrkEff) } #this works as a selector to switch the appropiate predictor function given as a parameter getPredictorFunction <- function(predictorName) { return (switch(predictorName, 'rrBLUP'=predictor_rrBLUP, 'rrBLUP_Gauss'=predictor_rrBLUP_Gauss, 'randomForest'=predictor_randomForest, 'bayesLasso'=predictor_BayesLasso, 'NOT_DEFINED')) } getCorssValidatorFunction <- function(predictorName) { return (switch(predictorName, 'rrBLUP'=runCrossVal, 'rrBLUP_Gauss'=runCrossVal, 'randomForest'=runCrossVal, 'bayesLasso'=runCrossValforBL, 'NOT_DEFINED')) } #this function checks if a string has a specific substring at the end. stringEndsWith <- function(sourceString, targetEndingString) { endingString = substr(sourceString, nchar(sourceString)-nchar(targetEndingString)+1,nchar(sourceString)) return( endingString==targetEndingString) } train_files_dir <- trainingFilesDir val_files_dir <- validationFilesDir foldfiles= list.files(train_files_dir) vpfiles= list.files(val_files_dir) if (stringEndsWith(genoFile, ".bin")){ #this will load genoFile into the geno matrix which was previously saved as . # to create the bin file do this: # binFile = sub('.csv','.bin',genoFile) # save(geno,file=binFile) # load(genoFile) }else{ #print(genoFile) geno= as.matrix(read.table(genoFile,header=TRUE, row.names=1, sep=",")) } if (stringEndsWith(phenoFile, ".bin")){ #this will load penoFile into the Pheno matrix which was previously saved as .bin # to create the bin file do this: # binFile = sub('.csv','.bin',phenoFile) # save(Pheno,file=binFile) # #print(paste("Loading Phenotypes from BIN", phenoFile)) load(phenoFile) }else{ #print(paste("Loading Phenotypes from TEXT(CSV)", phenoFile)) #print(phenoFile) Pheno= as.matrix(read.table(phenoFile, header=TRUE, row.names=1, sep=",")) } #predictorName can be one of the following: # rrBLUP # rrBLUP_Gauss # randomForest # bayesLasso # #Example of a RUN #set predictor name to rrBLUP #predictorName = 'rrBLUP' #the correct prediction functions will be loaded #This table describes which function is run for a given predictorName parameter # PredictionMethod PredictorFunction CrossValidationFunc # rrBLUP predictor_rrBLUP runCrossVal # rrBLUP_Gauss predictor_rrBLUP_Gauss runCrossVal # randomForest predictor_randomForest runCrossVal # bayesLasso predictor_BayesLasso runCrossValforBL # MLR NA runMLR # others NOT_DEFINED NOT_DEFINED ## Use the code below to run a single cross validation experiment outside of any loop if (predictorName == 'MLR'){ #RUN THE MLRCOR MLRcor= runMLR(Pheno=Pheno,geno=geno,foldfiles= foldfiles, train_files_dir=train_files_dir,val_files_dir=val_files_dir,vpfiles=vpfiles) }else{ predictorFunc = getPredictorFunction(predictorName) crossValFunc = getCorssValidatorFunction(predictorName) #execution of the functions crossval_Pred_Generic = crossValFunc(geno=geno, predictor=predictorFunc, foldfiles=foldfiles,train_files_dir=train_files_dir,vpfiles=vpfiles,val_files_dir=val_files_dir) crossval_cor_Generic = runCrossValCorr(Out= crossval_Pred_Generic, Pheno=Pheno) }