#!/usr/local/bin/Rscript #This script is for GS + historical GWAS #run from the command line library(rrBLUP) cmdLineArgs = commandArgs(TRUE) trait_markers <- cmdLineArgs[1] #list of markers to include as fixed effects, seperate markers with comma, no spaces! phenoFile <- cmdLineArgs[2] #list of all validation phenotypes genoFile <- cmdLineArgs[3] #genotype file as an R binary in rrBLUP format trainingFilesDir <- cmdLineArgs[4] #directory containing training phenotypes for each fold validationFilesDir <- cmdLineArgs[5] #directory containing validation phenotypes for each fold getdesignmat=function(geno, inds){ genoTrunc=geno[,inds,drop=FALSE] inc=rep(1,dim(genoTrunc)[1]) X= cbind(inc,genoTrunc) X= X[, !duplicated(t(X))] if (ncol(X) == 1){ return(NULL) } else{ return(X) } } getxpred=function(genofull,genopred,markers){ inds=ConvertSNPnames(markers,genofull) genotrunc=genopred[,inds,drop=FALSE] inc=rep(1,dim(genotrunc)[1]) X= cbind(inc, genotrunc) return(X) } ConvertSNPnames=function(snplist,geno){ out=numeric() for (i in 1:length(snplist)){ ind=which(colnames(geno)==snplist[i]) out[i]=ind } return(out) } get_fixed_markers= function(INDs,geno,train_files_dir, foldfiles, FOLD){ p=ncol(geno) set.seed(1235) out=numeric() #read in fold specific training data to find SNPs that have best fit phenoTrain= as.matrix(read.table(paste(train_files_dir,foldfiles[FOLD],sep="/"), header=TRUE, row.names=1, sep=",")) phenoTrain=na.omit(phenoTrain) genoTrain= geno[row.names(phenoTrain),] topsnps=INDs sqCor= numeric() tempindices=list() N=length(topsnps) #test all combinations of topsnps itt=1 for (s in 1:N){ Combinations_Listing= combn(1:N,s,simplify=FALSE) for (j in 1:length(Combinations_Listing)){ TEST_COMBINATION=unlist(Combinations_Listing[j]) if (s==1) { tmpIndex= topsnps[j] } else { tmpIndex=numeric() for (k in 1:length(TEST_COMBINATION)){ tmpIndex[k]= topsnps[k] } } tempindices[[length(tempindices)+1]]=list(tmpIndex) fm= lm(phenoTrain~genoTrain[,tmpIndex,drop=FALSE]) bHat= coef(fm)[-1] bHat=ifelse(is.na(bHat),0,bHat) yHat=as.matrix(genoTrain[,tmpIndex,drop=FALSE])%*%bHat COR= cor(phenoTrain,yHat)^2 sqCor[itt]=COR itt=itt+1 } } Max=which.max(sqCor) fIndex=tempindices[[Max]] return(fIndex) } runCrossValwfixed2=function(geno, Pheno, train_files_dir,foldfiles,vpfiles,val_files_dir,Markers){ nfoldfiles= length(foldfiles) INDs=ConvertSNPnames(snplist=Markers,geno) for (i in 1:nfoldfiles){ phenoTrain= as.matrix(read.table(paste(train_files_dir,foldfiles[i],sep="/"), header=TRUE, row.names=1, sep=",")) phenoTrain=na.omit(phenoTrain) phenoVal= as.matrix(read.table(paste(val_files_dir,vpfiles[i],sep="/"), header=TRUE, row.names=1, sep="," )) genoTrain= geno[row.names(phenoTrain),] genoPred= geno[row.names(phenoVal),] final_INDs= get_fixed_markers(INDs,geno,train_files_dir, foldfiles, FOLD=i) X= getdesignmat(genoTrain,unlist(final_INDs)) This_fold_fixed_markers=colnames(X)[2:length(colnames(X))] print(paste("fold",i)) print(This_fold_fixed_markers) kinout= kinship.BLUP(phenoTrain, genoTrain, genoPred, X) gpred=as.matrix(kinout$g.pred) Xpred= getxpred(geno, genoPred, This_fold_fixed_markers) pred= Xpred%*%kinout$beta + gpred if(i == 1){ cvPred=pred } else{cvPred= rbind(cvPred,pred) } } return(cvPred) } runCrossValCorr=function(Out, Pheno){ matched= Out[match(row.names(Pheno),row.names(Out)),] saveAcc= cor(matched,Pheno) print(saveAcc) return(saveAcc) } load(genoFile) train_files_dir <- trainingFilesDir val_files_dir <- validationFilesDir foldfiles= list.files(train_files_dir) vpfiles= list.files(val_files_dir) Pheno= as.matrix(read.table(phenoFile,header=TRUE, row.names=1, sep=",")) Markers=lapply(strsplit(trait_markers,','),as.vector)[[1]] wFixed=runCrossValwfixed2(geno=geno, Pheno=Pheno, train_files_dir= train_files_dir,foldfiles= foldfiles,vpfiles= vpfiles,val_files_dir= val_files_dir,Markers= Markers) Cor= runCrossValCorr(wFixed,Pheno)