.First <- function() { options( repos = c(CRAN = "https://cran.rstudio.com/"), download.file.method = "wget" ) } .First() #install.packages("ff", dependencies=T) #install.packages("ffbase",dependencies=TRUE) #install.packages("rrBLUP",dependencies=TRUE ) #install.packages("rpart",dependencies=TRUE) #install.packages("EMMREML",dependencies=TRUE) #install.packages("RColorBrewer",dependencies=TRUE) #install.packages("SAMM", dependencies=TRUE) library("rpart") library("stringr") library("Matrix") library("parallel") library("ff") library("ffbase") library("EMMREML") library("RColorBrewer") library("rrBLUP") library("compiler") #library("SAMM") setCompilerOptions(suppressUndefined=TRUE) tr <- function(x)sum(diag(x)) #M is coded as -1,0,1; no missing emm_reml<-function (y, X, Z, K) { enableJIT(3) q = dim(X)[2] n = length(y) X<-Matrix(X) spI <- Diagonal(n) S <- spI - tcrossprod(X %*% solve(crossprod(X)), X) if (is.null(K)){ZK=Z}else{ZK <- Z %*% K} offset <- log(n) ZKZt <- tcrossprod(ZK, Z) ZKZtandoffset <- ZKZt + offset * spI SZKZtSandoffset <- { S %*% ZKZtandoffset } %*% S svdSZKZtSandspI <- eigen(SZKZtSandoffset, symmetric = TRUE) rm(SZKZtSandoffset) foo <- gc();rm(foo) Ur <- svdSZKZtSandspI$vectors[, 1:(n - q)] lambda <- svdSZKZtSandspI$values[1:(n - q)] - offset eta <- crossprod(Ur, y) minimfunc <- function(delta) { (n - q) * log(sum(eta^2/{ lambda + delta })) + sum(log(lambda + delta)) } minimfunc <- cmpfun(minimfunc) optimout <- optimize(minimfunc, lower = 9^(-10), upper = 9^10, tol = 1e-10) deltahat <- optimout$minimum Hinvhat <- solve(ZKZt + deltahat * spI) XtHinvhat <- crossprod(X, Hinvhat) betahat <- solve(XtHinvhat %*% X, XtHinvhat %*% y) ehat <- (y - { X %*% betahat }) Hinvhatehat <- Hinvhat %*% ehat sigmausqhat <- sum(eta^2/{ lambda + deltahat })/(n - q) Vinv <- (1/sigmausqhat) * Hinvhat sigmaesqhat <- deltahat * sigmausqhat uhat <- crossprod(ZK, Hinvhatehat) df <- n - q loglik <- -0.5 * (optimout$objective + df + df * log(2 * pi/df)) return(list(Vu = sigmausqhat, Ve = sigmaesqhat, betahat = betahat, uhat = uhat, loglik = loglik)) } ########M is 0,1,2 coded Amat.pieces<-function(M, pieces=10, mc.cores=1){ AmatPieces<-function(M){ pvec<-matrix(apply(M, 2, function(x){mean(x)/2}), ncol=1) MMt<-tcrossprod(M-2*matrix(1, nrow=nrow(M), ncol=1)%*%t(pvec)) return(list(pvec, MMt)) } CombAmatPieces<-function(Amatpiecesout){ nparts<-length(Amatpiecesout) NumAmat<-Amatpiecesout[[1]][[2]] denomAmat<-sum(2*Amatpiecesout[[1]][[1]]*(1-Amatpiecesout[[1]][[1]])) for (i in 2:length(Amatpiecesout)){ NumAmat<-NumAmat+Amatpiecesout[[i]][[2]] denomAmat<-denomAmat+sum(2*Amatpiecesout[[i]][[1]]*(1-Amatpiecesout[[i]][[1]])) } return(NumAmat/denomAmat) } x <- 1:ncol(M) n <- pieces chunk <- function(x,n) split(x, factor(sort(rank(x)%%n))) listforlapply<-chunk(x,n) lapplyfunc<-function(x){ return(AmatPieces(as(M[,x], "sparseMatrix"))) } lapplyout<-mclapply(X=listforlapply, FUN=lapplyfunc, mc.cores=mc.cores) A<-as.matrix(CombAmatPieces(lapplyout)) return(A) } ###############Example loglike<- function(resid, Vinv) { n<- nrow(resid) temp1 <- Vinv%*%resid (-.5)*(-determinant(Vinv, logarithm=T)$modulus + crossprod(resid,temp1)+n*log(2*pi)) } em.mixed <- function(y, X, Z, beta, Ve, Vu,maxiter=2000,tolerance = 1e-0010) { n <- nrow(y) q1 <- nrow(Z) conv <- 0 ZZt<- tcrossprod(Z) tolpareig<-log(ncol(ZZt)) diag(ZZt)<-diag(ZZt)+tolpareig eigZZt<-eigen(ZZt, symmetric=T) diag(ZZt)<-diag(ZZt)-tolpareig eigenvals=eigZZt$values-tolpareig XtX<-crossprod(X) xb=X%*%beta resid=y-xb lambda=as.numeric(Ve/Vu) Vinv <- tcrossprod(eigZZt$vectors%*%diag(1/sqrt(eigenvals+lambda)))/as.numeric(Vu) L0 <- loglike(resid, Vinv) i<-0 repeat{ if(i>maxiter){ break } temp1 <- Vinv %*% resid s0 <- as.numeric(Ve)^2 * crossprod(temp1) + as.numeric(Ve) * n - as.numeric(Ve)^2 * tr(Vinv) s1 <- as.numeric(Vu)^2 * crossprod(temp1,ZZt%*%temp1)+ as.numeric(Vu)*q1 - as.numeric(Vu)^2 *sum(Z*(Vinv%*%Z)) w <- xb + as.numeric(Ve) * temp1 Ve <- s0/n Vu <- s1/q1 beta <- solve(XtX,crossprod(X,w)) lambda=as.numeric(Ve/Vu) xb <- X %*% beta resid <- (y-xb) Vinv <- tcrossprod(eigZZt$vectors%*%diag(1/sqrt(eigenvals+lambda)))/as.numeric(Vu) L1 <- loglike(resid, Vinv) if(L1 < L0) { conv <- 0 break } i <- i + 1 if(abs(L1 - L0) < tolerance) { conv <- 1 break } L0 <- L1 } list(beta=beta, Ve=Ve,Vu=Vu,Loglikelihood=L1,conv=conv) } em.mixed2 <- function(y, X, Z, ZZt, eigZZt, eigenvals, beta, Ve, Vu,maxiter=2000,tolerance = 1e-0010) { n <- nrow(y) q1 <- nrow(Z) conv <- 0 XtX<-crossprod(X) xb=X%*%beta resid=y-xb lambda=as.numeric(Ve/Vu) Vinv <- tcrossprod(eigZZt$vectors%*%diag(1/sqrt(eigenvals+lambda)))/as.numeric(Vu) L0 <- loglike(resid, Vinv) i<-0 repeat{ if(i>maxiter){ break } temp1 <- Vinv %*% resid s0 <- as.numeric(Ve)^2 * crossprod(temp1) + as.numeric(Ve) * n - as.numeric(Ve)^2 * tr(Vinv) s1 <- as.numeric(Vu)^2 * crossprod(temp1,ZZt%*%temp1)+ as.numeric(Vu)*q1 - as.numeric(Vu)^2 *sum(Z*(Vinv%*%Z)) w <- xb + as.numeric(Ve) * temp1 Ve <- s0/n Vu <- s1/q1 beta <- solve(XtX,crossprod(X,w)) lambda=as.numeric(Ve/Vu) xb <- X %*% beta resid <- (y-xb) Vinv <- tcrossprod(eigZZt$vectors%*%diag(1/sqrt(eigenvals+lambda)))/as.numeric(Vu) L1 <- loglike(resid, Vinv) if(L1 < L0) { conv <- 0 break } i <- i + 1 if(abs(L1 - L0) < tolerance) { conv <- 1 break } L0 <- L1 } return(list(beta=beta, Ve=Ve,Vu=Vu,Loglikelihood=L1,conv=conv)) } loglikerpart<- function(resid, Vinv) { n<- nrow(resid) temp1 <- Vinv%*%resid (-.5)*(-determinant(Vinv, logarithm=T)$modulus + crossprod(resid,temp1)+n*log(2*pi)) } merpart<-function(y, X, z=z, ZZt, eigZZt, eigenvals, M, var00, var10,LNull=NULL,maxiter=2000,tolerance = 1e-0010, skipfittree=10, rpartcontrol=rpart.control(minsplit = 1, minbucket = 2, cp = 0.01, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2, xval = 10, surrogatestyle = 0, maxdepth = 3)){ n=nrow(y) if (((!is.null(M))&(!is.null(X)))){ x=cbind(X,M) colnames(x)<-c(colnames(X), colnames(M)) } else if (((!is.null(M))&(is.null(X)))){ x=M } else if (((is.null(M))&(!is.null(X)))){ x=X } else {print("Either X or M should not be NULL") break } q1 <- nrow(z) meanrpart=mean(y) var0=var00 var1=var10 lambda=as.numeric(var0/var1) Vinv <- tcrossprod(eigZZt$vectors%*%diag(1/sqrt(eigenvals+lambda)))/as.numeric(var1) resid <- (y-meanrpart) conv <- 0 L0 <- loglikerpart(resid, Vinv) i<-0 repeat{ if(i>maxiter){ break } temp1 <- Vinv %*% resid s0 <- as.numeric(var0)^2 * crossprod(temp1) + as.numeric(var0) * n - as.numeric(var0)^2 * tr(Vinv) s1 <- as.numeric(var1)^2 * crossprod(temp1,ZZt%*%temp1)+ as.numeric(var1)*q1 - as.numeric(var1)^2 *sum(z*(Vinv%*%z)) w <- meanrpart + as.numeric(var0) * temp1 var0 <- s0/n var1 <- s1/q1 if ((i %% skipfittree)==0){ rpartdata<-data.frame(cbind(w,x)) colnames(rpartdata)<-c("w",colnames(x)) require(rpart) rpartmodel<-rpart(w~., data=rpartdata, control = rpartcontrol) meanrpart<-predict(rpartmodel) } lambda=as.numeric(var0/var1) Vinv <- tcrossprod(eigZZt$vectors%*%diag(1/sqrt(eigenvals+lambda)))/as.numeric(var1) resid <- (y-meanrpart) L1 <- loglikerpart(resid, Vinv) i <- i + 1 if(abs(L1 - L0) < tolerance) { conv=1 break } L0 <- L1 } getrulesrpart<-function(tree){ leafnodeRows <- grepl("leaf",tree$frame$var) nodevals <- as.numeric(rownames(tree$frame)[leafnodeRows]) capture.output(rules <- path.rpart(tree,nodevals)) capture.output(rulesdf <- do.call("rbind",lapply(rules,function(x)paste(x,collapse = " -AND- ")))) capture.output(rulesdf <- data.frame(nodeNumber=rownames(rulesdf),rule=rulesdf[,1],stringsAsFactors=FALSE)) return(rulesdf) } output<- list(mean=getrulesrpart(rpartmodel), var0=var0,var1=var1,Loglikelihood=L1, LNull=LNull, conv=conv, iter=i) return(output) } localmarkersampler<-function(Map, nsample, maxlength, keep=NULL){ nsample=nsample-1 Map$pos<-as.numeric(Map$pos) firstmarker<-Map[sample(1:nrow(Map), 1),] chrfirstmarker<-firstmarker$chr posfirstmarker<-firstmarker$pos minonchrom<-min(Map[Map$chr==chrfirstmarker,]$pos) maxonchrom<-max(Map[Map$chr==chrfirstmarker,]$pos) minpos<-max(minonchrom, posfirstmarker-maxlength) maxpos<-min(maxonchrom, posfirstmarker+maxlength) Mappart1<-Map[(Map$chr==chrfirstmarker&(((Map$pos<=maxpos)&(Map$pos>=minpos))&Map$pos>posfirstmarker)),] #print(dim(Mappart)) Mapsample1<-Mappart1[sample(1:nrow(Mappart1), min(nsample-1,nrow(Mappart1)), replace=T),] Mappart2<-Map[(Map$chr==chrfirstmarker&(((Map$pos<=maxpos)&(Map$pos>=minpos))&Map$pos1){ output<-lapply(1:nrulesindf, function(iruledf){ splitrule1<-unlist(strsplit(ruledf[iruledf,2],split=" -AND- ")) if (length(splitrule1)==1){ return(rep(1,ncol(newdata))) } else{ splitrule1<-splitrule1[-1] lengthsplitrule1<-length(splitrule1) splittype1<-lapply(splitrule1,function(x){grepl("<",x)}) #print(splitrule1) out1<-lapply(1:lengthsplitrule1, function(ix){ if (splittype1[[ix]]){ splitcond<-unlist(strsplit(splitrule1[ix],split="<")) return(newdata[splitcond[1],]=as.numeric(splitcond[2])) } } ) out2<-Reduce("rbind", out1) out2<-matrix(out2, ncol=ncol(newdata)) return(apply(out2,2,prod)) }}) return(Reduce("cbind",output)) } else {return(matrix(1,nrow=ncol(newdata), ncol=1))} } predicrulesmeforest<-function(meforestout, X=NULL,Mlist, mc.cores=1){ nchunks<-length(Mlist) getnamesfromrules<-function(meforestout){ nbag<-length(meforestout) ntree<-length(meforestout[[1]]) c(unlist(mclapply(1:nbag, function(ibag){ lapply(1:ntree, function(itree){ meforestout[[ibag]][[itree]][[1]]$mean[,2] }) }) )) } nbag<-length(meforestout) ntree<-length(meforestout[[1]]) ntotal<-nbag*ntree matrixforntotaltree<-matrix(1:ntree,ntree,nbag) matrixforntotalbag<-matrix(1:nbag,ntree,nbag, byrow=T) output<-Reduce("rbind",lapply(1:nchunks, function(xi){ output<-Reduce("cbind",mclapply(1:ntotal, function(x){ itree<-matrixforntotaltree[x] ibag<-matrixforntotalbag[x] Matxi<-Mlist[[xi]] if(!is.null(X)){ Xi<-X[rownames(X)%in%colnames(Matxi),] Xi<-matrix(Xi, ncol=ncol(X), dimnames=dimnames(Xi)) Xi<-Xi[match(colnames(Matxi),rownames(Xi)),] Matxi<-rbind(t(Xi),Matxi) } outdf<-predictruledf(meforestout[[ibag]][[itree]][[1]]$mean, newdata=Matxi) return(outdf) } , mc.preschedule = T, mc.cores=mc.cores)) colnames(output)<-getnamesfromrules(meforestout) return(output) })) return(output) } ####################################### getnamesfromrules<-function(meforestout, mc.cores=1){ nbag<-length(meforestout) ntree<-length(meforestout[[1]]) c(unlist(mclapply(1:nbag, function(ibag){ lapply(1:ntree, function(itree){ meforestout[[ibag]][[itree]][[1]]$mean[,2] }) }, mc.preschedule = T, mc.cores=mc.cores) )) } getpositionsfromrulesnames<-function(rulesnames, colnamesX=NULL){ nrules<-length(rulesnames) c(unlist(lapply(1:nrules, function(irule){ rulename<-rulesnames[irule] if (rulename!="root"){ splitrule1<-unlist(strsplit(rulename,split=" -AND- ")) splitrule1<-splitrule1[-1] splitrule2<-unlist(strsplit(unlist(strsplit(splitrule1,split=c("<"))) ,split=">=")) if (!is.null(colnamesX)){splitrule2.2<-splitrule2[!(splitrule2%in%colnamesX)]} else {splitrule2.2<-splitrule2} onlylocs<-splitrule2.2[unlist(lapply(splitrule2.2,function(x){grepl("_",x)}))] if (length(onlylocs)>0){ outformatrix<-c(unlist(strsplit(c(onlylocs),split="_"))) splitrule3<-matrix(c(unlist(strsplit(onlylocs,split="_"))), ncol=2, byrow=T) return(paste(splitrule3[1,1], mean(as.numeric(splitrule3[,2])), sep="_")) } else { splitrule2.2<-splitrule2[(splitrule2%in%colnamesX)] return(paste(splitrule2.2, sep="-and-")) } } else {return(NULL)} } ) ) ) } modelwithrules<-function(y, X, Z, K=NULL, M=NULL,R, map=NULL, useGblup=F){ modelout<-emm_reml(y=y,X=X,Z=Z%*%R,K=NULL) return(modelout) } modelwithrulesglmnet<-function(y, X, Z, K=NULL, M=NULL,R, map=NULL, alpha=1){ library(glmnet) fit.glmnet.cv <- cv.glmnet(Z%*%R, y,nfold = 5,alpha = alpha) uhat<-coef(fit.glmnet.cv, s = fit.glmnet.cv$lambda.min) modelout<-list(uhat=uhat) return(modelout) } ################################################################################# variableimportance<-function(outemmremlobj, rulesmat, markernames, GBLUP=F, K=NULL){ if (GBLUP){ ruleeffects<-crossprod(rulesmat,solve(K,outemmremlobj$uhat)) } else{ruleeffects<-outemmremlobj$uhat} namesrules<-colnames(rulesmat) ###########read names and extract variable names library("stringr") markercount<-sapply(markernames,function(x){return(str_count(namesrules,paste(x, "<", sep="")))}, simplify = "array") markercount<-markercount+sapply(markernames,function(x){return(str_count(namesrules,paste(x, ">=", sep="")))}, simplify = "array") colnames(markercount)<-markernames importances<-apply(markercount, 2, function(x){return(x*abs(ruleeffects))})[[1]] colnames(importances)<-markernames return(colSums(importances)) } variableimportancemclapply<-function(outemmremlobj, rulesmat, markernames, GBLUP=F, K=NULL, mc.cores=1){ return(simplify2array(mclapply(X=markernames, FUN=function(x){return(variableimportance(outemmremlobj, rulesmat, x, GBLUP, K))}, mc.cores = mc.cores, mc.preschedule = T))) } ################### variableinteractionmeasures<-function(outemmremlobj, rulesmat, markernames1,markernames2, GBLUP=F, K=NULL){ if (GBLUP){ ruleeffects<-crossprod(rulesmat,solve(K,outemmremlobj$uhat)) } else{ruleeffects<-outemmremlobj$uhat} namesrules<-colnames(rulesmat) ###########read names and extract variable names library("stringr") markercount1<-sapply(markernames1,function(x){return(str_count(namesrules,x))}, simplify = "array") colnames(markercount1)<-markernames1 markercount2<-sapply(markernames2,function(x){return(str_count(namesrules,x))}, simplify = "array") colnames(markercount2)<-markernames2 markercount1[markercount1>0]<-1 markercount2[markercount2>0]<-1 markerinteractionsimportancematrix<-matrix(0,nrow=length(markernames1), ncol=length(markernames2)) rownames(markerinteractionsimportancematrix)<-markernames1 colnames(markerinteractionsimportancematrix)<-markernames2 for (i in markernames1){ for (j in markernames2){ for (k in 1:length(ruleeffects)){ markerinteractionsimportancematrix[i,j]=markerinteractionsimportancematrix[i,j]+markercount1[k,i]*markercount2[k,j]*ruleeffects[k] } } } rownames(markerinteractionsimportancematrix)<-markernames1 colnames(markerinteractionsimportancematrix)<-markernames2 return(list(interactionstrangth=markerinteractionsimportancematrix)) } sameSizeVectorList2Matrix <- function(vectorList){ sm_i<-NULL sm_j<-NULL sm_x<-NULL for (k in 1:length(vectorList)) { sm_i <- c(sm_i,rep(k,length(vectorList[[k]]@i))) sm_j <- c(sm_j,vectorList[[k]]@i) sm_x <- c(sm_x,vectorList[[k]]@x) } return(sparseMatrix(i=sm_i,j=sm_j,x=sm_x,dims=c(length(vectorList),vectorList[[1]]@length))) } variableimportance2<-function(outemmremlobj, rulesmat, markernames, GBLUP=F, K=NULL, mc.cores=1){ if (GBLUP){ ruleeffects<-crossprod(rulesmat,solve(K,outemmremlobj$uhat)) } else{ruleeffects<-outemmremlobj$uhat} namesrules<-colnames(rulesmat) ###########read names and extract variable names markercount<-mclapply(X=markernames,FUN=function(x){return(as(str_count(namesrules, paste(x, "<", sep=""))+str_count(namesrules, paste(x, ">=", sep="")),"dsparseVector"))}, mc.cores=mc.cores) Markercount<-sameSizeVectorList2Matrix(markercount) dim(Markercount) rownames(Markercount)<-markernames #importances<-mclapply(1:nrow(Markercount), function(x){return(as(Markercount[x,]*ruleeffects,"dsparseVector"))}, mc.cores=mc.cores) importances<-mcmapply(function(x){return(as(c(Markercount[x,])*abs(ruleeffects),"sparseMatrix"))}, 1:nrow(Markercount), mc.cores=mc.cores) importances<-Reduce("cBind", importances) colnames(importances)<-markernames return(importances) } variableimportancemclapply2<-function(outemmremlobj, rulesmat, markernames, GBLUP=F, K=NULL, mc.cores=1){ return(simplify2array(mclapply(X=markernames, FUN=function(x){return(colSums(variableimportance2(outemmremlobj, rulesmat, x, GBLUP, K, mc.cores=1)))}, mc.cores = mc.cores, mc.preschedule = T))) } CartProduct = function(CurrentMatrix, NewElement) { if (length(dim(NewElement)) != 0 ) { warning("New vector has more than one dimension.") return (NULL) } if (length(dim(CurrentMatrix)) == 0) { CurrentRows = length(CurrentMatrix) CurrentMatrix = as.matrix(CurrentMatrix, nrow = CurrentRows, ncol = 1) } else { CurrentRows = nrow(CurrentMatrix) } var1 = replicate(length(NewElement), CurrentMatrix, simplify=F) var1 = do.call("rbind", var1) var2 = rep(NewElement, CurrentRows) var2 = matrix(var2[order(var2)], nrow = length(var2), ncol = 1) CartProduct = cbind(var1, var2) return (CartProduct) } variableinteractionmeasures2<-function(outemmremlobj, rulesmat, markernames1,markernames2, GBLUP=F, K=NULL, mc.cores=1){ if (GBLUP){ ruleeffects<-crossprod(rulesmat,solve(K,outemmremlobj$uhat)) } else{ruleeffects<-outemmremlobj$uhat} namesrules<-colnames(rulesmat) ###########read names and extract variable names markercount1<-mclapply(X=markernames1,FUN=function(xi){return(as(str_count(namesrules, paste(xi, "<", sep=""))+str_count(namesrules, paste(xi, ">=", sep="")),"dsparseVector"))}, mc.cores=mc.cores) Markercount1<-sameSizeVectorList2Matrix(markercount1) dim(Markercount1) rownames(Markercount1)<-markernames1 markercount2<-mclapply(X=markernames2,FUN=function(xi){return(as(str_count(namesrules, paste(xi, "<", sep=""))+str_count(namesrules, paste(xi, ">=", sep="")),"dsparseVector"))}, mc.cores=mc.cores) Markercount2<-sameSizeVectorList2Matrix(markercount2) dim(Markercount2) rownames(Markercount2)<-markernames2 Markercount1[Markercount1>0]<-1 Markercount2[Markercount2>0]<-1 dim(Markercount1) dim(Markercount2) sum(Markercount2>0) someFunction<-function(i,j){ return(sum(abs(Markercount1[i,]*Markercount2[j,]*ruleeffects))) } cartout = CartProduct(1:length(markernames1), 1:length(markernames2)) require(parallel) aList = mcmapply(someFunction, cartout[,1], cartout[,2], mc.cores=mc.cores, SIMPLIFY = T) return(aList) } require(compiler) predictruledf<-cmpfun(predictruledf) Amat.pieces<-cmpfun(Amat.pieces) em.mixed<-cmpfun(em.mixed) localmarkersampler<-cmpfun(localmarkersampler) loglike<-cmpfun(loglike) loglikerpart<-cmpfun(loglikerpart) merpart<-cmpfun(merpart) meforest<-cmpfun(meforest) modelwithrules<-cmpfun(modelwithrules) tr<-cmpfun(tr) predictruledf<-cmpfun(predictruledf) predicrulesmeforest <- cmpfun(predicrulesmeforest) getnamesfromrules<-cmpfun(getnamesfromrules) getpositionsfromrulesnames<-cmpfun(getpositionsfromrulesnames) variableimportance<-cmpfun(variableimportance) variableinteractionmeasures<-cmpfun(variableinteractionmeasures) ########################################################### LER<-function(y, X=NULL, Z, K, Mlist,map, nsample=10, maxlength=20,ntrees=100,bagging=9,proprow=.1, mc.cores=3,var00=1, var10=1,maxiter=500,tolerance = 1e-6, skipfittree=5, rpartcontrol=NULL, cp=0.03, maxdepth=2){ meforestout<-meforest(y=y, X=X, Z=Z, K=K, Mlist=Mlist,map=map, nsample=nsample, maxlength=maxlength,ntrees=ntrees,bagging=bagging,proprow=proprow, mc.cores=mc.cores,var00=var00, var10=var10,maxiter=maxiter,tolerance = tolerance, skipfittree=skipfittree, rpartcontrol=rpartcontrol, cp=cp, maxdepth=maxdepth) getnamesfromrulesout<-getnamesfromrules(meforestout) if (is.null(X)){ predicrulesmeforestout<-predicrulesmeforest(meforestout, Mlist, X=NULL,mc.cores=mc.cores) } else {predicrulesmeforestout<-predicrulesmeforest(meforestout, Mlist, X=X,mc.cores=mc.cores) } R=apply(predicrulesmeforestout, 2, function(y) (y - mean(y)) / sd(y) ^ as.logical(sd(y))) modelwithrulesout<-modelwithrules(y=y, X=matrix(1,nrow=length(y)), Z=Z, K=NULL, M=NULL,R=R, map=NULL, useGblup=F) if (is.null(X)){ variableimportancemclapply2out<-variableimportancemclapply2(outemmremlobj=modelwithrulesout, rulesmat=R, markernames=c(rownames(Mlist[[1]])),mc.cores=mc.cores) variableinteractionmeasures2out<-variableinteractionmeasures2(outemmremlobj=modelwithrulesout, rulesmat=R, markernames1=c(rownames(Mlist[[1]]))[1:200],markernames2=c(rownames(Mlist[[1]]))[1:200], GBLUP=F, K=NULL, mc.cores=mc.cores) } else { variableimportancemclapply2out<-variableimportancemclapply2(outemmremlobj=modelwithrulesout, rulesmat=R, markernames=c(colnames(X),rownames(Mlist[[1]])),mc.cores=mc.cores) variableinteractionmeasures2out<-variableinteractionmeasures2(outemmremlobj=modelwithrulesout, rulesmat=R, markernames1=c(colnames(X),rownames(Mlist[[1]]))[1:200],markernames2=c(colnames(X),rownames(Mlist[[1]]))[1:200], GBLUP=F, K=NULL, mc.cores=mc.cores) } matrixout<-matrix(variableinteractionmeasures2out, nrow=200) return(list(meforestout=meforestout,R=R,modelwithrulesout=modelwithrulesout, variableimportancemclapply2out=variableimportancemclapply2out,variableinteractionmeasures2out=matrixout)) } LER<-cmpfun(LER) ExtractRules<-function(ytrain,Xtrain,Xtest, nrules,proprow=.2,propcol=.2, maxdepth=3, cp.prune=NULL, alpha=.2){ require(rpart) onetree<-function(ytrain,Xtrain,Xtest,proprow=proprow,propcol=propcol, maxdepth=maxdepth){ maxdepthin=rpois(1,maxdepth) while(maxdepthin>maxdepth+1||maxdepthin==0){maxdepthin=rpois(1,maxdepth)} maxdepth=maxdepthin ntt<-sample(1:dim(Xtrain)[[1]],ceiling(dim(Xtrain)[[1]]*proprow)) ptt<-sample(1:dim(Xtrain)[[2]],ceiling(dim(Xtrain)[[2]]*propcol)) Xtrain0<-matrix(Xtrain[,ptt], nrow=nrow(Xtrain)) Xtrain1<-matrix(Xtrain[ntt,ptt], ncol=ncol(Xtrain0)) Xtest1<-matrix(Xtest[,ptt], ncol=ncol(Xtrain0)) ytrain1<-ytrain[ntt] data1<-data.frame(p=ytrain1, X0=I(Xtrain1)) data0<-data.frame(p=ytrain, X0=I(Xtrain0)) datatest<-data.frame(X0=I(cbind(Xtest1))) dataall<-data.frame(X0=I(rbind(Xtrain0, Xtest1))) colnames(data1$X0)<-colnames(data0$X0)<-colnames(datatest$X0)<-colnames(dataall$X0)<-colnames(Xtrain)[ptt] tree <- rpart(p~X0, data=data1 , minsplit = 1, minbucket = 2, maxdepth=maxdepth) if (!is.null(cp.prune)){tree<- prune(tree, cp = cp.prune)} #cptarg = sqrt(tree$cptable[3,1]*tree$cptable[4,1]) #tree = prune(tree,cp=cptarg) leafnodeRows <- grepl("leaf",tree$frame$var) nodevals <- as.numeric(rownames(tree$frame)[leafnodeRows]) capture.output(rules <- path.rpart(tree,nodevals)) capture.output(rulesdf <- do.call("rbind",lapply(rules,function(x)paste(x,collapse = " -AND- ")))) capture.output(rulesdf <- data.frame(nodeNumber=rownames(rulesdf),rule=rulesdf[,1],stringsAsFactors=FALSE)) #print("Check1") ################### ############################################################# # code from Deniz (march 20) to extract the rules # also works if the tree has been pruned # the current CRAN version of partykit might create an error there (march 2012) aaaall<-predict(tree,newdata = dataall) #print("check2") featureall<-as.factor(aaaall) aaa<-predict(tree,newdata = data0) featuretrain<-factor(aaa, levels(featureall)) #print("check3") aaatest<-predict(tree,newdata = datatest) featuretest<-factor(aaatest,levels(featureall)) featuretrain<-model.matrix(~featuretrain-1) featuretest<-model.matrix(~featuretest-1) return(list(ftrain=featuretrain[,1:(dim(featuretrain)[2]-1)],ftest=featuretest[,1:(dim(featuretrain)[2]-1)], rulesdf=rulesdf[1:(dim(featuretrain)[2]-1),])) } rulesdfv<-c() ftrainv<-c() ftestv<-c() ncolrules<-0 while (ncolrules 1){ require("parallel") out<- mclapply(as.list(PvU), lapplyfunc, mc.cores=mc.cores) return(out) } if(mc.cores == 1){ out<-lapply(PvU, lapplyfunc) return(out) } } meanimpute<-function(X){ imputecol<-function(x){ x[is.na(x)]<-mean(x[!is.na(x)]) return(x) } Ximp<-apply(X,2,imputecol) return(as.matrix(Ximp)) } sameSizeVectorList2Matrix <- function(vectorList){ sm_i<-NULL sm_j<-NULL sm_x<-NULL for (k in 1:length(vectorList)) { sm_i <- c(sm_i,rep(k,length(vectorList[[k]]@i))) sm_j <- c(sm_j,vectorList[[k]]@i) sm_x <- c(sm_x,vectorList[[k]]@x) } return(sparseMatrix(i=sm_i,j=sm_j,x=sm_x,dims=c(length(vectorList),vectorList[[1]]@length))) } variableimportance<-function(outemmremlobj, rulesmat, markernames, GBLUP=F, K=NULL, mc.cores=1){ if (GBLUP){ ruleeffects<-t(rulesmat)%*%solve(K)%*%outemmremlobj$uhat } else{ruleeffects<-outemmremlobj$uhat} namesrules<-colnames(rulesmat) ###########read names and extract variable names library(stringr) library(Matrix) library(parallel) markercount<-mclapply(markernames,function(x){return(as(str_count(namesrules, paste(x, "<", sep=""))+str_count(namesrules, paste(x, ">", sep="")),"dsparseVector"))}, mc.cores=mc.cores) Markercount<-sameSizeVectorList2Matrix(markercount) dim(Markercount) rownames(Markercount)<-markernames #importances<-mclapply(1:nrow(Markercount), function(x){return(as(Markercount[x,]*ruleeffects,"dsparseVector"))}, mc.cores=mc.cores) importances<-mcmapply(function(x){return(as(c(Markercount[x,])*ruleeffects,"sparseMatrix"))}, 1:nrow(Markercount), mc.cores=mc.cores) importances<-Reduce("cBind", importances) colnames(importances)<-markernames return(list(counts=Markercount,importances=importances)) } CartProduct = function(CurrentMatrix, NewElement) { if (length(dim(NewElement)) != 0 ) { warning("New vector has more than one dimension.") return (NULL) } if (length(dim(CurrentMatrix)) == 0) { CurrentRows = length(CurrentMatrix) CurrentMatrix = as.matrix(CurrentMatrix, nrow = CurrentRows, ncol = 1) } else { CurrentRows = nrow(CurrentMatrix) } var1 = replicate(length(NewElement), CurrentMatrix, simplify=F) var1 = do.call("rbind", var1) var2 = rep(NewElement, CurrentRows) var2 = matrix(var2[order(var2)], nrow = length(var2), ncol = 1) CartProduct = cbind(var1, var2) return (CartProduct) } variableinteractionmeasures<-function(outemmremlobj, rulesmat, markernames1,markernames2, GBLUP=F, K=NULL, mc.cores=1){ if (GBLUP){ ruleeffects<-t(rulesmat)%*%solve(K)%*%outemmremlobj$uhat } else{ruleeffects<-outemmremlobj$uhat} namesrules<-colnames(rulesmat) ###########read names and extract variable names library(stringr) library(Matrix) library(parallel) markercount1<-mclapply(markernames1,function(x){return(as(str_count(namesrules, paste(x, "<", sep=""))+str_count(namesrules, paste(x, ">", sep="")),"dsparseVector"))}, mc.cores=mc.cores) Markercount1<-sameSizeVectorList2Matrix(markercount1) dim(Markercount1) rownames(Markercount1)<-markernames1 markercount2<-mclapply(markernames2,function(x){return(as(str_count(namesrules, paste(x, "<", sep=""))+str_count(namesrules, paste(x, ">", sep="")),"dsparseVector"))}, mc.cores=mc.cores) Markercount2<-sameSizeVectorList2Matrix(markercount2) dim(Markercount2) rownames(Markercount2)<-markernames2 Markercount1[Markercount1>0]<-1 Markercount2[Markercount2>0]<-1 dim(Markercount1) dim(Markercount2) someFunction<-function(i,j){ return(sum(abs(Markercount1[i,]*Markercount2[j,]*ruleeffects))) } cartout = CartProduct(1:length(markernames1), 1:length(markernames2)) require(parallel) aList = mcmapply(someFunction, cartout[,1], cartout[,2], mc.cores=mc.cores) return(aList) } ############## set.seed(123) topmarkerslistEMMA<-vector(mode="list", length=100) topmarkerslistLER<-vector(mode="list", length=100) for (iii in 1:100){ print(iii) library(EMMREML) nrules=4000 maxdepth=3 nsplit=10 alpha=0.001 mc.cores=5 perctrain=.99 nimpmarkers=30 cp.prune=0.001 proprow=.3 propcol=.2 traitcolumn=10 load("MouseData.RData") genodata01<-genodata01[genodata01$chrom==3,] ####filter maffilterfunction<-function(x){ xtable<-table(x[-c(1:3)]) if (length(xtable)==1){return(FALSE)} else{ if (length(xtable)==3){ f1=2*xtable[names(xtable)=="0"]+xtable[names(xtable)=="1"] f2=2*xtable[names(xtable)=="2"]+xtable[names(xtable)=="1"] if((f1<.05)||(f2<.05)){return(FALSE)}else{return(TRUE)} } if (length(xtable)==2){ if (("0"%in%names(xtable))&&("1"%in%names(xtable))){ f1=2*xtable[names(xtable)=="0"]+xtable[names(xtable)=="1"] if((f1<.05)){return(FALSE)}else{return(TRUE)} } if (("2"%in%names(xtable))&&("1"%in%names(xtable))){ f2=2*xtable[names(xtable)=="2"]+xtable[names(xtable)=="1"] if((f2<.05)){return(FALSE)}else{return(TRUE)} } if (("0"%in%names(xtable))&&("2"%in%names(xtable))){ f1=2*xtable[names(xtable)=="0"] f2=2*xtable[names(xtable)=="2"] if(((f1<.05)||(f2<.05))){return(FALSE)}else{return(TRUE)} } } } } filtermarkers<-c(apply(genodata01,1,maffilterfunction)) genodata01<-genodata01[filtermarkers,] dim(genodata01) genodata01[,1]<-paste("M",1:nrow(genodata01), sep="") phenodataonetrait<-Phenotypes[,c(1,4,traitcolumn)] colnames(phenodataonetrait)[1]<-"id" Mmapped<-t(genodata01[,-c(1,2,3)]) rownames(Mmapped)<-colnames(genodata01[,-c(1,2,3)]) indicesmatrix<-mapply(function(x){ irow<-sample(1:nrow(Mmapped), 1) icol<-sample(1:ncol(Mmapped), 1) return(c(irow,icol)) },1:10000, SIMPLIFY=T) dim(indicesmatrix) #Mmapped[t(indicesmatrix)]<-rbinom(10000,2,.15) Mmapped<-matrix(rbinom(2000*1000,2,.15), nrow=2000, ncol=1000) Mmapped[1:50,1:5] colnames(Mmapped)<-paste("M",1:1000, sep="") rownames(Mmapped)<-paste("L",1:2000, sep="") mrkMap<-data.frame(marker=paste("M",1:1000, sep=""), chrom=rep(1,1000), loc=1:1000) mrkMap1<-makehierarchy(mrkMap=mrkMap, nsplit=5, depth=2) mrkMap<-makehierarchy(mrkMap=mrkMap, nsplit=nsplit, depth=2) Phenotype<-data.frame(id=paste("L",1:2000, sep=""), sex=rep(c("M", "F"), each=1000), trait=rep(0,2000)) ################################################ ####replace the Phenotype[,3] with the simulated phenotype based on mrkMap dim(Mmapped) M1<-Mmapped[,mrkMap1[,6]%in%levels(as.factor(mrkMap1[,6]))[1]] M2<-Mmapped[,mrkMap1[,6]%in%levels(as.factor(mrkMap1[,6]))[2]] M3<-Mmapped[,mrkMap1[,6]%in%levels(as.factor(mrkMap1[,6]))[3]] M4<-Mmapped[,mrkMap1[,6]%in%levels(as.factor(mrkMap1[,6]))[4]] M5<-Mmapped[,mrkMap1[,6]%in%levels(as.factor(mrkMap1[,6]))[5]] svdMmapped<-svd(Mmapped,nu=3,nv=3) PCs<-Mmapped%*%svdMmapped$v PCs<-scale(PCs, center=T, scale=T) colnames(PCs)<-c("pc1", "pc2", "pc3") M1<-cbind(PCs, M1) M2<-cbind(PCs, M2) M3<-cbind(PCs, M3) M4<-cbind(PCs, M4) M5<-cbind(PCs, M5) M1effect<-apply(M1, 1, function(x){return((.6*x[11]+.5*x[14]-.4*x[17]))}) M1effect<-M1effect/sd(M1effect) M1markers<-colnames(M1)[c(11,14,17)] M2effect<-apply(M2, 1, function(x){if(x[1]<0) {return((.6*x[11]-.5*x[14]-.4*x[17]))} else (return(-(.6*x[11]+.5*x[14]+.4*x[17])))}) M2effect<-M2effect/sd(M2effect) M2markers<-colnames(M2)[c(11,14,17)] M3effect<-apply(M3, 1, function(x){(.6*x[11]+.5*x[14]-.4*x[17])^2}) M3effect<-M3effect/sd(M3effect) M3markers<-colnames(M3)[c(11,14,17)] M4effect<-apply(M4, 1, function(x){if(x[1]<0){return((.6*x[11]+.5*x[14]-.4*x[17])^2)} else {return(-(.6*x[11]-.5*x[14]+.4*x[17])^2)}}) M4effect<-M4effect/sd(M4effect) M4markers<-colnames(M4)[c(11,14,17)] M5effect<-apply(M5, 1, function(x){if(x[1]<0){return((.6*x[11]+.5*x[14]-.4*x[17]+.5*x[2])^2)} else {return((-.6*x[11]-.5*x[14]-.4*x[17]+.5*x[2])^2)}}) M5effect<-M5effect/sd(M5effect) M5markers<-colnames(M5)[c(11,14,17)] allmarkers<-c(M1markers,M2markers,M3markers,M4markers, M5markers) Gval<-M1effect+M2effect+M3effect+M4effect+M5effect Phenotype[,3]<-as.matrix(model.matrix(~-1+Phenotype[,2]))%*%c(50, 55)+Gval+.5*sd(Gval)*rnorm(nrow(Phenotype)) Ntotal<-dim(Phenotype)[1] trainingsamplesize=ceiling(perctrain*Ntotal) sampletrain<-sample(rownames(Mmapped), trainingsamplesize, replace=FALSE) sampletest<-setdiff(rownames(Mmapped), sampletrain) colnames(Phenotypes)<-c("taxa", "sex","trait") K<-cov(t(Mmapped)) K<-K/mean(diag(K)) rownames(K)<-colnames(K)<-rownames(Mmapped) Phenotypestrain<-Phenotype[Phenotype$id%in%sampletrain,] Phenotypestest<-Phenotype[Phenotype$id%in%sampletest,] head(Phenotypestrain) Ztrain<-as.matrix(model.matrix(~factor(Phenotypestrain[,1], levels=rownames(Mmapped)))) Ztest<-as.matrix(model.matrix(~factor(Phenotypestest[,1], levels=rownames(Mmapped)))) colnames(Mmapped)<-mrkMap$marker Xtrain<-model.matrix(~Phenotypestrain[,2]) Xtest<-model.matrix(~Phenotypestest[,2]) PCtrain<-Ztrain%*%PCs PCtest<-Ztest%*%PCs MmappedTrain<-Ztrain%*%Mmapped MmappedTest<-Ztest%*%Mmapped rownames(MmappedTrain)<-Phenotypestrain[,1] #######regular GWAS with rrBLUP library(rrBLUP) colnames(Mmapped)<-paste("M",1:1000, sep="") rownames(Mmapped)<-paste("L",1:2000, sep="") geno=cbind(data.frame(marker=paste("M",1:1000, sep=""), chrom=rep(1,1000), pos=1:1000),t(MmappedTrain)) Ktrain<-tcrossprod(Ztrain%*%K, Ztrain) rownames(Ktrain)<-colnames(Ktrain)<-Phenotypestrain[,1] pheno2 <- Phenotypestrain colnames(pheno2) <- c("ID","Sex","Trait") ans.emmax <- GWAS(pheno=pheno2,fixed="Sex",geno=geno,P3D=TRUE,n.core=1,K=Ktrain, plot=F, n.PC=3) ########## outemmreml2<-emmreml(y=Phenotypestrain[,3], X=Xtrain, Z=Ztrain, K=K) yhattrain<-Xtrain%*%outemmreml2$betahat+Ztrain%*%outemmreml2$uhat Xbetatrain<-Xtrain%*%outemmreml2$betahat rules<-extractrulesforpartitionswithpcs(ytrain=Phenotypestrain[,3]-Xbetatrain, Xtrain=MmappedTrain, PCtrain=PCtrain,Xtest=MmappedTest, PCtest=PCtest, Pv=mrkMap[,6], nrules=nrules,proprow=proprow,propcol=propcol, maxdepth=maxdepth, cp.prune=cp.prune,mc.cores=mc.cores, alpha=alpha) Pv<-unique(mrkMap[,6]) rulestrain<-c() rulestest<-c() rulesdescdf<-c() KRuleslist<-vector(mode="list", length=length(rules)) Krulesrestlist<-vector(mode="list", length=length(rules)) for (i in 1:length(rules)){ rulestrain<-cbind(rulestrain,rules[[i]]$ftrainv) KRuleslist[[i]]<-cov(t(rules[[i]]$ftrainv)) KRuleslist[[i]]<-KRuleslist[[i]]/mean(diag(KRuleslist[[i]])) rulestrainnoi<-c() for (j in setdiff(1:length(rules), i)){ rulestrainnoi<-cbind(rulestrainnoi,rules[[j]]$ftrainv) } Krulesrestlist[[i]]<-cov(t(rulestrainnoi)) Krulesrestlist[[i]]<-Krulesrestlist[[i]]/mean(diag(Krulesrestlist[[i]])) rulestest<-cbind(rulestest,rules[[i]]$ftestv) rulesdescdf<-rbind(rulesdescdf,cbind(rep(Pv[i],nrow(rules[[i]]$rulesdfv)), rules[[i]]$rulesdfv)) } rownames(rulestrain)<-Phenotypestrain[,1] rownames(rulestest)<-Phenotypestest[,1] colnames(rulestrain)<-colnames(rulestest)<-rulesdescdf[,3] dim(rulestrain) rulesmat<-rbind(rulestrain,rulestest) rulesmat<-rulesmat[match(rownames(K), rownames(rulesmat)),] Krules<-cov(t(rulesmat)) Krules<-Krules/mean(diag(Krules)) outemmremlKrules<-emmreml(y=Phenotypestrain[,3], X=Xtrain, Z=Ztrain, K=Krules) outemmremlrules<-emmreml(y=Phenotypestrain[,3], X=Xtrain, Z=rulestrain, K=diag(ncol(rulestrain))) library(glmnet) cvglmnetout<-cv.glmnet(rulestrain, Phenotypestrain[,3], alpha=.9) glmnetcoefs<-coef(cvglmnetout, s = "lambda.min") outvarimp<-variableimportance(outemmremlobj=outemmremlrules, rulesmat=rulesmat, markernames=paste("X0",c("pc1","pc2","pc3",colnames(Mmapped)),sep=""), GBLUP=F ,ruleeffects=NULL, K=NULL, mc.cores=mc.cores) outvarimpglmnet<-variableimportance(outemmremlobj=NULL, rulesmat=rulesmat, markernames=paste("X0",c("pc1","pc2","pc3",colnames(Mmapped)),sep=""), GBLUP=F ,ruleeffects=glmnetcoefs[-1], K=NULL, mc.cores=mc.cores) tablerules<-table(rulesdescdf[,1]) colvecrules<-c() for (ii in 1:length(tablerules)){ colvecrules<-c(colvecrules,rep(ii,tablerules[ii])) } tablemarkers<-table(mrkMap[,2]) colvecmarkers<-c() for (ii in 1:length(tablemarkers)){ colvecmarkers<-c(colvecmarkers,rep(ii,tablemarkers[ii])) } ######################### topmarkers<-names(colSums(outvarimp$importances))[order(abs(colSums(outvarimp$importances)), decreasing=T)[1:nimpmarkers]] ####now sort the top markers with respect to the map topmarkerslistEMMA[[iii]]<-(ans.emmax[order(ans.emmax[,4], decreasing=T),1])[1:nimpmarkers] topmarkerslistLER[[iii]]<-topmarkers } save(topmarkerslistEMMA,topmarkerslistLER, file="simresultsasso.RData" ) emmaassociation<-c() lerassociation<-c() for (i in 1:100){ emmaassociation<-rbind(emmaassociation,allmarkers%in%topmarkerslistEMMA[[i]]) lerassociation<-rbind(lerassociation,paste("X0",allmarkers,sep="")%in%topmarkerslistLER[[i]]) } colSums(emmaassociation) colSums(lerassociation)