################################################################################# # Biometrics- 150118PR # # This r-code is provided by HYOYOUNG CHOO-WOSOBA # # This r-code is a part of the real data analysis section in the paper including# # MES estimates with bootstrap-base-standard errors # # This r-code includes codes for Cardinal Research Clueter (CRC) server lines # ################################################################################# args <- commandArgs(trailingOnly = TRUE) # Cluster code seq <- as.numeric(args[1]) # Cluster code library(MASS) library(pscl) library(matrixcalc) caries <- read.csv("/home/h0choo01/caries_bs_CorrectZ/caries.csv", header=TRUE) # Z function Z<- function(xmat,b,v, max) { lambda <- exp(xmat%*%b) # Compute the terms used to sum for the (in)finite summation forans <- matrix(0,ncol=max+1,nrow=length(lambda)) for (j in 1:max){ temp <- matrix(0,ncol=j,nrow=length(lambda)) for (i in 1:j){temp[,i] <- lambda/(i^c(v))} for (k in 1:length(lambda)){forans[k,j+1] <- prod(temp[k,])} } forans[,1] <- rep(1,length(lambda)) # Determine the (in)finite sum ans <- rowSums(forans) return(ans) } # Calculate estimating u eu_cmp <- function(xmat,zmat,beta,gamma,y,v,max) { eu <- matrix(0,nrow(y),1) p <- exp(zmat%*%gamma)/(1+exp(zmat%*%gamma)) exp_u <- p/(p+(1-p)/Z(xmat,beta,v,max)) for (i in 1:nrow(y)){ eu[i,] <- ifelse(y[i]==0,exp_u[i],0) } return(eu) } # Functions for MES estimation RowbyRow<-function(A, b){ # b is a vector, A is a matrix temp <-A for (i in 1:length(A[,1])) {temp[i,]<-A[i,]*b[i]} return(temp) } prod.b2 <- function(xmat, b, v,max) { l <- exp(xmat%*%b) smat <- matrix(0,length(l), max) for(i in 1:length(l)) { for( j in 1: max) { smat[i,j] <- l[i]/j^v }} temp <- matrix(NA, length(l),max) for (i in 1:length(l)) { for (k in 1: max) { temp[i,k] <- k^2*prod(smat[i,1:k]) } } res <- apply(temp, 1, sum) return(res) } prod.b1 <- function(xmat, b, v,max) { l <- exp(xmat%*%b) smat <- matrix(0,length(l), max) for(i in 1:length(l)) { for( j in 1: max) { smat[i,j] <- l[i]/j^v }} temp <- matrix(NA, length(l),max) for (i in 1:length(l)) { for (k in 1: max) { temp[i,k] <- k*prod(smat[i,1:k]) } } res <- apply(temp, 1, sum) return(res) } prod.dlv1 <- function(xmat, b,v,max) { l <- exp(xmat%*%b) smat <- matrix(0,length(l), max) for(i in 1:length(l)) { for( j in 1: max) { smat[i,j] <- l[i]/j^v }} temp <- matrix(NA, length(l),max) for (i in 1:length(l)) { for (k in 1: max) { temp[i,k] <- k*(log(factorial(k)))*prod(smat[i,1:k]) } } res <- apply(temp, 1, sum) return(res) } prod.v2 <- function(xmat, b,v,max) { l <- exp(xmat%*%b) smat <- matrix(0,length(l), max) for(i in 1:length(l)) { for( j in 1: max) { smat[i,j] <- l[i]/j^v }} temp <- matrix(NA, length(l),max) for (i in 1:length(l)) { for (k in 1: max) { temp[i,k] <- (log(factorial(k)))^2*prod(smat[i,1:k]) } } res <- apply(temp, 1, sum) return(res) } prod.v1 <- function(xmat, b,v,max) { l <- exp(xmat%*%b) smat <- matrix(0,length(l), max) for(i in 1:length(l)) { for( j in 1: max) { smat[i,j] <- l[i]/j^v }} temp <- matrix(NA, length(l),max) for (i in 1:length(l)) { for (k in 1: max) { temp[i,k] <- (log(factorial(k)))*prod(smat[i,1:k]) } } res <- apply(temp, 1, sum) return(res) } var.w <- function(xmat,beta,v,max){ temp1 <- prod.b2(xmat,beta,v,max)/Z(xmat,beta,v,max) temp2 <- prod.b1(xmat,beta,v,max)/Z(xmat,beta,v,max) temp <- temp1-temp2^2 res <- diag(temp) return(res) } e.w <- function(xmat,beta,v,max){ prod.b1(xmat,beta,v,max)/Z(xmat,beta,v,max) } # derivative terms of estimators by using complete pseudo loglikelihood based form for MES estimates ddlv <- function(xmat,zmat,b,g,y,v,max){ uhat <- eu_cmp(xmat,zmat,b,g,y,v,max) wv1 <- prod.v2(xmat, b,v,max)/Z(xmat,b,v,max) wv2 <- (prod.v1(xmat, b,v,max)/Z(xmat,b,v,max))^2 v <- wv1-wv2 res <- v*(1-uhat) return(-sum(res)) } dlvv<- function(xmat,zmat,b,g,y,v,max) { p <- exp(zmat%*%g)/(1+exp(zmat%*%g)) uhat <- eu_cmp(xmat,zmat,b,g,y,v,max) temp1 <- -log(factorial(y))+(prod.v1(xmat, b,v,max)/Z(xmat,b,v,max)) res <- (1-uhat)*temp1 return(t(res)) } dpg <- function(zmat, gamma) { # derivative term of gamma temp <- exp(zmat%*%gamma) temp1 <- RowbyRow(zmat,temp) temp2 <- (1+temp)^2 res <- RowbyRow(temp1,1/temp2) return(res) } dlb <- function(xmat,beta,v,max) { # derivative term of beta temp1 <- prod.b2(xmat,beta,v,max)/Z(xmat,beta,v,max) temp2 <- prod.b1(xmat,beta,v,max)/Z(xmat,beta,v,max) temp3 <- temp1-(temp2)^2 res <- RowbyRow(xmat, temp3) return(res) } dlv<- function(xmat, beta,v,max) { # derivative term of v temp1 <- as.matrix(prod.dlv1(xmat,beta,v,max)/Z(xmat,beta,v,max)) temp2 <- as.matrix(prod.b1(xmat,beta,v,max))*as.matrix(prod.v1(xmat,beta,v,max))/((Z(xmat,beta,v,max))^2) res <- temp2-temp1 return(res) } prod_dlv2 <- function(xmat, b,v, max) { l <- exp(xmat%*%b) smat <- matrix(0,length(l), max) for(i in 1:length(l)) { for( j in 1: max) { smat[i,j] <- l[i]/j^v }} temp <- matrix(NA, length(l),max) for (i in 1:length(l)) { for (k in 1: max) { temp[i,k] <- k^2*(log(factorial(k)))*prod(smat[i,1:k]) } } res <- apply(temp, 1, sum) return(res) } r <- 0.25 # setup the tuning parameter for a modified Newton-Raphson algorithm var_gamma <- function(zmat,gamma) { p <- exp(zmat%*%gamma)/(1+exp(zmat%*%gamma)) return(diag(c(RowbyRow(p,1-p)))) } # create a function for correlation coeff. for zero-inf. part corr_zero1 <- function(eu,p) { uist <- NULL for(i in 1:(length(y)-1)) { for (j in (i+1):length(y)) { uist0 <- (eu[i,]-p[i,])*(eu[j,]-p[j,])/sqrt(p[i,]*(1-p[i,])*p[j,]*(1-p[j,])) uist <- rbind(uist0,uist) } } return(sum(uist)) } corr_zero2 <- function(eu,p) { std_u <- (eu-p)^2/(p*(1-p)) return(sum(std_u)) } # create a function for correlation coeff. for CMP part corr_cmp2 <- function(eu,beta0,v0) { l <- e.w(xmat,beta0,v0,max) res <- (1-eu)^2*(y-l)^2/as.matrix(diag(var.w(xmat,beta0,v0,max))) return(sum(res)) } # calculate the derivative term of rho and delta ddel <- function(csize){ dde1 <- NULL for( i in 2:(csize-1)) { a1 <- matrix(c(NA,rep(1,csize-i)),csize-i+1,1) dde1 <- rbind(dde1, a1) } dde2 <- matrix(c(NA,rep(1,csize-1)),csize,1) ddel <- rbind(dde2,dde1,NA) return(ddel) } corr_zero_bs1 <- function(eu,p) { uist <- NULL for(i in 1:(length(y)-1)) { for (j in (i+1):length(y)) { uist0 <- (eu[i,]-p[i,])*(eu[j,]-p[j,])/sqrt(p[i,]*(1-p[i,])*p[j,]*(1-p[j,])) uist <- rbind(uist0,uist) } } return(sum(uist)) } # pseudo log likelihood function for MPL estimates used for initial values of MES estimates logL_cmp <- function(parm) { bhat <- as.matrix(parm[1:9]) ghat <- as.matrix(parm[10:18]) vhat <- parm[19] p <- exp(zmat%*%ghat)/(1+exp(zmat%*%ghat)) l <- exp(xmat%*%bhat) yidx <- ifelse(y==0,1,0) l_0 <- log(p+(1-p)/Z(xmat,bhat,vhat,100)) l_c <- log((1-p)*l^y/((factorial(y)^c(vhat))*Z(xmat,bhat,vhat,100))) lsum <- sum(yidx*l_0)+sum((1-yidx)*l_c) return(-lsum) } set.seed(588+seq) B <- 1 bsparm <- matrix(NA,21,B) for ( BS in 1:B) { id <- sample(unique(caries$SID), replace=TRUE, size=length(unique(caries$SID))) bs_data <- NULL for( i in 1: length(id)){ bs_data <- rbind(bs_data,caries[which(caries$SID==id[i]),]) } onevec <- as.matrix(rep(1,nrow(bs_data)) ) y <- as.matrix(bs_data$CariesCount) all_x <- bs_data[,9:16] newx <- data.frame(int=onevec,all_x) xmat <- as.matrix(cbind(newx[,c(1,9)],newx[,2:8])) zmat <- xmat cs_bs <- matrix(NA,length(id),1) for (k in 1:length(id)){ cs_bs[k,] <- nrow(caries[which(caries$SID==id[k]),]) } N <- length(unique(caries$SID)) N1 <- sum(cs_bs*(cs_bs-1)/2) nt <- nrow(bs_data) id_bs <- NULL for(i in 1:nrow(cs_bs)) { idx <- as.matrix(rep(i,cs_bs[i])) id_bs <- rbind(id_bs, idx) } id_bs <- as.matrix(id_bs) # Get the initial value by using zeroinfl in R mZIP_bs <- zeroinfl(formula=CariesCount~ Gender + DentalExamAge + AUCmgF5_9yrs + AUCSodaOz5_9yrs + ToothBrushingFreqPerDayAvg + DentalVisitPast6moAvg + FluorideTreatment6moAvg + HomeFluorideppmAvg, dist = "poisson", data =bs_data) beta0 <- as.matrix(summary(mZIP_bs)$coefficients$count[,1]) gamma0 <-as.matrix(summary(mZIP_bs)$coefficients$zero[,1]) v0 <- 1 mlebs_parm <- optim(c(beta0,gamma0,v0), logL_cmp,control=list(maxit=30000), hessian=TRUE) beta0 <- as.matrix(c(mlebs_parm$par[1:9])) gamma0 <- as.matrix(c(mlebs_parm$par[10:18])) v0 <- c(mlebs_parm$par[19]) delta0 <- 0.5 rho0 <- 0.5 allx <- cbind(id_bs, xmat) ally <- cbind(id_bs, as.numeric(paste(bs_data$CariesCount))) colnames(allx)[1] <- "id" colnames(ally) <- c("id","CariesCount") max <- 100 res_bsparm <- matrix(NA, 21,100) for (j in 1: 100) { all_ddg <- matrix(0,length(gamma0),length(gamma0)) all_geeg <- matrix(0,length(gamma0),1) all_ddb <- matrix(0,length(beta0),length(beta0)) all_geeb <- matrix(0,length(beta0),1) all_del1 <- 0 all_del2 <- 0 all_rho1 <- 0 all_rho2 <- 0 N2 <- 0 ntot <- 0 for (k in 1:N) { xmat <- as.matrix(subset(allx,allx[,1]==k))[,-1] zmat <- xmat y <- as.matrix(as.matrix(subset(ally,ally[,1]==k))[,-1]) p0 <- exp(zmat%*%gamma0)/(1+exp(zmat%*%gamma0)) eu0 <- eu_cmp(xmat,zmat,beta0,gamma0,y,v0,max) ntot1 <- sum((1-eu0)^2) ntot <- ntot+ntot1 N20 <- as.vector(1-eu0)%*%t(as.vector(1-eu0)) N21 <- upper.triangle(N20) diag(N21) <- 0 N22 <- sum(N21) N2 <- N2 + N22 ew0 <- e.w(xmat,beta0,v0,max) cor_u <- matrix(delta0,cs_bs[k],cs_bs[k])+ diag(1-c(delta0),cs_bs[k],cs_bs[k]) wu0 <- sqrt(var_gamma(zmat,gamma0))%*%cor_u%*%sqrt(var_gamma(zmat,gamma0)) # cov-var of u cor_y <- matrix(rho0,cs_bs[k],cs_bs[k])+ diag(1-c(rho0),cs_bs[k],cs_bs[k]) wy0 <- sqrt(var.w(xmat,beta0,v0,max))%*%cor_y%*%sqrt(var.w(xmat,beta0,v0,max)) # update gamma estimate temp_g <- t(dpg(zmat,gamma0))%*%solve(wu0) geeg <- as.matrix(apply(RowbyRow(t(temp_g),eu0-p0),2,sum)) temp_g1 <- t(dpg(zmat,gamma0))%*%solve(wu0)%*%dpg(zmat,gamma0) + geeg%*%t(geeg) all_ddg <- all_ddg + temp_g1 ## sum upto N for derivative of gee for gamma all_geeg <- all_geeg + geeg ## sum upto N for gee for gamma # update delta estimate all_del1 <- all_del1 + corr_zero_bs1(eu0,p0) all_del2 <- all_del2 + corr_zero2(eu0,p0) # update beta estimate temp_b <- dlb(xmat,beta0,v0,max) temp_b1 <- t(temp_b)%*%solve(wy0)%*%diag(c(1-eu0)) geeb <- as.matrix(apply(RowbyRow(t(temp_b1),(y-e.w(xmat,beta0,v0,max))),2,sum)) temp_b2 <- t(temp_b)%*%solve(wy0)%*%diag(c(1-eu0))%*%temp_b + geeb%*%t(geeb) all_geeb <- all_geeb + geeb all_ddb <- all_ddb + temp_b2 # update rho estimate (cor.coefficient for y) vary <- as.matrix(diag(var.w(xmat,beta0,v0,max))) temp_rho <- cbind(ddel(cs_bs[k]),vech((1-eu0)%*%t(1-eu0)),vech((y-e.w(xmat,beta0,v0,max))%*%t(y-e.w(xmat,beta0,v0,max))),vech(vary%*%t(vary)) ) temp_rho <- na.omit(temp_rho) temp_rho1 <- temp_rho[,2]*temp_rho[,3]/sqrt(temp_rho[,4]) all_rho1 <- all_rho1 + sum(temp_rho1) all_rho2 <- all_rho2 + corr_cmp2(eu0,beta0,v0) } all_xmat <- as.matrix(allx[,-1]) all_zmat <- all_xmat all_ys <- as.matrix(ally[,2]) v1 <- v0 - r*sum(dlvv(all_xmat,all_zmat,beta0,gamma0,all_ys,v0,max))/(ddlv(all_xmat,all_zmat,beta0,gamma0,all_ys,v0,max)+ sum(dlvv(all_xmat,all_zmat,beta0,gamma0,all_ys,v0,max))^2) ########## updating scheme ############# if(j==1){ all_geeg1 <- all_geeg} else{ all_geeg1 <- ((j-1)*all_geeg1+all_geeg)/j } if(j==1){ all_ddg1 <- all_ddg} else{ all_ddg1 <- ((j-1)*all_ddg1+all_ddg)/j } if(j==1){ all_geeb1 <- all_geeb} else{ all_geeb1 <- ((j-1)*all_geeb1+all_geeb)/j } if(j==1){ all_ddb1 <- all_ddb} else{ all_ddb1 <- ((j-1)*all_ddb1+all_ddb)/j } gnew0 <- gamma0 + r*solve(all_ddg1)%*%all_geeg1 gnew <- (j*gamma0+gnew0)/(j+1) del_new <- (all_del1/N1)/(all_del2/nt) beta11 <- beta0 + r*solve(all_ddb1)%*%all_geeb1 beta1 <- (j*beta0+beta11)/(j+1) rho_new <- (all_rho1/N2)/(all_rho2/ntot) vnew <- (j*v0+v1)/(j+1) res_bsparm[,j] <- matrix(c(j,beta1,gnew,vnew, max(abs(gnew-gamma0),abs(beta1-beta0),abs(vnew-v0))),length(c(beta1,gnew,vnew))+2,1) if(max(abs(gnew-gamma0),abs(beta1-beta0),abs(vnew-v0)) < 0.01 | j==100) { break } gamma0<- gnew beta0<- beta1 delta0 <- del_new rho0 <- rho_new v0 <- vnew cat(c(res_bsparm[,j]),"\n") } bsparm[,BS] <- res_bsparm[,j] } file.name <- paste("bscaries_CorrectZ=",seq,".RData", sep="") save.image(file.name) q()