# Functions described in
# "Bias Correction and Bayesian Analysis of Aggregate Counts in SAGE Libraries"
# by Zaretzki, Gilchrist, Briggs and Armagan
# Copyright 2009

rdirichlet <- function(n, alpha){
  #Random Dirichlet generator, as in the MCMCpack library in R.
  l <- length(alpha)
  x <- matrix(rgamma(l * n, alpha), ncol = l, byrow = TRUE)
  sm <- x %*% rep(1, l)
  return(x / as.vector(sm))
}

dpb <- function(Y, simsize = 1500, subsize = 20, burnin = 500, type = "flat",
                Nprior.shape = 100, Nprior.scale = 200){
  #DPB Gibbs sampling approach.
  #We assume the observed counts T_i are Binomial(g_i, phi_i), the g_i are
  #Poisson with mean N*m_i, the m_i follow a Dirichlet distribution, and N
  #follows a Gamma distribution and is rounded since it should be a whole number.
  #Data are contained in the data frame Y, which should have two columns:
  #Y$Tags, the aggregated tag counts, and Y$Phi, the estimated tag-formation
  #probabilities.
  #simsize is the total number of samples stored.  subsize = 20 means that only
  #every 20th sample is kept, so subsize*simsize iterations are actually run.
  #burnin is the number of burn-in samples; it is included in the total, so here
  #500 of the 1500 stored samples are treated as burn-in and dropped from the
  #returned summaries.  type describes the Dirichlet prior alpha: "flat" or "tub".
  #Nprior.shape and Nprior.scale are described in Zaretzki et al. (2009) and set
  #the approximate average population size, E[N] ~ Nprior.shape*Nprior.scale.

  sizeY <- dim(Y)   #Y$Tags are the observed counts.
  #Full storage
  Nsamp <- matrix(0, simsize, 1)
  msamp <- matrix(0, sizeY[1], simsize)
  gsamp <- matrix(0, sizeY[1], simsize)
  #Sub storage
  Nvals <- matrix(0, subsize, 1)
  mvals <- matrix(0, sizeY[1], subsize)
  gvals <- matrix(0, sizeY[1], subsize)
  #Initial values
  alpha <- rep(1, sizeY[1])
  if(type != "flat") alpha <- (1/sizeY[1]) * alpha
  Nvals[1]  <- round(sum(Y$Tags/Y$Phi))
  mvals[,1] <- alpha/sizeY[1]
  gvals[,1] <- round(Y$Tags/Y$Phi)
  #Gamma(Nprior.shape, Nprior.scale) prior for N; defaults are set in the
  #argument list.
  #Nprior.shape<-100
  #Nprior.scale<-200
  for(i in 1:simsize){
    for(k in 2:subsize){
      lam <- Nvals[(k-1)] * mvals[,(k-1)] * (1 - Y$Phi)
      gvals[,k] <- Y$Tags + rpois(sizeY[1], lam)                        #g_k conditional
      Nshape <- sum(gvals[,k]) + Nprior.shape + 1
      Nvals.rate <- (1 + 1/Nprior.scale)
      Nvals[k] <- round(rgamma(1, shape = Nshape, rate = Nvals.rate))   #N conditional
      mvals[,k] <- rgamma(sizeY[1], shape = (gvals[,k] + alpha), rate = Nvals[k])   #m conditional
    }
    gvals[,1] <- gsamp[,i] <- gvals[,subsize]
    Nvals[1]  <- Nsamp[i]  <- Nvals[subsize]
    mvals[,1] <- msamp[,i] <- mvals[,subsize]
    # if( i%%200 == 0 ) print(i)
  }
  #Summaries: one row per tag, columns give the posterior mean and the
  #2.5/50/97.5 percentiles of m_i.
  mean.samp <- apply(msamp[, (burnin+1):simsize], 1, mean)
  cuts <- c(.025, .5, .975)
  m.pctiles <- apply(msamp[, (burnin+1):simsize], 1, quantile, probs = cuts)
  answer <- data.frame(t(rbind(mean.samp, m.pctiles)))
  return(answer)
}
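
#----------------------------------------------------------------------
#Illustrative sketch, not part of the original script: a hypothetical
#helper showing the input format dpb() expects.  The data frame built
#below is simulated (an assumption, for illustration only) -- Y$Tags holds
#aggregated tag counts and Y$Phi the estimated tag-formation
#probabilities -- and the run settings simply echo the defaults.
dpb.example <- function(seed = 1){
  set.seed(seed)
  n.tags <- 50
  #Simulated stand-in for real SAGE data.
  Y.sim <- data.frame(Tags = rpois(n.tags, lambda = 40),
                      Phi  = runif(n.tags, 0.6, 0.95))
  fit <- dpb(Y.sim, simsize = 1500, subsize = 20, burnin = 500, type = "flat")
  #One row per tag: posterior mean and the 2.5/50/97.5 percentiles of m_i.
  head(fit)
}
#----------------------------------------------------------------------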
dmb <- function(Y, simsize = 1500, subsize = 20, burnin = 500, type = "flat"){
  #DMB Gibbs sampling approach.
  #We assume the observed counts T_i are Binomial(g_i, phi_i), the g_i are
  #Multinomial(N, m), the m_i follow a Dirichlet distribution, and N follows a
  #Gamma distribution and is rounded since it should be a whole number.
  #Data are contained in the data frame Y, which should have two columns:
  #Y$Tags, the aggregated tag counts, and Y$Phi, the estimated tag-formation
  #probabilities.
  #simsize is the total number of samples stored.  subsize = 20 means that only
  #every 20th sample is kept, so subsize*simsize iterations are actually run.
  #burnin is the number of burn-in samples; it is included in the total, so here
  #500 of the 1500 stored samples are treated as burn-in and dropped from the
  #returned summaries.  type describes the Dirichlet prior alpha: "flat" or "tub".

  sizeY <- dim(Y)   #Y$Tags are the observed counts.
  T.tot <- sum(Y$Tags)
  #Full storage
  Nsamp <- matrix(0, simsize, 1)
  msamp <- matrix(0, sizeY[1], simsize)
  gsamp <- matrix(0, sizeY[1], simsize)
  #Sub storage
  Nvals <- matrix(0, subsize, 1)
  mvals <- matrix(0, sizeY[1], subsize)
  gvals <- matrix(0, sizeY[1], subsize)
  #Initial values
  alpha <- rep(1, sizeY[1])
  if(type != "flat") alpha <- (1/sizeY[1]) * alpha
  N.parm <- round(sum(Y$Tags/Y$Phi))
  Nvals  <- rpois(subsize, N.parm)
  mvals[,1] <- alpha/sizeY[1]
  gvals[,1] <- round(Y$Tags/Y$Phi)
  #library(MCMCpack)
  for(i in 1:simsize){
    for(k in 2:subsize){
      gvals[,k] <- Y$Tags + rmultinom(1, (Nvals[k-1] - T.tot), ((1 - Y$Phi) * mvals[,(k-1)]))   #g_k conditional
      mvals[,k] <- rdirichlet(1, (gvals[,k] + alpha))                                           #m conditional
      #Nvals[k]<-rpois(1,N.parm)   #N conditional (the pre-drawn Nvals are used instead)
    }
    gvals[,1] <- gsamp[,i] <- gvals[,subsize]
    Nsamp[i]  <- Nvals[subsize]
    Nvals     <- rpois(subsize, N.parm)   #refresh the N draws for the next sub-chain
    mvals[,1] <- msamp[,i] <- mvals[,subsize]
    #if( i%%200 == 0 ) print(i)
  }
  #Summaries: one row per tag, columns give the posterior mean and the
  #2.5/50/97.5 percentiles of m_i.
  mean.samp <- apply(msamp[, (burnin+1):simsize], 1, mean)
  cuts <- c(.025, .5, .975)
  m.pctiles <- apply(msamp[, (burnin+1):simsize], 1, quantile, probs = cuts)
  answer <- data.frame(t(rbind(mean.samp, m.pctiles)))
  return(answer)
}

md <- function(Y, simsize = 1500, subsize = 20, burnin = 500, type = "flat",
               lambda = 20000, Phi.c){
  #Uses data augmentation by assuming that the SAGE process produces "r" unformed
  #tags, where r follows a Poisson distribution.  Given "r", we can use a
  #Dirichlet-multinomial posterior distribution to do inference for the
  #proportions.
  #Data are contained in the data frame Y, which should have two columns:
  #Y$Tags, the aggregated tag counts, and Y$Phi, the estimated tag-formation
  #probabilities.
  #simsize is the total number of samples stored.  subsize = 20 means that only
  #every 20th sample is kept, so subsize*simsize iterations are actually run.
  #burnin is the number of burn-in samples; it is included in the total, so here
  #500 of the 1500 stored samples are treated as burn-in and dropped from the
  #returned summaries.  type describes the Dirichlet prior alpha: "flat" or "tub".
  #lambda scales the Poisson mean for the number of unformed tags r.
  #Phi.c is a single overall tag-formation probability; it has no usable default
  #and must be supplied (the original argument list read Phi.c=Phi.c, which
  #errors when the argument is omitted).

  sizeY <- dim(Y)   #Y$Tags are the observed counts.
  #Full storage
  rsamp <- matrix(0, simsize, 1)
  msamp <- matrix(0, sizeY[1], simsize)
  #gsamp<-matrix(0,sizeY[1],simsize)
  #Sub storage
  rvals <- matrix(0, subsize, 1)
  mvals <- matrix(0, sizeY[1], subsize)
  #gvals<-matrix(0,sizeY[1],subsize)
  #Initial values
  alpha <- rep(1, sizeY[1])
  if(type != "flat") alpha <- (1/sizeY[1]) * alpha
  mvals[,1] <- alpha/sizeY[1]
  rvals[1]  <- 10000
  mu <- sum(Y$Tags) * ((1 - Phi.c)/Phi.c)   #initial value; overwritten inside the loop
  for(i in 1:simsize){
    for(k in 2:subsize){
      postvec <- c(Y$Tags + alpha, (rvals[(k-1)] + 1))
      theta <- rdirichlet(1, postvec)                 #m*phi conditional
      mtemp <- theta[1:sizeY[1]]/Y$Phi
      mvals[,k] <- mtemp/sum(mtemp)
      mu <- (lambda * (1 - sum(theta[1:sizeY[1]])))
      rvals[k] <- rpois(1, mu)                        #r conditional
    }
    rvals[1]  <- rsamp[i]  <- rvals[subsize]
    mvals[,1] <- msamp[,i] <- mvals[,subsize]
    # if( i%%100 == 0 ) print(rsamp[i])
  }
  #Summaries: one row per tag, columns give the posterior mean and the
  #2.5/50/97.5 percentiles of m_i.
  mean.samp <- apply(msamp[, (burnin+1):simsize], 1, mean)
  cuts <- c(.025, .5, .975)
  m.pctiles <- apply(msamp[, (burnin+1):simsize], 1, quantile, probs = cuts)
  answer <- data.frame(t(rbind(mean.samp, m.pctiles)))
  return(answer)
}
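
#----------------------------------------------------------------------
#Illustrative sketch, not part of the original script: a hypothetical
#helper showing how dmb() and md() might be called on the same kind of
#simulated data frame used in dpb.example().  Passing Phi.c as the mean
#of Y$Phi is an assumption made only for this example; use whatever
#overall tag-formation probability is appropriate for the library.
dmb.md.example <- function(seed = 1){
  set.seed(seed)
  n.tags <- 50
  Y.sim <- data.frame(Tags = rpois(n.tags, lambda = 40),
                      Phi  = runif(n.tags, 0.6, 0.95))
  fit.dmb <- dmb(Y.sim, simsize = 1500, subsize = 20, burnin = 500, type = "flat")
  fit.md  <- md(Y.sim, simsize = 1500, subsize = 20, burnin = 500,
                type = "flat", lambda = 20000, Phi.c = mean(Y.sim$Phi))
  #Compare posterior means of the proportions from the two samplers.
  data.frame(dmb.mean = fit.dmb$mean.samp, md.mean = fit.md$mean.samp)
}
#----------------------------------------------------------------------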