############################################################## # Program to test method of identifying unusual clusters in ## # survey data starting with simulated clustered data ## # Author: Elizabeth Handorf # Fox Chase Cancer Center # Email: elizabeth.handorf@fccc.edu # Supplementary material for PLOS-One manuscript # "A hierarchical clustering approach to identify repeated enrollments in web survey data" library(cluster) library(fpc) library(mvtnorm) library(GenOrd) library(grDevices) library(Matrix) start_time <- Sys.time() setwd('My Directory') load("S1_File.RData") #simulation parameters #Some code to extract the inputs from the command line, modified from #http://yangfeng.wordpress.com/2009/09/03/including-arguments-in-r-cmd-batch-mode/ ##First read in the arguments listed at the command line args=(commandArgs(TRUE)) ##args is now a list of character vectors ## First check to see if arguments are passed. ## Then cycle through each element of the list and evaluate the expressions. if(length(args)==0){ print("No arguments supplied.") ##supply default values seed <- 9999999 #autocorrelation of off-block-diagonals in clusters rho<- 0.8 threshold<-0.95 }else{ for(i in 1:length(args)){ eval(parse(text=args[[i]])) } } #m=number of simulated "null" datasets to use m<-1000 set.seed(seed) #number in the cluster n.tot=5000 n.clst1=70 n.unclst=n.tot - n.clst1 #Unclustered indivudals - simulate as usual simdat.unclst<-ordsample(n.unclst, marg.unclst, cor.unclst, Spearman=TRUE) #Now generate MV Norm samples for the cluster #Sigma - block matrix, #Block diagonal = cor.clst1 #Off-diagnoal blocks= cor.clst1*rho offdiag<-cor.clst1*rho Sigma.clst1.offdiagtmp<-do.call("rbind", rep(list(offdiag), 70)) Sigma.clst1.offdiag<-do.call("cbind", rep(list(Sigma.clst1.offdiagtmp), 70)) Sigma.clst1.diag<-as.matrix(.bdiag(rep(list(cor.clst1),n.clst1))) Sigma.clst1<-Sigma.clst1.diag*(1-rho)+Sigma.clst1.offdiag #generate all observations at once using the full correlation matrix, #then split into appropriate rows/columns simdat.clst1.tmp<-matrix(rmvnorm(1,sigma=Sigma.clst1),ncol=9) #Apply probability integral transformation to get ordinal values simdat.clst1<-pnorm(simdat.clst1.tmp) #cut at marginal cdfs for(i in 1:9) { simdat.clst1[,i]<-cut(simdat.clst1[,i],breaks=c(0,marg.clst1[[i]],1),labels=FALSE) } simdat.comb<-rbind(simdat.clst1,simdat.unclst) simdat.comb<-data.frame(simdat.comb) names(simdat.comb)<-c("burneasily","darktan","freckles","moles","sunburn","age","climate","haircolor","skincolor") #indicator for which group they are a member of simdat.comb$cluster.number<-c(rep(1,n.clst1),rep(0,n.unclst)) #Screener score simdat.comb$score<-3*(simdat.comb[,1]==2)+ 3*(simdat.comb[,2]==1)+2*(simdat.comb[,2]==2)+1*(simdat.comb[,2]==3)+ 2*(simdat.comb[,3]==2)+4*(simdat.comb[,3]==3)+ 5*(simdat.comb[,4]==2)+10*(simdat.comb[,4]==3)+20*(simdat.comb[,4]==4)+30*(simdat.comb[,4]==5)+ 1*(simdat.comb[,5]==2)+2*(simdat.comb[,5]==3)+3*(simdat.comb[,5]==4)+4*(simdat.comb[,5]==5)+ 5*(simdat.comb[,7]==2)+10*(simdat.comb[,7]==3)+ 4*(simdat.comb[,8]==1)+3*(simdat.comb[,8]==2)+2*(simdat.comb[,8]==3)+1*(simdat.comb[,8]==4)+ 20*(simdat.comb[,9]==1)+18*(simdat.comb[,9]==2)+16*(simdat.comb[,9]==3)+4*(simdat.comb[,9]==4)+2*(simdat.comb[,9]==5) #simulate enrollment order, grouping cluster 1 in time simdat.comb$enrolltime<-c(runif(n.clst1,min=0.9,max=0.905), #equivalent to enrolling over 2 days during a 365 day enrollment period runif(n.unclst,min=0, max=1)) simdat.comb<-simdat.comb[order(simdat.comb$enrolltime),] simdat.comb$enrollorder<-rep(1:n.tot) simdat.comb$age<-simdat.comb$age+17 #rescale to actual age #Probability of completing based on glm model from original data r.bin<-function(p) {rbinom(1, 1, p)} pr.complete<-predict(m.included,newdata=simdat.comb,type="response") simdat.comb$completed<-sapply(pr.complete,r.bin) #Final dataset with eligible people who completed baseline simdat.final.sam<-simdat.comb[simdat.comb$score>=27 & simdat.comb$completed==1,] #How many from each cluster wound up in the final dataset? table(simdat.final.sam$cluster.number) ############################################################### #### Step 1 ################################################### #### Run standard clustering on the simulated dataset ############ dat2<-simdat.final.sam[,c(1:9,13)] #Euclidean distance #Normalize to have unit variance SDs<-apply(dat2,2,sd) vec<-as.vector(dat2[1,]) dat3<- as.matrix(dat2) %*% diag(1/SDs) rownames(dat3)<-simdat.final.sam[,13] dist.obj<-dist(dat3) clust.obj.true.data<-hclust(dist.obj) #function to determine cluster properties at a given height threshold get.clusters<-function(clust.obj, height) { memb <- cutree(clust.obj, h = height) #This generates a matrix with the following elements: row number=order of clusters (L to R) #clst.cnt=total number of subjects in that cluster #id1=cluster number (from cutree) #idLR = cluster number from left to right in plot clst.cnt<-table(memb[clust.obj$order]) id1<-unique(memb[clust.obj$order]) summary.clust<-cluster.stats(dist.obj,memb) tmp<-cbind(idLR=c(1:length(id1)),id1,clst.size=clst.cnt[id1]) #Now determine which clusters have the smallest within-cluster distance tmp2<-cbind(tmp[order(tmp[,2]),], avg.dist=summary.clust$average.distance, size=summary.clust$cluster.size, med.dist=summary.clust$median.distance,max.dist=summary.clust$diameter, sill = summary.clust$clus.avg.silwidths) tmp2<-tmp2[order(tmp2[,4]),] tmp2<-data.frame(tmp2) tmp2$comb<-tmp2$sill/max(tmp2$sill)+ tmp2$size/max(tmp2$size) return(tmp2) } #run with varying hieghts clust.true.4.5<-get.clusters(clust.obj = clust.obj.true.data, height=4.5) clust.true.5.0<-get.clusters(clust.obj = clust.obj.true.data, height=5.0) clust.true.5.5<-get.clusters(clust.obj = clust.obj.true.data, height=5.5) #Select the height with the fewest negative sil widths prop.neg.sill<-c(mean(clust.true.4.5$sill<0), mean(clust.true.5.0$sill<0), mean(clust.true.5.5$sill<0)) #Note: if they are equal, choose the highest threshold (resulting in the smallest number of clusters) if (prop.neg.sill[3] == min(prop.neg.sill) ) { clust.true.sim<-clust.true.5.5 clust.height<-5.5 }else if (prop.neg.sill[2] == min(prop.neg.sill) ) { clust.true.sim<-clust.true.5.0 clust.height<-5.0 }else if (prop.neg.sill[1] == min(prop.neg.sill) ) { clust.true.sim<-clust.true.4.5 clust.height<-4.5 } #save the number of observations in the "true" simulated data n.final.sam<-dim(simdat.final.sam)[1] ############################################################### #### Step 2 ################################################### #### Find multivariate distirbution of clustering vars ######## #### (starting with the full set) ############################# #ineligible? sum(simdat.comb$type==3 & simdat.comb$score>=27) #didn't complete baseline? sum(simdat.comb$type==2 & simdat.comb$completed.baseline==0) simdat.comb.good.score<-simdat.comb[simdat.comb$score>=27,] simdat.comb.good.score$burneasily<-simdat.comb.good.score$burneasily #model the probability of entering the study IF they had a valid score m.included<-glm(completed ~ as.factor(burneasily) + as.factor(darktan) + as.factor(freckles)+as.factor(moles)+as.factor(sunburn)+as.factor(age)+ as.factor(climate)+as.factor(haircolor)+as.factor(skincolor) ,data=simdat.comb.good.score, family="binomial") #Simulation parameters #Correlation cor.matrix.obs<-cor(simdat.comb[,2:10],method="spearman") #proportions responding in each likert item t1<-prop.table(table(simdat.comb$burneasily)) t2<-prop.table(table(simdat.comb$darktan)) t3<-prop.table(table(simdat.comb$freckles)) t4<-prop.table(table(simdat.comb$moles)) t5<-prop.table(table(simdat.comb$sunburn)) t6<-prop.table(table(simdat.comb$age)) t7<-prop.table(table(simdat.comb$climate)) t8<-prop.table(table(simdat.comb$haircolor)) t9<-prop.table(table(simdat.comb$skincolor)) #Cumulative proportions t1.cum<-cumsum(as.vector(t1)) t2.cum<-cumsum(as.vector(t2)) t3.cum<-cumsum(as.vector(t3)) t4.cum<-cumsum(as.vector(t4)) t5.cum<-cumsum(as.vector(t5)) t6.cum<-cumsum(as.vector(t6)) t7.cum<-cumsum(as.vector(t7)) t8.cum<-cumsum(as.vector(t8)) t9.cum<-cumsum(as.vector(t9)) #Transform to quantiles of the normal distribution v1.cutoffs<-c(-Inf,qnorm(t1.cum)) v2.cutoffs<-c(-Inf,qnorm(t2.cum)) v3.cutoffs<-c(-Inf,qnorm(t3.cum)) v4.cutoffs<-c(-Inf,qnorm(t4.cum)) v5.cutoffs<-c(-Inf,qnorm(t5.cum)) v6.cutoffs<-c(-Inf,qnorm(t6.cum)) v7.cutoffs<-c(-Inf,qnorm(t7.cum)) v8.cutoffs<-c(-Inf,qnorm(t8.cum)) v9.cutoffs<-c(-Inf,qnorm(t9.cum)) #Correct the correlation matrix given the observed marginals marginal<-list( t1.cum[-length(t1.cum)], t2.cum[-length(t2.cum)], t3.cum[-length(t3.cum)], t4.cum[-length(t4.cum)], t5.cum[-length(t5.cum)], t6.cum[-length(t6.cum)], t7.cum[-length(t7.cum)], t8.cum[-length(t8.cum)], t9.cum[-length(t9.cum)] ) cor.matrix.adjusted<-contord(marginal, cor.matrix.obs) ############################################################### #### Step 3-4 ################################################# #### Simulate M sets of N samples under independence ########## #function to draw from bernoulli with probability p r.bin<-function(p) {rbinom(1, 1, p)} fun.sim.good.scores<-function() { dat.sim.tmp<-ordsample(6500, marginal, cor.matrix.obs, Spearman=TRUE) dat.sim.tmp[,6]<-dat.sim.tmp[,6]+17 #only valid screener scores score<-3*(dat.sim.tmp[,1]==2)+ 3*(dat.sim.tmp[,2]==1)+2*(dat.sim.tmp[,2]==2)+1*(dat.sim.tmp[,2]==3)+ 2*(dat.sim.tmp[,3]==2)+4*(dat.sim.tmp[,3]==3)+ 5*(dat.sim.tmp[,4]==2)+10*(dat.sim.tmp[,4]==3)+20*(dat.sim.tmp[,4]==4)+30*(dat.sim.tmp[,4]==5)+ 1*(dat.sim.tmp[,5]==2)+2*(dat.sim.tmp[,5]==3)+3*(dat.sim.tmp[,5]==4)+4*(dat.sim.tmp[,5]==5)+ 5*(dat.sim.tmp[,7]==2)+10*(dat.sim.tmp[,7]==3)+ 4*(dat.sim.tmp[,8]==1)+3*(dat.sim.tmp[,8]==2)+2*(dat.sim.tmp[,8]==3)+1*(dat.sim.tmp[,8]==4)+ 20*(dat.sim.tmp[,9]==1)+18*(dat.sim.tmp[,9]==2)+16*(dat.sim.tmp[,9]==3)+4*(dat.sim.tmp[,9]==4)+2*(dat.sim.tmp[,9]==5) dat.sim.tmp2<-as.data.frame(dat.sim.tmp[score>=27,]) names(dat.sim.tmp2)<-c("burneasily","darktan","freckles","moles", "sunburn", "age", "climate","haircolor","skincolor") #Probability of completing based on glm model pr.complete<-predict(m.included,newdata=dat.sim.tmp2,type="response") sim.completed<-sapply(pr.complete,r.bin) #Select observations based on random probability of completing dat.sim.tmp3<-dat.sim.tmp2[sim.completed==1,] #Make a final dataset with the same number of observations as the "true" dataset (includ error catching if total N is smaller than 1234) dat.sim.fin<-dat.sim.tmp3[1:min(dim(dat.sim.tmp3)[1],n.final.sam),] dat.sim.fin$order<-sample(simdat.final.sam$enrollorder)[1:min(dim(dat.sim.tmp3)[1],n.final.sam)] return(dat.sim.fin) } i<-1 simulated.data.ind<-fun.sim.good.scores() simulated.data.ind$rep<-rep(i,dim(simulated.data.ind)[1]) many.sims<-simulated.data.ind for (i in 2: m) { simulated.data.ind<-fun.sim.good.scores() simulated.data.ind$rep<-rep(i,dim(simulated.data.ind)[1]) many.sims<-rbind(many.sims,simulated.data.ind) } ############################################################### #### Step 5 ################################################### #### Hierarchical clustering applied to all simulated datasets #Maximum size and sil width from the clusters identified in the true data true.max.size<-max(clust.true.sim$size) true.max.sill<-max(clust.true.sim$sill) max.size<-max.sill<-max.comb<-rep(NA,m) #Run over all simulated datasets for (i in 1: m) { dat.sim<-many.sims[many.sims$rep==i,c(10,1:9)] SDs<-apply(dat.sim,2,sd) vec<-as.vector(dat.sim[1,]) dat.sim.a<- as.matrix(dat.sim) %*% diag(1/SDs) rownames(dat.sim.a)<-simdat.final.sam[,1] dist.obj<-dist(dat.sim.a) clust.obj.sim<-hclust(dist.obj) clust.sim.cut<-get.clusters(clust.obj = clust.obj.sim, height=4.5) clust.sim.cut$comb<-clust.sim.cut$sill/true.max.sill + clust.sim.cut$size/true.max.size max.size[i]<-max(clust.sim.cut$size) max.sill[i]<-max(clust.sim.cut$sill) max.comb[i]<-max(clust.sim.cut$comb) } end_time <- Sys.time() end_time - start_time ############################################################### #### Step 6 ################################################### #### Compare simulated results to those from the true data #### #For each of the clusters in the true data - what proportion of the #max simulated combined scores are greater than the observed score? clust.true.sim$comb.pval<-rep(NA,dim(clust.true.sim)[1]) for ( i in 1: dim(clust.true.sim)[1]) { clust.true.sim$comb.pval[i]<-sum(clust.true.sim$comb[i]>max.comb)/m } clust.true.sim ##which clusters were identified clust.chosen<-clust.true.sim$id1[clust.true.sim$comb.pval>=threshold] # Identify PIDs in the cluster of interest, compare with "true" clustered data clust.memb <- cutree(clust.obj.true.data, h = clust.height) #Note that clust.memb=id1 from the results table- this is the cluster id #and is different form "idLR" which is used to identify clusters in the plotted dendrogram clust.suspicious<-clust.memb[clust.memb %in% clust.chosen] clust.ids<-as.numeric(labels(clust.suspicious)) #how many are really in the cluster? true.n.clst<- sum(simdat.final.sam$cluster.number) if (length(clust.ids)>0) { results<-data.frame(clust.ids, identified=1) #merge back in with individual data to summarize results simdat.final.sam.results<-merge(simdat.final.sam,results, by.x="enrollorder", by.y="clust.ids", all=T) simdat.final.sam.results$identified[is.na(simdat.final.sam.results$identified)]<-0 summary.table<-table(simdat.final.sam.results$cluster.number,simdat.final.sam.results$identified) summary.table #Results of this simulation n.clusters.selected<-length(clust.chosen) prop.TP<- (summary.table[2,2])/sum(simdat.final.sam.results$cluster.number ==1) prop.TN<- summary.table[1,1]/sum(simdat.final.sam.results$cluster.number ==0) size.c1<-sum(simdat.final.sam.results$cluster.number ==1) res<-data.frame(seed, rho, n.clusters.selected, n.final.sam, true.n.clst, size.c1, prop.TP,prop.TN) }else { res<-data.frame(seed, rho, 0, n.final.sam,true.n.clst, 0, 0 , 1)} setwd('My results directory') filename<-paste("tempres", rho, "_", threshold, "_", seed, ".csv", sep="") write.table(res,filename, col.names =FALSE,append=TRUE, sep = ",", row.names = FALSE) end_time <- Sys.time() end_time - start_time