############################################################## # Part 1: #################################################### # Program to identify unusual clusters in survey data ######## # Author: Elizabeth Handorf # Fox Chase Cancer Center # Email: elizabeth.handorf@fccc.edu # Supplementary material for PLOS-One manuscript # "A hierarchical clustering approach to identify repeated enrollments in web survey data" library(cluster) library(fpc) library(copula) library(mvtnorm) library(GenOrd) library(grDevices) setwd("C:\\My directory") #Read in cluster input file dat<-read.csv("S1_Dataset.csv") #read in quality outcomes file qual.outcomes<-read.csv("S2_Dataset.csv",header=T) final.PIDs<-qual.outcomes$PID #quality outcomes of the final sample dat.final.sam<-dat[dat$PID %in% final.PIDs,] dat.final.sam<-dat.final.sam[order(dat.final.sam$PID),] ############################################################### #### Step 1 ################################################### #### Run standard clustering on the actual dataset ############ dat2<-dat.final.sam[,c(1:10)] #Euclidean distance #Normalize to have unit variance SDs<-apply(dat2,2,sd) vec<-as.vector(dat2[1,]) dat3<- as.matrix(dat2) %*% diag(1/SDs) rownames(dat3)<-dat.final.sam[,1] dist.obj<-dist(dat3) clust.obj.true.data<-hclust(dist.obj) #function to determine cluster properties at a given height threshold get.clusters<-function(clust.obj, height) { memb <- cutree(clust.obj, h = height) #This generates a matrix with the following elements: row number=order of clusters (L to R) #clst.cnt=total number of subjects in that cluster #id1=cluster number (from cutree) #idLR = cluster number from left to right in plot clst.cnt<-table(memb[clust.obj$order]) id1<-unique(memb[clust.obj$order]) summary.clust<-cluster.stats(dist.obj,memb) tmp<-cbind(idLR=c(1:length(id1)),id1,clst.size=clst.cnt[id1]) #Now determine which clusters have the smallest within-cluster distance tmp2<-cbind(tmp[order(tmp[,2]),], avg.dist=summary.clust$average.distance, size=summary.clust$cluster.size, med.dist=summary.clust$median.distance,max.dist=summary.clust$diameter, sill = summary.clust$clus.avg.silwidths) tmp2<-tmp2[order(tmp2[,4]),] tmp2<-data.frame(tmp2) tmp2$comb<-tmp2$sill/max(tmp2$sill)+ tmp2$size/max(tmp2$size) return(tmp2) } #run with varying hieghts clust.true.4.5<-get.clusters(clust.obj = clust.obj.true.data, height=4.5) clust.true.5.0<-get.clusters(clust.obj = clust.obj.true.data, height=5.0) clust.true.5.5<-get.clusters(clust.obj = clust.obj.true.data, height=5.5) #Examine the sil widths mean(clust.true.4.5$sill<0) mean(clust.true.5.0$sill<0) mean(clust.true.5.5$sill<0) range(clust.true.4.5$sill) range(clust.true.5.0$sill) range(clust.true.5.5$sill) #Of the 3 options, 4.5 seems best (fewest negative sil widths) #Note: Lower threshold not considered as this already produces 80 clusters #Create Figure 1 par(mfrow=c(1,1)) pdf("dendrogram.pdf") plot(clust.obj.true.data,hang=-1,cex=.4,labels=F,main="Dendrogram - clustering of screener responses", xlab="") rect.hclust(clust.obj.true.data,h=4.5,which=c(7,78),border="red") dev.off() ############################################################### #### Step 2 ################################################### #### Find multivariate distirbution of clustering vars ######## #### (starting with the full set) ############################# #ineligible? sum(dat$type==3 & dat$obs.score>=27) #didn't complete baseline? sum(dat$type==2 & dat$completed.baseline==0) dat.good.score<-dat[dat$obs.score>=27,] dat.good.score$burneasily<-dat.good.score$burneasily+1 #model the probability of being included IF they had a valid score m.included<-glm(completed.baseline ~ as.factor(burneasily) + as.factor(darktan) + as.factor(freckles)+as.factor(moles)+as.factor(sunburn)+as.factor(age)+ as.factor(climate)+as.factor(haircolor)+as.factor(skincolor) ,data=dat.good.score, family="binomial") #Simulation parameters #Correlation cor.matrix.obs<-cor(dat[,2:10],method="spearman") #proportions responding in each likert item t1<-prop.table(table(dat$burneasily)) t2<-prop.table(table(dat$darktan)) t3<-prop.table(table(dat$freckles)) t4<-prop.table(table(dat$moles)) t5<-prop.table(table(dat$sunburn)) t6<-prop.table(table(dat$age)) t7<-prop.table(table(dat$climate)) t8<-prop.table(table(dat$haircolor)) t9<-prop.table(table(dat$skincolor)) #Cumulative proportions t1.cum<-cumsum(as.vector(t1)) t2.cum<-cumsum(as.vector(t2)) t3.cum<-cumsum(as.vector(t3)) t4.cum<-cumsum(as.vector(t4)) t5.cum<-cumsum(as.vector(t5)) t6.cum<-cumsum(as.vector(t6)) t7.cum<-cumsum(as.vector(t7)) t8.cum<-cumsum(as.vector(t8)) t9.cum<-cumsum(as.vector(t9)) #Transform to quantiles of the normal distribution v1.cutoffs<-c(-Inf,qnorm(t1.cum)) v2.cutoffs<-c(-Inf,qnorm(t2.cum)) v3.cutoffs<-c(-Inf,qnorm(t3.cum)) v4.cutoffs<-c(-Inf,qnorm(t4.cum)) v5.cutoffs<-c(-Inf,qnorm(t5.cum)) v6.cutoffs<-c(-Inf,qnorm(t6.cum)) v7.cutoffs<-c(-Inf,qnorm(t7.cum)) v8.cutoffs<-c(-Inf,qnorm(t8.cum)) v9.cutoffs<-c(-Inf,qnorm(t9.cum)) #Correct the correlation matrix given the observed marginals marginal<-list( t1.cum[-length(t1.cum)], t2.cum[-length(t2.cum)], t3.cum[-length(t3.cum)], t4.cum[-length(t4.cum)], t5.cum[-length(t5.cum)], t6.cum[-length(t6.cum)], t7.cum[-length(t7.cum)], t8.cum[-length(t8.cum)], t9.cum[-length(t9.cum)] ) cor.matrix.adjusted<-contord(marginal, cor.matrix.obs) ############################################################### #### Step 3-4 ################################################# #### Simulate M sets of N samples under independence ########## #function to draw from bernoulli with probability p r.bin<-function(p) {rbinom(1, 1, p)} fun.sim.good.scores<-function() { dat.sim.tmp<-ordsample(6500, marginal, cor.matrix.obs, Spearman=TRUE) dat.sim.tmp[,6]<-dat.sim.tmp[,6]+17 #only valid screener scores score<-3*(dat.sim.tmp[,1]==2)+ 3*(dat.sim.tmp[,2]==1)+2*(dat.sim.tmp[,2]==2)+1*(dat.sim.tmp[,2]==3)+ 2*(dat.sim.tmp[,3]==2)+4*(dat.sim.tmp[,3]==3)+ 5*(dat.sim.tmp[,4]==2)+10*(dat.sim.tmp[,4]==3)+20*(dat.sim.tmp[,4]==4)+30*(dat.sim.tmp[,4]==5)+ 1*(dat.sim.tmp[,5]==2)+2*(dat.sim.tmp[,5]==3)+3*(dat.sim.tmp[,5]==4)+4*(dat.sim.tmp[,5]==5)+ 5*(dat.sim.tmp[,7]==2)+10*(dat.sim.tmp[,7]==3)+ 4*(dat.sim.tmp[,8]==1)+3*(dat.sim.tmp[,8]==2)+2*(dat.sim.tmp[,8]==3)+1*(dat.sim.tmp[,8]==4)+ 20*(dat.sim.tmp[,9]==1)+18*(dat.sim.tmp[,9]==2)+16*(dat.sim.tmp[,9]==3)+4*(dat.sim.tmp[,9]==4)+2*(dat.sim.tmp[,9]==5) dat.sim.tmp2<-as.data.frame(dat.sim.tmp[score>=27,]) names(dat.sim.tmp2)<-c("burneasily","darktan","freckles","moles", "sunburn", "age", "climate","haircolor","skincolor") #Probability of completing based on glm model pr.complete<-predict(m.included,newdata=dat.sim.tmp2,type="response") sim.completed<-sapply(pr.complete,r.bin) #Select observations based on random probability of completing dat.sim.tmp3<-dat.sim.tmp2[sim.completed==1,] #Make a final dataset with first 1234 observations (includ error catching if total N is smaller than 1234) dat.sim.fin<-dat.sim.tmp3[1:min(dim(dat.sim.tmp3)[1],1234),] dat.sim.fin$order<-sample(final.PIDs)[1:min(dim(dat.sim.tmp3)[1],1234)] return(dat.sim.fin) } set.seed(1179658) i<-1 simulated.data<-fun.sim.good.scores() simulated.data$rep<-rep(i,dim(simulated.data)[1]) write.table(simulated.data,"simulated_sets.csv", sep = ",", row.names = FALSE) for (i in 2: 1000) { simulated.data<-fun.sim.good.scores() simulated.data$rep<-rep(i,dim(simulated.data)[1]) write.table(simulated.data,"simulated_sets.csv", col.names =FALSE,append=TRUE, sep = ",", row.names = FALSE) } ############################################################### #### Step 5 ################################################### #### Hierarchical clustering applied to all simulated datasets # Optionally: Re-load the final version of the file many.sims<-read.csv("simulated_sets.csv") #Maximum size and sil width from the clusters identified in the true data w.size.4.5<-max(clust.true.4.5$size) w.sill.4.5<-max(clust.true.4.5$sill) max.size.4.5<-max.sill.4.5<-max.comb.4.5<-rep(NA,1000) #Run over all simulated datasets for (i in 1: 1000) { dat.sim<-many.sims[many.sims$rep==i,c(10,1:9)] SDs<-apply(dat.sim,2,sd) vec<-as.vector(dat.sim[1,]) dat.sim.a<- as.matrix(dat.sim) %*% diag(1/SDs) rownames(dat.sim.a)<-dat.final.sam[,1] dist.obj<-dist(dat.sim.a) clust.obj.sim<-hclust(dist.obj) clust.sim.4.5<-get.clusters(clust.obj = clust.obj.sim, height=4.5) clust.sim.4.5$comb<-clust.sim.4.5$sill/w.sill.4.5 + clust.sim.4.5$size/w.size.4.5 max.size.4.5[i]<-max(clust.sim.4.5$size) max.sill.4.5[i]<-max(clust.sim.4.5$sill) max.comb.4.5[i]<-max(clust.sim.4.5$comb) } ############################################################### #### Step 6 ################################################### #### Compare simulated results to those from the true data #### #For each of the clusters in the true data - what proportion of the #max simulated combined scores are greater than the observed score? clust.true.4.5$comb.pval<-rep(NA,dim(clust.true.4.5)[1]) for ( i in 1: dim(clust.true.4.5)[1]) { clust.true.4.5$comb.pval[i]<-sum(clust.true.4.5$comb[i]>max.comb.4.5)/1000 } ### Save results #### save.image("Section2_results.Rdata") ###Plot Figure 2 # Use Example simulation (simulation 1) #Re-run the clustering algorith and plot eample simulated results against true results dat.sim1<-many.sims[many.sims$rep==1,c(10,1:9)] SDs<-apply(dat.sim1,2,sd) vec<-as.vector(dat.sim1[1,]) dat.sim1.a<- as.matrix(dat.sim1) %*% diag(1/SDs) rownames(dat.sim1.a)<-dat.final.sam[,1] dist.obj<-dist(dat.sim1.a) clust.obj.sim1<-hclust(dist.obj) clust.sim1.4.5<-get.clusters(clust.obj = clust.obj.sim1, height=4.5) #Create Figure 2 png(file = "sim_vs_obs.png", width = 6.5, height = 8, units="in", res=600) #Comparing observed data to sim 1 par(mfrow=c(2,1), mar=(c(5, 4, 3, 1) + 0.1)) xmax <- max(c(clust.sim1.4.5$sill,clust.true.4.5$sill)) xmin <- min(c(clust.sim1.4.5$sill,clust.true.4.5$sill)) ymax <- max(c(clust.sim1.4.5$size,clust.true.4.5$size)) plot(clust.sim1.4.5$sill,clust.sim1.4.5$size, pch=19, col="darkgray", xlim=c(xmin,xmax),ylim=c(0,ymax),xlab="Sil width",ylab="Cluster size", main="A") points(clust.true.4.5$sill[clust.true.4.5$comb.pval<0.95], clust.true.4.5$size[clust.true.4.5$comb.pval<0.95], pch=19, col="black") points(clust.true.4.5$sill[clust.true.4.5$comb.pval>=0.95], clust.true.4.5$size[clust.true.4.5$comb.pval>=0.95], pch=13, col="black", lwd=2,cex=1.5) legend("topright",c("1 simulated sample","observed","observed (outliers)"), col=c("darkgray","black","black"),pch=c(19,19,13), cex=0.75 ) top5.vals<-clust.true.4.5$comb[order(clust.true.4.5$comb,decreasing=T)][1:3] hist(max.comb.4.5, col="gray", xlim=c(0.6,max(clust.true.4.5$comb)), main="B", ylab="Simulation frequency", xlab="Combined Size and Sil Width (SSW)", border="white") points(top5.vals,rep(50,3), cex=c(1.5,1.5,1), pch=c(13,13,19), lwd=2) legend("topright",c("simulated max values"), bty="n", fill=c("gray"),border="white",cex=0.75) dev.off() #print out table of results for each observed cluster print(clust.true.4.5) ##### Finally, identify PIDs in the 2 clusters of interest clust.memb <- cutree(clust.obj.true.data, h = 4.5) clst.cnt<-table(clust.memb[clust.obj.true.data$order]) #Note that clust.memb=id1 from the results table- this is the cluster id #and is different form "idLR" which is used to identify clusters in the plotted dendrogram clust.suspicious<-clust.memb[clust.memb==64 | clust.memb==49] clust.ids<-as.numeric(labels(clust.suspicious)) clust.suspicious2<-cbind(clust.ids, as.numeric(clust.suspicious)) ####### Notes - testing the clustering method if we include ####### those with known poor quality (dropped) #Changes which need to be made to code: #Define the "observed sample" using this code: #dat.final.sam<-dat[dat$PID %in% final.PIDs | (dat$type==1 & dat$completed.baseline==1),] #Change number of people in final simulated sets to 1491 #Use this code: #dat.sim.fin<-dat.sim.tmp3[1:min(dim(dat.sim.tmp3)[1],1491),] #dat.sim.fin$order<-sample(dat.final.sam$PID)[1:min(dim(dat.sim.tmp3)[1],1491)] #Note that all cluster number will change, and would also need to be updated #Subject with known repeat enrollments: #known.repeat.enroller.PIDs<-c(3151,3147,3224,3285,4417,4550,4670,4674,4679,4755,4783,4816, #4920,4921,5857,3170,3211,3296,3409,3476,3572,3610,4683,4802,4894,4915,5008,5049,5746,3173, #3176,3192,3193,3195,3202,3205,3208,3210,3219,3322,3332,3414,3483,3578,3598,3601,4140,4307, #4424,4431,4544,4911,5729) #Additional cluster found if code is run with the additional 257 subjects #clust.suspicious3<-clust.memb[clust.memb==52] #clust.ids3<-as.numeric(labels(clust.suspicious3)) #compare to known repeater #sum(known.repeat.enroller.PIDs %in% clust.ids3)