##############################################################
# Part 1: ####################################################
# Program to identify unusual clusters in survey data ########
# Author: Elizabeth Handorf
# Fox Chase Cancer Center
# Email: elizabeth.handorf@fccc.edu
# Supplementary material for PLOS-One manuscript
# "A hierarchical clustering approach to identify repeated enrollments in web survey data" 


library(cluster)
library(fpc)
library(copula)
library(mvtnorm)
library(GenOrd)
library(grDevices)


setwd("C:\\My directory")

#Read in cluster input file
dat<-read.csv("S1_Dataset.csv")

#read in quality outcomes file
qual.outcomes<-read.csv("S2_Dataset.csv",header=T)
final.PIDs<-qual.outcomes$PID

#quality outcomes of the final sample
dat.final.sam<-dat[dat$PID %in% final.PIDs,]
dat.final.sam<-dat.final.sam[order(dat.final.sam$PID),]

###############################################################
#### Step 1 ###################################################
#### Run standard clustering on the actual dataset ############


dat2<-dat.final.sam[,c(1:10)]

#Euclidean distance
#Normalize to have unit variance
SDs<-apply(dat2,2,sd)

vec<-as.vector(dat2[1,])

dat3<-    as.matrix(dat2) %*% diag(1/SDs)

rownames(dat3)<-dat.final.sam[,1]
dist.obj<-dist(dat3)

clust.obj.true.data<-hclust(dist.obj)

#function to determine cluster properties at a given height threshold

get.clusters<-function(clust.obj, height) {
  
  memb <- cutree(clust.obj, h = height)
  
  #This generates a matrix with the following elements: row number=order of clusters (L to R)
  #clst.cnt=total number of subjects in that cluster
  #id1=cluster number (from cutree)
  #idLR = cluster number from left to right in plot
  
  clst.cnt<-table(memb[clust.obj$order])
  id1<-unique(memb[clust.obj$order])
  summary.clust<-cluster.stats(dist.obj,memb)
  tmp<-cbind(idLR=c(1:length(id1)),id1,clst.size=clst.cnt[id1])
  
  #Now determine which clusters have the smallest within-cluster distance
  tmp2<-cbind(tmp[order(tmp[,2]),], avg.dist=summary.clust$average.distance,
              size=summary.clust$cluster.size,
              med.dist=summary.clust$median.distance,max.dist=summary.clust$diameter,
              sill = summary.clust$clus.avg.silwidths)
  
  tmp2<-tmp2[order(tmp2[,4]),]
  tmp2<-data.frame(tmp2)
  
  tmp2$comb<-tmp2$sill/max(tmp2$sill)+
    tmp2$size/max(tmp2$size)
  
  return(tmp2)
}

#run with varying hieghts
clust.true.4.5<-get.clusters(clust.obj = clust.obj.true.data, height=4.5)
clust.true.5.0<-get.clusters(clust.obj = clust.obj.true.data, height=5.0)
clust.true.5.5<-get.clusters(clust.obj = clust.obj.true.data, height=5.5)

#Examine the sil widths
mean(clust.true.4.5$sill<0)
mean(clust.true.5.0$sill<0)
mean(clust.true.5.5$sill<0)

range(clust.true.4.5$sill)
range(clust.true.5.0$sill)
range(clust.true.5.5$sill)

#Of the 3 options, 4.5 seems best (fewest negative sil widths)
#Note: Lower threshold not considered as this already produces 80 clusters


#Create Figure 1
par(mfrow=c(1,1))

pdf("dendrogram.pdf")

plot(clust.obj.true.data,hang=-1,cex=.4,labels=F,main="Dendrogram - clustering of screener responses", xlab="")
rect.hclust(clust.obj.true.data,h=4.5,which=c(7,78),border="red")
dev.off()


###############################################################
#### Step 2 ###################################################
#### Find multivariate distirbution of clustering vars ########
#### (starting with the full set) #############################


#ineligible?
sum(dat$type==3 & dat$obs.score>=27)

#didn't complete baseline?
sum(dat$type==2 & dat$completed.baseline==0)

dat.good.score<-dat[dat$obs.score>=27,]

dat.good.score$burneasily<-dat.good.score$burneasily+1

#model the probability of being included IF they had a valid score
m.included<-glm(completed.baseline ~ as.factor(burneasily) + as.factor(darktan) +
                  as.factor(freckles)+as.factor(moles)+as.factor(sunburn)+as.factor(age)+
                  as.factor(climate)+as.factor(haircolor)+as.factor(skincolor)
                ,data=dat.good.score, family="binomial")

#Simulation parameters

#Correlation 
cor.matrix.obs<-cor(dat[,2:10],method="spearman")


#proportions responding in each likert item
t1<-prop.table(table(dat$burneasily))
t2<-prop.table(table(dat$darktan))
t3<-prop.table(table(dat$freckles))
t4<-prop.table(table(dat$moles))
t5<-prop.table(table(dat$sunburn))
t6<-prop.table(table(dat$age))
t7<-prop.table(table(dat$climate))
t8<-prop.table(table(dat$haircolor))
t9<-prop.table(table(dat$skincolor))


#Cumulative proportions
t1.cum<-cumsum(as.vector(t1))
t2.cum<-cumsum(as.vector(t2))
t3.cum<-cumsum(as.vector(t3))
t4.cum<-cumsum(as.vector(t4))
t5.cum<-cumsum(as.vector(t5))
t6.cum<-cumsum(as.vector(t6))
t7.cum<-cumsum(as.vector(t7))
t8.cum<-cumsum(as.vector(t8))
t9.cum<-cumsum(as.vector(t9))


#Transform to quantiles of the normal distribution
v1.cutoffs<-c(-Inf,qnorm(t1.cum))
v2.cutoffs<-c(-Inf,qnorm(t2.cum))
v3.cutoffs<-c(-Inf,qnorm(t3.cum))
v4.cutoffs<-c(-Inf,qnorm(t4.cum))
v5.cutoffs<-c(-Inf,qnorm(t5.cum))
v6.cutoffs<-c(-Inf,qnorm(t6.cum))
v7.cutoffs<-c(-Inf,qnorm(t7.cum))
v8.cutoffs<-c(-Inf,qnorm(t8.cum))
v9.cutoffs<-c(-Inf,qnorm(t9.cum))


#Correct the correlation matrix given the observed marginals
marginal<-list(
  t1.cum[-length(t1.cum)],
  t2.cum[-length(t2.cum)],
  t3.cum[-length(t3.cum)],
  t4.cum[-length(t4.cum)],
  t5.cum[-length(t5.cum)],
  t6.cum[-length(t6.cum)],
  t7.cum[-length(t7.cum)],
  t8.cum[-length(t8.cum)],
  t9.cum[-length(t9.cum)]
)
cor.matrix.adjusted<-contord(marginal, cor.matrix.obs)


###############################################################
#### Step 3-4 #################################################
#### Simulate M sets of N samples under independence ##########

#function to draw from bernoulli with probability p
r.bin<-function(p) {rbinom(1, 1, p)}

fun.sim.good.scores<-function() {

  dat.sim.tmp<-ordsample(6500, marginal, cor.matrix.obs, Spearman=TRUE)
  
  dat.sim.tmp[,6]<-dat.sim.tmp[,6]+17
  #only valid screener scores
  score<-3*(dat.sim.tmp[,1]==2)+
    3*(dat.sim.tmp[,2]==1)+2*(dat.sim.tmp[,2]==2)+1*(dat.sim.tmp[,2]==3)+
    2*(dat.sim.tmp[,3]==2)+4*(dat.sim.tmp[,3]==3)+
    5*(dat.sim.tmp[,4]==2)+10*(dat.sim.tmp[,4]==3)+20*(dat.sim.tmp[,4]==4)+30*(dat.sim.tmp[,4]==5)+
    1*(dat.sim.tmp[,5]==2)+2*(dat.sim.tmp[,5]==3)+3*(dat.sim.tmp[,5]==4)+4*(dat.sim.tmp[,5]==5)+
    5*(dat.sim.tmp[,7]==2)+10*(dat.sim.tmp[,7]==3)+
    4*(dat.sim.tmp[,8]==1)+3*(dat.sim.tmp[,8]==2)+2*(dat.sim.tmp[,8]==3)+1*(dat.sim.tmp[,8]==4)+
    20*(dat.sim.tmp[,9]==1)+18*(dat.sim.tmp[,9]==2)+16*(dat.sim.tmp[,9]==3)+4*(dat.sim.tmp[,9]==4)+2*(dat.sim.tmp[,9]==5)
  
  
  dat.sim.tmp2<-as.data.frame(dat.sim.tmp[score>=27,])
  
  names(dat.sim.tmp2)<-c("burneasily","darktan","freckles","moles",   
                         "sunburn",
                         "age",
                         "climate","haircolor","skincolor")  
  
  #Probability of completing based on glm model
  pr.complete<-predict(m.included,newdata=dat.sim.tmp2,type="response")
  
  sim.completed<-sapply(pr.complete,r.bin)
  
  #Select observations based on random probability of completing
  dat.sim.tmp3<-dat.sim.tmp2[sim.completed==1,]
  
  #Make a final dataset with first 1234 observations (includ error catching if total N is smaller than 1234)
  dat.sim.fin<-dat.sim.tmp3[1:min(dim(dat.sim.tmp3)[1],1234),]
  dat.sim.fin$order<-sample(final.PIDs)[1:min(dim(dat.sim.tmp3)[1],1234)]
  
  return(dat.sim.fin)
  
}


set.seed(1179658)
i<-1
simulated.data<-fun.sim.good.scores()
simulated.data$rep<-rep(i,dim(simulated.data)[1])
write.table(simulated.data,"simulated_sets.csv", sep = ",", row.names = FALSE)

for (i in 2: 1000) {
  simulated.data<-fun.sim.good.scores()
  simulated.data$rep<-rep(i,dim(simulated.data)[1])
  write.table(simulated.data,"simulated_sets.csv", col.names =FALSE,append=TRUE, sep = ",", row.names = FALSE)
  
}

###############################################################
#### Step 5 ###################################################
#### Hierarchical clustering applied to all simulated datasets

# Optionally: Re-load the final version of the file
many.sims<-read.csv("simulated_sets.csv")

#Maximum size and sil width from the clusters identified in the true data
w.size.4.5<-max(clust.true.4.5$size)
w.sill.4.5<-max(clust.true.4.5$sill)

max.size.4.5<-max.sill.4.5<-max.comb.4.5<-rep(NA,1000)

#Run over all simulated datasets
for (i in 1: 1000) {
  dat.sim<-many.sims[many.sims$rep==i,c(10,1:9)]
  SDs<-apply(dat.sim,2,sd)
  vec<-as.vector(dat.sim[1,])
  dat.sim.a<-    as.matrix(dat.sim) %*% diag(1/SDs)
  rownames(dat.sim.a)<-dat.final.sam[,1]
  dist.obj<-dist(dat.sim.a)
  
  clust.obj.sim<-hclust(dist.obj)
  
  clust.sim.4.5<-get.clusters(clust.obj = clust.obj.sim, height=4.5)
  
  clust.sim.4.5$comb<-clust.sim.4.5$sill/w.sill.4.5 + clust.sim.4.5$size/w.size.4.5 
  
  max.size.4.5[i]<-max(clust.sim.4.5$size)
  max.sill.4.5[i]<-max(clust.sim.4.5$sill)
  max.comb.4.5[i]<-max(clust.sim.4.5$comb)
  
  
}

###############################################################
#### Step 6 ###################################################
#### Compare simulated results to those from the true data ####

#For each of the clusters in the true data - what proportion of the 
#max simulated combined scores are greater than the observed score?

clust.true.4.5$comb.pval<-rep(NA,dim(clust.true.4.5)[1])
for ( i in 1: dim(clust.true.4.5)[1]) {
  clust.true.4.5$comb.pval[i]<-sum(clust.true.4.5$comb[i]>max.comb.4.5)/1000
}

### Save results ####
save.image("Section2_results.Rdata")

###Plot Figure 2

# Use Example simulation (simulation 1)
#Re-run the clustering algorith and plot eample simulated results against true results
dat.sim1<-many.sims[many.sims$rep==1,c(10,1:9)]
SDs<-apply(dat.sim1,2,sd)
vec<-as.vector(dat.sim1[1,])
dat.sim1.a<-    as.matrix(dat.sim1) %*% diag(1/SDs)
rownames(dat.sim1.a)<-dat.final.sam[,1]
dist.obj<-dist(dat.sim1.a)

clust.obj.sim1<-hclust(dist.obj)

clust.sim1.4.5<-get.clusters(clust.obj = clust.obj.sim1, height=4.5)

#Create Figure 2

png(file = "sim_vs_obs.png",
    width = 6.5, height = 8, units="in", res=600)

#Comparing observed data to sim 1
par(mfrow=c(2,1), mar=(c(5, 4, 3, 1) + 0.1))

xmax <- max(c(clust.sim1.4.5$sill,clust.true.4.5$sill))
xmin <- min(c(clust.sim1.4.5$sill,clust.true.4.5$sill))
ymax <- max(c(clust.sim1.4.5$size,clust.true.4.5$size))

plot(clust.sim1.4.5$sill,clust.sim1.4.5$size, pch=19, col="darkgray", 
     xlim=c(xmin,xmax),ylim=c(0,ymax),xlab="Sil width",ylab="Cluster size",
     main="A")
points(clust.true.4.5$sill[clust.true.4.5$comb.pval<0.95],
       clust.true.4.5$size[clust.true.4.5$comb.pval<0.95], pch=19, col="black")
points(clust.true.4.5$sill[clust.true.4.5$comb.pval>=0.95],
       clust.true.4.5$size[clust.true.4.5$comb.pval>=0.95], pch=13, col="black", lwd=2,cex=1.5)
legend("topright",c("1 simulated sample","observed","observed (outliers)"), 
       col=c("darkgray","black","black"),pch=c(19,19,13), cex=0.75 )

top5.vals<-clust.true.4.5$comb[order(clust.true.4.5$comb,decreasing=T)][1:3]

hist(max.comb.4.5, col="gray", xlim=c(0.6,max(clust.true.4.5$comb)),  main="B",
     ylab="Simulation frequency", xlab="Combined Size and Sil Width (SSW)", border="white")

points(top5.vals,rep(50,3), cex=c(1.5,1.5,1), pch=c(13,13,19), lwd=2)


legend("topright",c("simulated max values"), bty="n", fill=c("gray"),border="white",cex=0.75)

dev.off()

#print out table of results for each observed cluster
print(clust.true.4.5)


##### Finally, identify PIDs in the 2 clusters of interest 
clust.memb <- cutree(clust.obj.true.data, h = 4.5)
clst.cnt<-table(clust.memb[clust.obj.true.data$order])

#Note that clust.memb=id1 from the results table- this is the cluster id 
#and is different form "idLR" which is used to identify clusters in the plotted dendrogram
clust.suspicious<-clust.memb[clust.memb==64 | clust.memb==49]

clust.ids<-as.numeric(labels(clust.suspicious))

clust.suspicious2<-cbind(clust.ids, as.numeric(clust.suspicious))




####### Notes - testing the clustering method if we include
####### those with known poor quality (dropped)

#Changes which need to be made to code:

#Define the "observed sample" using this code:
#dat.final.sam<-dat[dat$PID %in% final.PIDs | (dat$type==1 & dat$completed.baseline==1),]

#Change number of people in final simulated sets to 1491
#Use this code:
#dat.sim.fin<-dat.sim.tmp3[1:min(dim(dat.sim.tmp3)[1],1491),]
#dat.sim.fin$order<-sample(dat.final.sam$PID)[1:min(dim(dat.sim.tmp3)[1],1491)]

#Note that all cluster number will change, and would also need to be updated

#Subject with known repeat enrollments:

#known.repeat.enroller.PIDs<-c(3151,3147,3224,3285,4417,4550,4670,4674,4679,4755,4783,4816,
#4920,4921,5857,3170,3211,3296,3409,3476,3572,3610,4683,4802,4894,4915,5008,5049,5746,3173,
#3176,3192,3193,3195,3202,3205,3208,3210,3219,3322,3332,3414,3483,3578,3598,3601,4140,4307,
#4424,4431,4544,4911,5729)

#Additional cluster found if code is run with the additional 257 subjects
#clust.suspicious3<-clust.memb[clust.memb==52]
#clust.ids3<-as.numeric(labels(clust.suspicious3))
#compare to known repeater
#sum(known.repeat.enroller.PIDs %in% clust.ids3)