##############################################################
# Program to test method of identifying unusual clusters in ##
# survey data starting with simulated clustered data        ##
# Author: Elizabeth Handorf
# Fox Chase Cancer Center
# Email: elizabeth.handorf@fccc.edu
# Supplementary material for PLOS-One manuscript
# "A hierarchical clustering approach to identify repeated enrollments in web survey data" 


library(cluster)
library(fpc)
library(mvtnorm)
library(GenOrd)
library(grDevices)
library(Matrix)



start_time <- Sys.time()

setwd('My Directory')

load("S1_File.RData")

#simulation parameters

#Some code to extract the inputs from the command line, modified from
#http://yangfeng.wordpress.com/2009/09/03/including-arguments-in-r-cmd-batch-mode/
##First read in the arguments listed at the command line
args=(commandArgs(TRUE))

##args is now a list of character vectors
## First check to see if arguments are passed.
## Then cycle through each element of the list and evaluate the expressions.
if(length(args)==0){
  print("No arguments supplied.")
  ##supply default values
  
  seed <- 9999999
  #autocorrelation of off-block-diagonals in clusters
  rho<- 0.8
  threshold<-0.95
}else{
  for(i in 1:length(args)){
    eval(parse(text=args[[i]]))
  }
}


#m=number of simulated "null" datasets to use 
m<-1000

set.seed(seed)

#number in the cluster 
n.tot=5000
n.clst1=70
n.unclst=n.tot - n.clst1 


#Unclustered indivudals - simulate as usual
simdat.unclst<-ordsample(n.unclst, marg.unclst, cor.unclst, Spearman=TRUE)

#Now generate MV Norm samples for the cluster

#Sigma - block matrix, 
#Block diagonal = cor.clst1
#Off-diagnoal blocks= cor.clst1*rho
offdiag<-cor.clst1*rho

Sigma.clst1.offdiagtmp<-do.call("rbind", rep(list(offdiag), 70))
Sigma.clst1.offdiag<-do.call("cbind", rep(list(Sigma.clst1.offdiagtmp), 70))
Sigma.clst1.diag<-as.matrix(.bdiag(rep(list(cor.clst1),n.clst1)))


Sigma.clst1<-Sigma.clst1.diag*(1-rho)+Sigma.clst1.offdiag

#generate all observations at once using the full correlation matrix,
#then split into appropriate rows/columns
simdat.clst1.tmp<-matrix(rmvnorm(1,sigma=Sigma.clst1),ncol=9)

#Apply probability integral transformation to get ordinal values
simdat.clst1<-pnorm(simdat.clst1.tmp)
#cut at marginal cdfs
for(i in 1:9) {
  simdat.clst1[,i]<-cut(simdat.clst1[,i],breaks=c(0,marg.clst1[[i]],1),labels=FALSE)
}

simdat.comb<-rbind(simdat.clst1,simdat.unclst)

simdat.comb<-data.frame(simdat.comb)
names(simdat.comb)<-c("burneasily","darktan","freckles","moles","sunburn","age","climate","haircolor","skincolor")

#indicator for which group they are a member of
simdat.comb$cluster.number<-c(rep(1,n.clst1),rep(0,n.unclst))

#Screener score
simdat.comb$score<-3*(simdat.comb[,1]==2)+
  3*(simdat.comb[,2]==1)+2*(simdat.comb[,2]==2)+1*(simdat.comb[,2]==3)+
  2*(simdat.comb[,3]==2)+4*(simdat.comb[,3]==3)+
  5*(simdat.comb[,4]==2)+10*(simdat.comb[,4]==3)+20*(simdat.comb[,4]==4)+30*(simdat.comb[,4]==5)+
  1*(simdat.comb[,5]==2)+2*(simdat.comb[,5]==3)+3*(simdat.comb[,5]==4)+4*(simdat.comb[,5]==5)+
  5*(simdat.comb[,7]==2)+10*(simdat.comb[,7]==3)+
  4*(simdat.comb[,8]==1)+3*(simdat.comb[,8]==2)+2*(simdat.comb[,8]==3)+1*(simdat.comb[,8]==4)+
  20*(simdat.comb[,9]==1)+18*(simdat.comb[,9]==2)+16*(simdat.comb[,9]==3)+4*(simdat.comb[,9]==4)+2*(simdat.comb[,9]==5)



#simulate enrollment order, grouping cluster 1 in time

simdat.comb$enrolltime<-c(runif(n.clst1,min=0.9,max=0.905), #equivalent to enrolling over 2 days during a 365 day enrollment period
                          runif(n.unclst,min=0, max=1))

simdat.comb<-simdat.comb[order(simdat.comb$enrolltime),]
simdat.comb$enrollorder<-rep(1:n.tot)
simdat.comb$age<-simdat.comb$age+17 #rescale to actual age


#Probability of completing based on glm model from original data
r.bin<-function(p) {rbinom(1, 1, p)}

pr.complete<-predict(m.included,newdata=simdat.comb,type="response")

simdat.comb$completed<-sapply(pr.complete,r.bin)


#Final dataset with eligible people who completed baseline
simdat.final.sam<-simdat.comb[simdat.comb$score>=27 & simdat.comb$completed==1,]


#How many from each cluster wound up in the final dataset?
table(simdat.final.sam$cluster.number)


###############################################################
#### Step 1 ###################################################
#### Run standard clustering on the simulated dataset ############


dat2<-simdat.final.sam[,c(1:9,13)]

#Euclidean distance
#Normalize to have unit variance
SDs<-apply(dat2,2,sd)

vec<-as.vector(dat2[1,])

dat3<-    as.matrix(dat2) %*% diag(1/SDs)

rownames(dat3)<-simdat.final.sam[,13]
dist.obj<-dist(dat3)

clust.obj.true.data<-hclust(dist.obj)

#function to determine cluster properties at a given height threshold

get.clusters<-function(clust.obj, height) {
  
  memb <- cutree(clust.obj, h = height)
  
  #This generates a matrix with the following elements: row number=order of clusters (L to R)
  #clst.cnt=total number of subjects in that cluster
  #id1=cluster number (from cutree)
  #idLR = cluster number from left to right in plot
  
  clst.cnt<-table(memb[clust.obj$order])
  id1<-unique(memb[clust.obj$order])
  summary.clust<-cluster.stats(dist.obj,memb)
  tmp<-cbind(idLR=c(1:length(id1)),id1,clst.size=clst.cnt[id1])
  
  #Now determine which clusters have the smallest within-cluster distance
  tmp2<-cbind(tmp[order(tmp[,2]),], avg.dist=summary.clust$average.distance,
              size=summary.clust$cluster.size,
              med.dist=summary.clust$median.distance,max.dist=summary.clust$diameter,
              sill = summary.clust$clus.avg.silwidths)
  
  tmp2<-tmp2[order(tmp2[,4]),]
  tmp2<-data.frame(tmp2)
  
  tmp2$comb<-tmp2$sill/max(tmp2$sill)+
    tmp2$size/max(tmp2$size)
  
  return(tmp2)
}

#run with varying hieghts
clust.true.4.5<-get.clusters(clust.obj = clust.obj.true.data, height=4.5)
clust.true.5.0<-get.clusters(clust.obj = clust.obj.true.data, height=5.0)
clust.true.5.5<-get.clusters(clust.obj = clust.obj.true.data, height=5.5)

#Select the height with the fewest negative sil widths
prop.neg.sill<-c(mean(clust.true.4.5$sill<0),
  mean(clust.true.5.0$sill<0),  
  mean(clust.true.5.5$sill<0))
#Note: if they are equal, choose the highest threshold (resulting in the smallest number of clusters)

if (prop.neg.sill[3] == min(prop.neg.sill) ) {
  clust.true.sim<-clust.true.5.5
  clust.height<-5.5
}else if (prop.neg.sill[2] == min(prop.neg.sill) ) {
  clust.true.sim<-clust.true.5.0
  clust.height<-5.0
}else if (prop.neg.sill[1] == min(prop.neg.sill) ) {
  clust.true.sim<-clust.true.4.5
  clust.height<-4.5
}


#save the number of observations in the "true" simulated data
n.final.sam<-dim(simdat.final.sam)[1]

###############################################################
#### Step 2 ###################################################
#### Find multivariate distirbution of clustering vars ########
#### (starting with the full set) #############################


#ineligible?
sum(simdat.comb$type==3 & simdat.comb$score>=27)

#didn't complete baseline?
sum(simdat.comb$type==2 & simdat.comb$completed.baseline==0)

simdat.comb.good.score<-simdat.comb[simdat.comb$score>=27,]

simdat.comb.good.score$burneasily<-simdat.comb.good.score$burneasily

#model the probability of entering the study IF they had a valid score
m.included<-glm(completed ~ as.factor(burneasily) + as.factor(darktan) +
                  as.factor(freckles)+as.factor(moles)+as.factor(sunburn)+as.factor(age)+
                  as.factor(climate)+as.factor(haircolor)+as.factor(skincolor)
                ,data=simdat.comb.good.score, family="binomial")

#Simulation parameters

#Correlation 
cor.matrix.obs<-cor(simdat.comb[,2:10],method="spearman")


#proportions responding in each likert item
t1<-prop.table(table(simdat.comb$burneasily))
t2<-prop.table(table(simdat.comb$darktan))
t3<-prop.table(table(simdat.comb$freckles))
t4<-prop.table(table(simdat.comb$moles))
t5<-prop.table(table(simdat.comb$sunburn))
t6<-prop.table(table(simdat.comb$age))
t7<-prop.table(table(simdat.comb$climate))
t8<-prop.table(table(simdat.comb$haircolor))
t9<-prop.table(table(simdat.comb$skincolor))


#Cumulative proportions
t1.cum<-cumsum(as.vector(t1))
t2.cum<-cumsum(as.vector(t2))
t3.cum<-cumsum(as.vector(t3))
t4.cum<-cumsum(as.vector(t4))
t5.cum<-cumsum(as.vector(t5))
t6.cum<-cumsum(as.vector(t6))
t7.cum<-cumsum(as.vector(t7))
t8.cum<-cumsum(as.vector(t8))
t9.cum<-cumsum(as.vector(t9))


#Transform to quantiles of the normal distribution
v1.cutoffs<-c(-Inf,qnorm(t1.cum))
v2.cutoffs<-c(-Inf,qnorm(t2.cum))
v3.cutoffs<-c(-Inf,qnorm(t3.cum))
v4.cutoffs<-c(-Inf,qnorm(t4.cum))
v5.cutoffs<-c(-Inf,qnorm(t5.cum))
v6.cutoffs<-c(-Inf,qnorm(t6.cum))
v7.cutoffs<-c(-Inf,qnorm(t7.cum))
v8.cutoffs<-c(-Inf,qnorm(t8.cum))
v9.cutoffs<-c(-Inf,qnorm(t9.cum))


#Correct the correlation matrix given the observed marginals
marginal<-list(
  t1.cum[-length(t1.cum)],
  t2.cum[-length(t2.cum)],
  t3.cum[-length(t3.cum)],
  t4.cum[-length(t4.cum)],
  t5.cum[-length(t5.cum)],
  t6.cum[-length(t6.cum)],
  t7.cum[-length(t7.cum)],
  t8.cum[-length(t8.cum)],
  t9.cum[-length(t9.cum)]
)
cor.matrix.adjusted<-contord(marginal, cor.matrix.obs)


###############################################################
#### Step 3-4 #################################################
#### Simulate M sets of N samples under independence ##########

#function to draw from bernoulli with probability p
r.bin<-function(p) {rbinom(1, 1, p)}

fun.sim.good.scores<-function() {

  dat.sim.tmp<-ordsample(6500, marginal, cor.matrix.obs, Spearman=TRUE)
  
  dat.sim.tmp[,6]<-dat.sim.tmp[,6]+17
  #only valid screener scores
  score<-3*(dat.sim.tmp[,1]==2)+
    3*(dat.sim.tmp[,2]==1)+2*(dat.sim.tmp[,2]==2)+1*(dat.sim.tmp[,2]==3)+
    2*(dat.sim.tmp[,3]==2)+4*(dat.sim.tmp[,3]==3)+
    5*(dat.sim.tmp[,4]==2)+10*(dat.sim.tmp[,4]==3)+20*(dat.sim.tmp[,4]==4)+30*(dat.sim.tmp[,4]==5)+
    1*(dat.sim.tmp[,5]==2)+2*(dat.sim.tmp[,5]==3)+3*(dat.sim.tmp[,5]==4)+4*(dat.sim.tmp[,5]==5)+
    5*(dat.sim.tmp[,7]==2)+10*(dat.sim.tmp[,7]==3)+
    4*(dat.sim.tmp[,8]==1)+3*(dat.sim.tmp[,8]==2)+2*(dat.sim.tmp[,8]==3)+1*(dat.sim.tmp[,8]==4)+
    20*(dat.sim.tmp[,9]==1)+18*(dat.sim.tmp[,9]==2)+16*(dat.sim.tmp[,9]==3)+4*(dat.sim.tmp[,9]==4)+2*(dat.sim.tmp[,9]==5)
  
  
  dat.sim.tmp2<-as.data.frame(dat.sim.tmp[score>=27,])
  
  names(dat.sim.tmp2)<-c("burneasily","darktan","freckles","moles",   
                         "sunburn",
                         "age",
                         "climate","haircolor","skincolor")  
  
  #Probability of completing based on glm model
  pr.complete<-predict(m.included,newdata=dat.sim.tmp2,type="response")
  
  sim.completed<-sapply(pr.complete,r.bin)
  
  #Select observations based on random probability of completing
  dat.sim.tmp3<-dat.sim.tmp2[sim.completed==1,]
  
  #Make a final dataset with the same number of observations as the "true" dataset (includ error catching if total N is smaller than 1234)
  dat.sim.fin<-dat.sim.tmp3[1:min(dim(dat.sim.tmp3)[1],n.final.sam),]
  dat.sim.fin$order<-sample(simdat.final.sam$enrollorder)[1:min(dim(dat.sim.tmp3)[1],n.final.sam)]
  
  return(dat.sim.fin)
  
}





i<-1
simulated.data.ind<-fun.sim.good.scores()
simulated.data.ind$rep<-rep(i,dim(simulated.data.ind)[1])

many.sims<-simulated.data.ind
for (i in 2: m) {
  simulated.data.ind<-fun.sim.good.scores()
  simulated.data.ind$rep<-rep(i,dim(simulated.data.ind)[1])
  many.sims<-rbind(many.sims,simulated.data.ind)
}

###############################################################
#### Step 5 ###################################################
#### Hierarchical clustering applied to all simulated datasets

#Maximum size and sil width from the clusters identified in the true data
true.max.size<-max(clust.true.sim$size)
true.max.sill<-max(clust.true.sim$sill)



max.size<-max.sill<-max.comb<-rep(NA,m)



#Run over all simulated datasets
for (i in 1: m) {
  dat.sim<-many.sims[many.sims$rep==i,c(10,1:9)]
  SDs<-apply(dat.sim,2,sd)
  vec<-as.vector(dat.sim[1,])
  dat.sim.a<-    as.matrix(dat.sim) %*% diag(1/SDs)
  rownames(dat.sim.a)<-simdat.final.sam[,1]
  dist.obj<-dist(dat.sim.a)
  
  clust.obj.sim<-hclust(dist.obj)
  
  clust.sim.cut<-get.clusters(clust.obj = clust.obj.sim, height=4.5)
  
  clust.sim.cut$comb<-clust.sim.cut$sill/true.max.sill + clust.sim.cut$size/true.max.size 
  
  max.size[i]<-max(clust.sim.cut$size)
  max.sill[i]<-max(clust.sim.cut$sill)
  max.comb[i]<-max(clust.sim.cut$comb)
  
}


end_time <- Sys.time()

end_time - start_time


###############################################################
#### Step 6 ###################################################
#### Compare simulated results to those from the true data ####

#For each of the clusters in the true data - what proportion of the 
#max simulated combined scores are greater than the observed score?

clust.true.sim$comb.pval<-rep(NA,dim(clust.true.sim)[1])
for ( i in 1: dim(clust.true.sim)[1]) {
  clust.true.sim$comb.pval[i]<-sum(clust.true.sim$comb[i]>max.comb)/m
}

clust.true.sim

##which clusters were identified
clust.chosen<-clust.true.sim$id1[clust.true.sim$comb.pval>=threshold]


# Identify PIDs in the cluster of interest, compare with "true" clustered data 
clust.memb <- cutree(clust.obj.true.data, h = clust.height)


#Note that clust.memb=id1 from the results table- this is the cluster id 
#and is different form "idLR" which is used to identify clusters in the plotted dendrogram
clust.suspicious<-clust.memb[clust.memb %in% clust.chosen]

clust.ids<-as.numeric(labels(clust.suspicious))

#how many are really in the cluster?
true.n.clst<-  sum(simdat.final.sam$cluster.number)


if (length(clust.ids)>0) {

  results<-data.frame(clust.ids, identified=1)


  #merge back in with individual data to summarize results
  simdat.final.sam.results<-merge(simdat.final.sam,results, by.x="enrollorder", by.y="clust.ids", all=T)
  simdat.final.sam.results$identified[is.na(simdat.final.sam.results$identified)]<-0

  summary.table<-table(simdat.final.sam.results$cluster.number,simdat.final.sam.results$identified)
  summary.table


  
 #Results of this simulation
 n.clusters.selected<-length(clust.chosen)
 prop.TP<- (summary.table[2,2])/sum(simdat.final.sam.results$cluster.number ==1)
 
 prop.TN<- summary.table[1,1]/sum(simdat.final.sam.results$cluster.number ==0)
 size.c1<-sum(simdat.final.sam.results$cluster.number ==1)
 res<-data.frame(seed, rho, n.clusters.selected, n.final.sam, true.n.clst, size.c1, prop.TP,prop.TN)
}else {
  res<-data.frame(seed, rho, 0, n.final.sam,true.n.clst, 0, 0 , 1)}



setwd('My results directory')

filename<-paste("tempres", rho, "_", threshold, "_", seed, ".csv", sep="")

write.table(res,filename, col.names =FALSE,append=TRUE, sep = ",", row.names = FALSE)


end_time <- Sys.time()

end_time - start_time