rm(list=ls())
######load libraries #####
library(party)
library(ggplot2)
library(cowplot)
library(plyr)

##### Analysis 1A: Skipping (Figure 1) #####
##### load data ----
demo.dat <- read.csv("Kuperman_etal_2016_data.csv")
var.list <- read.csv("Kuperman_etal_2016_predictor_list.csv", stringsAsFactors = F)
ivs <- var.list$predictor
##### setting up control variables ----
# generate random seeds so the results will be consistant
set.seed(1)
s <- runif(2) * 10000000
# The parameter mtry defines the number of randomly sampled predictor variables that are 
# used to select each split point in a tree.
# Here, we use a range of values from square root of the number of predictors to 
# one-third of the number of predictors, in an increment of one
mtrys <- seq(floor(sqrt(length(ivs))), floor(length(ivs)/3))
# ntree is a parameter that defines the number of trees to be built.
# ntree is set to 500 here so the script will run faster
my.ntree <- 500

##### fit the model and calculate variable importance ----

#set up a counter and a list for output
count <- 1
res_list <- list()
#loop through different values of mtry (in this case, 4, 5, and 6)
for(my.mtry in mtrys)
{
  fmla <- paste("IA_SKIPPING", "~", paste(ivs, collapse = " + "))
  set.seed(s[1])
  rf.out <- cforest(formula(fmla), data=demo.dat, control=cforest_unbiased(ntree=my.ntree, mtry=my.mtry))
  set.seed(s[2])
  vi.rf <- varimp(rf.out)
  tmp <- data.frame(mtry=my.mtry, 
                    measures="IA_SKIPPING", 
                    eye_label="SKipping",
                    predictor=names(vi.rf), 
                    value=vi.rf, rank=rank(-vi.rf), stringsAsFactors=FALSE)
  
  rm(rf.out) #remove cforest object to free up memory
  res_list[[count]] <- tmp
  count <- count + 1
}

#rbind the list
res_mat <- do.call("rbind", res_list)

#calculate mean, sd, se and rank of variable importance
res_mean <- ddply(res_mat, ~ measures + eye_label + predictor,  summarise, 
                     mean=mean(value), sd=sd(value), se=sd(value)/sqrt(length(value)))
res_mean$rank <- rank(-res_mean$mean)

##### calculate correlations ----
get.cor <- function(x, y, dat){
  cor(as.numeric(as.character(dat[,x])), as.numeric(as.character(dat[,y])), method="spearman", use = "pairwise")
}
rho <- sapply(ivs, get.cor, "IA_SKIPPING", demo.dat)

res_mean$rho <- rho[res_mean$predictor]
res_mean$signs.rho <- ifelse(sign(res_mean$rho)>0, "+", "-")

imp.dat <- merge(res_mean, var.list, by.x="predictor", by.y="predictor", sort = FALSE)

### generate the "scree" plot in order to visually determine the cut off ----
limits <- aes(xmax = mean + se, xmin= mean - se)
plot.scree <- ggplot(imp.dat, aes(x=mean, y=reorder(name, mean))) + 
  geom_point() + 
  geom_errorbarh(limits, height=0.2) +
  ylab("") + xlab("Variable Importance") + theme_bw()

print(plot.scree)
### Here we set the cutoff to be 7 (i.e., the top 7 predictor will be colored in the heatmap)
cutoff <- 7

# replacing the rank below cutoff with NA, and turning "rank" into a factor
imp.dat$rank[imp.dat$rank > cutoff] <- NA
imp.dat$rank <- factor(imp.dat$rank)

# adding a cutoff border to the plot.scree
plot.scree.border <- plot.scree +
  geom_hline(yintercept = nrow(imp.dat) + .5 - cutoff, color="black")

# setting up the colors to use for heatmap
maxrank <- max(as.numeric(imp.dat$rank), na.rm=T)
colinterval <- seq(from=0,to=1,length.out=7)
my.color <- rgb(red=rev(colinterval), green=0, blue=colinterval)

##### generate the heatmap -----
plot.heatmap <- ggplot(imp.dat, aes(x=eye_label, y=reorder(name, -order))) + 
  geom_tile(aes(fill=rank), color="white") + 
  scale_fill_manual(values=c(my.color)) +
  geom_tile(data = subset(imp.dat,  is.na(rank)), fill = "light gray", color="white", alpha=.5) +
  geom_text(data=subset(imp.dat, !is.na(rank)), aes(label = signs.rho), size=7, color="white") +
  ylab("") + xlab("") +
  scale_x_discrete(expand=c(0,0)) +
  scale_y_discrete(expand=c(0,0)) +
  facet_grid(group ~ ., drop=T,space="free",scales="free") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1),
        axis.line = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        panel.margin = unit(1, "char"),
        panel.border = element_blank(),
        panel.background = element_blank(),
        plot.margin=unit(c(0,0,0,0), "lines"),
        axis.ticks = element_blank(),
        strip.text.x = element_text(size = 12, colour = "black", angle=20, hjust = .5, vjust=.5),
        strip.text.y = element_blank(),
        strip.background = element_rect(size=20)) 

###### combine both the scree and heatmap into one  (Figure 1) -----
png(file="Figure1.png", width=11*300, height=8.5*300, res=300)
plot_grid(plot.scree.border, plot.heatmap, labels = c("A", "B"), rel_widths = c(1.3, 1))
dev.off()

##### Analysis 1A: All dependent variables (Figure 2) #####
### now we load the data that has mean variable importance from all the other eye-movement measures
imp.dat.all <- read.csv("Kuperman_etal_2016_importance.csv", stringsAsFactors = FALSE)
imp.dat.all$rank <- factor(imp.dat.all$rank)

# making sure that DVs are labeled and ordered correctly
imp.dat.all$eye_label <- factor(imp.dat.all$eye_label, levels = c("First Fixation Position", "First Fixation Duration","Gaze Duration","First-pass Regression", "Regression Path Duration", "Total Reading Time","Skipping"))
imp.dat.all$eye.name <- factor(as.numeric(imp.dat.all$eye_label))
levels(imp.dat.all$eye.name) <- c("First fixation position", "First fixation duration","Gaze duration","First-pass regression", "Regression path duration", "Total reading time","Skipping")

# again, setting up the colors to use for heatmap
maxrank <- max(as.numeric(imp.dat.all$rank), na.rm=T)
colinterval <- seq(from=0,to=1,length.out=7)
my.color <- rgb(red=rev(colinterval), green=0, blue=colinterval)

# create a data.frame that categorize eye-movments meausres into one of Skipping, Early and Late
dv.group.nam <- data.frame(measures=c("IA_FIRST_FIXATION_DURATION", "IA_FIRST_RUN_DWELL_TIME",
                                      "IA_DWELL_TIME", "IA_REGRESSION_OUT", 
                                      "IA_REGRESSION_PATH_DURATION", 
                                      "IA_SKIPPING", "FixPos"),
                           dv.group= c("Early", "Early", 
                                       "Late","Late", 
                                       "Late", 
                                       "Skipping","Early"))

dv.group.nam$dv.group <- factor(dv.group.nam$dv.group, levels = c("Skipping","Early", "Late")) 

# combine the imp.dat.all and dv.group.nam
imp.dat.all <- merge(imp.dat.all, dv.group.nam)

###### plot the heatmap (Figure 2) -----

plot.heatmap.all <- ggplot(imp.dat.all, aes(x=eye.name, y=reorder(name, -order))) + 
  geom_tile(aes(fill=rank), color="white") + 
  scale_fill_manual(values=c(my.color)) +
  geom_tile(data = subset(imp.dat.all,  is.na(rank)), fill = "light gray", color="white", alpha=.5) + 
  geom_text(data=subset(imp.dat.all, !is.na(rank)), aes(label = signs.rho), size=7, color="white") +
  ylab("") + xlab("") +
  scale_x_discrete(expand=c(0,0)) +
  scale_y_discrete(expand=c(0,0)) +
  facet_grid(group ~ dv.group, drop=T,space="free",scales="free") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1),
        axis.line = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        panel.margin = unit(1, "char"),
        panel.border = element_blank(),
        panel.background = element_blank(),
        plot.margin=unit(c(0,0,0,0), "lines"),
        axis.ticks = element_blank(),
        strip.text.x = element_text(size = 12, colour = "black", angle=20, hjust = .5, vjust=.5),
        strip.text.y = element_blank(),
        strip.background = element_rect(size=20))

ggsave(file="Figure2.png", plot=plot.heatmap.all, width=11, height=8.5)