rm(list=ls()) ######load libraries ##### library(party) library(ggplot2) library(cowplot) library(plyr) ##### Analysis 1A: Skipping (Figure 1) ##### ##### load data ---- demo.dat <- read.csv("Kuperman_etal_2016_data.csv") var.list <- read.csv("Kuperman_etal_2016_predictor_list.csv", stringsAsFactors = F) ivs <- var.list$predictor ##### setting up control variables ---- # generate random seeds so the results will be consistant set.seed(1) s <- runif(2) * 10000000 # The parameter mtry defines the number of randomly sampled predictor variables that are # used to select each split point in a tree. # Here, we use a range of values from square root of the number of predictors to # one-third of the number of predictors, in an increment of one mtrys <- seq(floor(sqrt(length(ivs))), floor(length(ivs)/3)) # ntree is a parameter that defines the number of trees to be built. # ntree is set to 500 here so the script will run faster my.ntree <- 500 ##### fit the model and calculate variable importance ---- #set up a counter and a list for output count <- 1 res_list <- list() #loop through different values of mtry (in this case, 4, 5, and 6) for(my.mtry in mtrys) { fmla <- paste("IA_SKIPPING", "~", paste(ivs, collapse = " + ")) set.seed(s[1]) rf.out <- cforest(formula(fmla), data=demo.dat, control=cforest_unbiased(ntree=my.ntree, mtry=my.mtry)) set.seed(s[2]) vi.rf <- varimp(rf.out) tmp <- data.frame(mtry=my.mtry, measures="IA_SKIPPING", eye_label="SKipping", predictor=names(vi.rf), value=vi.rf, rank=rank(-vi.rf), stringsAsFactors=FALSE) rm(rf.out) #remove cforest object to free up memory res_list[[count]] <- tmp count <- count + 1 } #rbind the list res_mat <- do.call("rbind", res_list) #calculate mean, sd, se and rank of variable importance res_mean <- ddply(res_mat, ~ measures + eye_label + predictor, summarise, mean=mean(value), sd=sd(value), se=sd(value)/sqrt(length(value))) res_mean$rank <- rank(-res_mean$mean) ##### calculate correlations ---- get.cor <- function(x, y, dat){ cor(as.numeric(as.character(dat[,x])), as.numeric(as.character(dat[,y])), method="spearman", use = "pairwise") } rho <- sapply(ivs, get.cor, "IA_SKIPPING", demo.dat) res_mean$rho <- rho[res_mean$predictor] res_mean$signs.rho <- ifelse(sign(res_mean$rho)>0, "+", "-") imp.dat <- merge(res_mean, var.list, by.x="predictor", by.y="predictor", sort = FALSE) ### generate the "scree" plot in order to visually determine the cut off ---- limits <- aes(xmax = mean + se, xmin= mean - se) plot.scree <- ggplot(imp.dat, aes(x=mean, y=reorder(name, mean))) + geom_point() + geom_errorbarh(limits, height=0.2) + ylab("") + xlab("Variable Importance") + theme_bw() print(plot.scree) ### Here we set the cutoff to be 7 (i.e., the top 7 predictor will be colored in the heatmap) cutoff <- 7 # replacing the rank below cutoff with NA, and turning "rank" into a factor imp.dat$rank[imp.dat$rank > cutoff] <- NA imp.dat$rank <- factor(imp.dat$rank) # adding a cutoff border to the plot.scree plot.scree.border <- plot.scree + geom_hline(yintercept = nrow(imp.dat) + .5 - cutoff, color="black") # setting up the colors to use for heatmap maxrank <- max(as.numeric(imp.dat$rank), na.rm=T) colinterval <- seq(from=0,to=1,length.out=7) my.color <- rgb(red=rev(colinterval), green=0, blue=colinterval) ##### generate the heatmap ----- plot.heatmap <- ggplot(imp.dat, aes(x=eye_label, y=reorder(name, -order))) + geom_tile(aes(fill=rank), color="white") + scale_fill_manual(values=c(my.color)) + geom_tile(data = subset(imp.dat, is.na(rank)), fill = "light gray", color="white", alpha=.5) + geom_text(data=subset(imp.dat, !is.na(rank)), aes(label = signs.rho), size=7, color="white") + ylab("") + xlab("") + scale_x_discrete(expand=c(0,0)) + scale_y_discrete(expand=c(0,0)) + facet_grid(group ~ ., drop=T,space="free",scales="free") + theme(axis.text.x = element_text(angle = 20, hjust = 1), axis.line = element_blank(), panel.border = element_blank(), panel.background = element_blank(), panel.margin = unit(1, "char"), panel.border = element_blank(), panel.background = element_blank(), plot.margin=unit(c(0,0,0,0), "lines"), axis.ticks = element_blank(), strip.text.x = element_text(size = 12, colour = "black", angle=20, hjust = .5, vjust=.5), strip.text.y = element_blank(), strip.background = element_rect(size=20)) ###### combine both the scree and heatmap into one (Figure 1) ----- png(file="Figure1.png", width=11*300, height=8.5*300, res=300) plot_grid(plot.scree.border, plot.heatmap, labels = c("A", "B"), rel_widths = c(1.3, 1)) dev.off() ##### Analysis 1A: All dependent variables (Figure 2) ##### ### now we load the data that has mean variable importance from all the other eye-movement measures imp.dat.all <- read.csv("Kuperman_etal_2016_importance.csv", stringsAsFactors = FALSE) imp.dat.all$rank <- factor(imp.dat.all$rank) # making sure that DVs are labeled and ordered correctly imp.dat.all$eye_label <- factor(imp.dat.all$eye_label, levels = c("First Fixation Position", "First Fixation Duration","Gaze Duration","First-pass Regression", "Regression Path Duration", "Total Reading Time","Skipping")) imp.dat.all$eye.name <- factor(as.numeric(imp.dat.all$eye_label)) levels(imp.dat.all$eye.name) <- c("First fixation position", "First fixation duration","Gaze duration","First-pass regression", "Regression path duration", "Total reading time","Skipping") # again, setting up the colors to use for heatmap maxrank <- max(as.numeric(imp.dat.all$rank), na.rm=T) colinterval <- seq(from=0,to=1,length.out=7) my.color <- rgb(red=rev(colinterval), green=0, blue=colinterval) # create a data.frame that categorize eye-movments meausres into one of Skipping, Early and Late dv.group.nam <- data.frame(measures=c("IA_FIRST_FIXATION_DURATION", "IA_FIRST_RUN_DWELL_TIME", "IA_DWELL_TIME", "IA_REGRESSION_OUT", "IA_REGRESSION_PATH_DURATION", "IA_SKIPPING", "FixPos"), dv.group= c("Early", "Early", "Late","Late", "Late", "Skipping","Early")) dv.group.nam$dv.group <- factor(dv.group.nam$dv.group, levels = c("Skipping","Early", "Late")) # combine the imp.dat.all and dv.group.nam imp.dat.all <- merge(imp.dat.all, dv.group.nam) ###### plot the heatmap (Figure 2) ----- plot.heatmap.all <- ggplot(imp.dat.all, aes(x=eye.name, y=reorder(name, -order))) + geom_tile(aes(fill=rank), color="white") + scale_fill_manual(values=c(my.color)) + geom_tile(data = subset(imp.dat.all, is.na(rank)), fill = "light gray", color="white", alpha=.5) + geom_text(data=subset(imp.dat.all, !is.na(rank)), aes(label = signs.rho), size=7, color="white") + ylab("") + xlab("") + scale_x_discrete(expand=c(0,0)) + scale_y_discrete(expand=c(0,0)) + facet_grid(group ~ dv.group, drop=T,space="free",scales="free") + theme(axis.text.x = element_text(angle = 20, hjust = 1), axis.line = element_blank(), panel.border = element_blank(), panel.background = element_blank(), panel.margin = unit(1, "char"), panel.border = element_blank(), panel.background = element_blank(), plot.margin=unit(c(0,0,0,0), "lines"), axis.ticks = element_blank(), strip.text.x = element_text(size = 12, colour = "black", angle=20, hjust = .5, vjust=.5), strip.text.y = element_blank(), strip.background = element_rect(size=20)) ggsave(file="Figure2.png", plot=plot.heatmap.all, width=11, height=8.5)