#################################
## File 00_load_data.R ##########
#################################
setwd("~/wp-survey/")

library(foreign)

## load december 2008 pew data
##################################################################
# http://pewinternet.org/Data-Tools/Explore-Survey-Questions/Roper-Center.aspx?t=&sdate=mm/dd/yy&edate=mm/dd/yy&k=wikipedia
#
# Also, download the questionnaire file:
# http://www.pewinternet.org/~/media/Files/Data%20Sets/2008/December_2008_Health_Questionnaire.doc

pew.full <- read.spss("data/Pew_data/Pew-December_2008_Health.sav",
                      use.value.labels=TRUE,
                      trim_values=TRUE,
                      use.missings=TRUE,
                      to.data.frame=TRUE)

# recode pew.full data into forms that are more comperable
pew.full$female <- pew.full$sex == "Female"
pew.full$married <- pew.full$mar == "Married" | pew.full$mar == "Separated"
pew.full$children <- pew.full$par == "Yes"
pew.full$immigrant <- pew.full$imm1 != "In the U.S. (including Puerto Rico or other U.S. territory)"

pew.full$edu4 <- NA
pew.full$edu4[pew.full$educ == "None, or grades 1-8" |
       pew.full$educ == "High school incomplete (grades 9-11)"] <- "primary school"
pew.full$edu4[pew.full$educ == "High school graduate (grade 12 or GED certificate)" |
       pew.full$educ == "Technical, trade or vocational school AFTER high school"] <-
  "high school"
pew.full$edu4[pew.full$educ == "Some college, no 4-year degree (includes associate degree)" |
       pew.full$educ == "College graduate (B.S., B.A., or other 4-year degree)"] <-
  "college"
pew.full$edu4[pew.full$educ == "Post-graduate training/professional school after college (toward a Masters/Ph.D., Law or Medical school)"] <- "grad school"
pew.full$edu4[pew.full$educ == "Don't know/Refused"] <- NA
pew.full$edu4 <- ordered(pew.full$edu4, c("primary school", "high school", "college", "grad school"))

# create student dich
pew.full$student <- pew.full$stud %in% c("Yes, full-time", "Yes, part-time")

# These two questions allow us to select the subset of Wikipedia
# editors:
# 
# act103a (asked to respondents who use the Internet but did not do so
# yesterday): Do you ever use the internet to -- Look for information
# on Wikipedia?
#
# act103b (asked to respondents who use the Internet and did so
# yesterday): Do you ever use the internet to -- Look for information
# on Wikipedia -- Did you happen to do this YESTERDAY, or not

# restrict to the dataset of people who answered either question
pew <- pew.full[!(is.na(pew.full$act103a) & is.na(pew.full$act103b)),]

# test number of positive readers
table(pew$act103a == "Yes, do this")
table(pew$act103b == "Yes, did this yesterday")
table(pew$act103b == "Yes, do this (but NOT yesterday)")

# restrict dataset to just people who have "looked for information on
# wikipedia" (we will compare them to "readers" in the unm data)
pew <- pew[(!is.na(pew$act103a) & pew$act103a == "Yes, do this") |
           (!is.na(pew$act103b) &
            (pew$act103b == "Yes, did this yesterday" |
             pew$act103b == "Yes, do this (but NOT yesterday)")),]

# print the total number of people in our sample of wikipedia readers
dim(pew)

## load december 2008 WMF/UNU-MERIT data
##################################################################
#
# To get this data, you must email either the WMF or UNU-MERIT
# researchers for an anonymized dataset.

unm <- read.spss("data/wikipedia_survey_data.sav",
                 use.value.labels=TRUE,
                 trim_values=TRUE,
                 use.missings=TRUE,
                 to.data.frame=TRUE)

## CREATE WP SUBSET
# clean up the data
unm$A3_nationality[unm$A3_nationality == "no answer given"] <- NA
unm$A4_Residence[unm$A4_Residence == "no answer given"] <- NA

# nationality is resident?
unm$immigrant <- (!(is.na(unm$A3_nationality) & is.na(unm$A4_Residence))) &
  (as.character(unm$A3_nationality) != as.character(unm$A4_Residence))

# recode female, age, and married, children variables
unm$female <- unm$A1 == "female"

unm$age.group <- NA
unm$age.group[unm$A2 <= 12] <- "3-12"
unm$age.group[unm$A2 >= 13 & unm$A2 <= 17] <- "13-17"
unm$age.group[unm$A2 >= 18 & unm$A2 <= 34] <- "18-34"
unm$age.group[unm$A2 >= 35 & unm$A2 <= 49] <- "35-49"
unm$age.group[unm$A2 >= 50] <- "50+"
unm$age.group <- as.factor(unm$age.group)
unm$age <- unm$A2

unm$married <- NA
unm$married[unm$A9 == 1] <- TRUE
unm$married[unm$A9 == 2] <- FALSE

unm$children <- NA
unm$children[unm$A10 == 1] <- TRUE
unm$children[unm$A10 == 2] <- FALSE

# do a three way education split to compare to quantcast
unm$edu3[unm$A7 == 1 | unm$A7 == 2] <- "no college"
unm$edu3[unm$A7 == 3] <- "college"
unm$edu3[unm$A7 == 4 | unm$A7 == 5] <- "grad school"
unm$edu3 <- as.factor(unm$edu)

# NOTE the UNU merit survey didn't allow you to select less than
# primary school completed!
unm$edu4 <- as.character(unm$edu3)
unm$edu4[unm$A7 == 1] <- "primary school"
unm$edu4[unm$A7 == 2] <- "high school"
unm$edu4 <- ordered(unm$edu4,
                    c("primary school", "high school", "college", "grad school"))

# student dichtomy
unm$student <- unm$A11 == 6

# save an intermediary file with a combined dataset
save(unm, pew, pew.full, file="data/combined_datasets.RData")

#################################
## File 01_analysis.R ###########
#################################
setwd("~/wp-survey/")

library(ggplot2)
library(descr)
library(dummies)
library(survey) # for applying weights!

# load a file if a variable is missing
load.if.missing <- function (var.name, file.name) {
  if (!exists(var.name)) {
    load(file.name, parent.frame())

    # check to see if we're dealing with a data.table because, if we
    # are, we need to do some nasty back and forth
    if (class(eval(as.name(var.name)))[1] == "data.table") {

      # gnarly function that loads resorts things within the parent
      # frame to get around the bug in data.table
      assign(var.name,
             data.table(as.data.frame(eval(as.name(var.name))),
                        key=attr(eval(as.name(var.name)), "sorted")),
             parent.frame())
    }
  }
}

## load the combined dataset
##################################################################
load.if.missing("unm", "data/combined_datasets.RData")

## NEW ANALYSIS
##########################################################################

# create an appropriate subset of wp data
unm.readers <- unm[unm$activity_int == "readers",]
unm.editors <- unm[unm$activity_int == "contributors",]

unm.readers.us <- unm.readers[(!is.na(unm.readers$A4_Residence)) &
                              unm.readers$A4_Residence == "US",]
unm.editors.us <- unm.editors[(!is.na(unm.editors$A4_Residence)) &
                              unm.editors$A4_Residence == "US",]
 
# QS1. Are you under 18 years old, OR are you 18 or older?
# PEW ONLY CONTAINS 18
unm.readers.us <- unm.readers.us[unm.readers.us$age >= 18,]

# Note - this means that we've subset the WP data as well.
variables.of.interest <- c("age", "female", "married", "children",
                           "immigrant", "student", "edu4")

u.sub <- unm.readers.us[,variables.of.interest]
p.sub <- pew[,c(variables.of.interest, "weight")]

table(complete.cases(u.sub))
table(complete.cases(p.sub))

# Here's where we set the weights to scale
#
# The UNU weights are kept at 1 and the Pew weights are multiplied by
# 102138 (scaling roughly to 230 million) in order to scale those
# responses to the full reference population of US
# adults. Consequently, the regression models the likelihood of
# volunteering for the UNU survey given the distribution of the
# (weighted) covariates available from both studies.

d <- rbind(cbind(u.sub, data.frame(survey=TRUE, weight=1)),
           cbind(p.sub[,!colnames(p.sub) == "weight"],
                 data.frame(survey=FALSE, weight=102138*p.sub$weight)))

d <- d[complete.cases(d),]

dummify.edu4 <- function (x) {
  library(dummies)
  x.dummies <- as.data.frame(dummy(x$edu4)[,2:length(levels(x$edu4))])
  colnames(x.dummies) <- paste("edu", levels(x$edu4)[2:length(levels(x$edu4))],
                               sep=".")
  colnames(x.dummies) <- gsub(" ", ".", colnames(x.dummies))

  cbind(x[,!colnames(x) == "edu4"], x.dummies)
}

d <- dummify.edu4(d)


# original model without weights
m <- glm(survey ~ age + female + married + children + immigrant +
         student + edu.high.school + edu.college + edu.grad.school,
         family=binomial("logit"), data=d)
summary(m)

m <- svyglm(survey ~ age + female + married + children + immigrant +
         student + edu.high.school + edu.college + edu.grad.school,
         family=binomial("logit"), design=d)
summary(m)


# model with weights following Valliant & Devers (2011)
d.svy <- svydesign(ids=~1, data=d, weights=d$weight)

m.svy <- svyglm(survey ~ age + female + married + children + immigrant
                + student + edu.high.school + edu.college +
                edu.grad.school, family=binomial("logit"), design=d.svy)
summary(m.svy)

gen.weighted.ed.sample <- function (x, m) {
  logistic <- function (x) {
    1/(1+exp(-1 * x))
  }

  x <- x[,variables.of.interest]
  x <- x[complete.cases(x),]
  x <- dummify.edu4(x)

  x$pred.prob <- logistic(apply(cbind(intercept=1, x),
                                1, function (y)
                                {sum(y*coef(m))}))
  x$pscore.weight <- 1/x$pred.prob
  return(x)
}

ed.us <- gen.weighted.ed.sample(unm.editors.us, m.svy)
ed.all <- gen.weighted.ed.sample(unm.editors, m.svy)

compare.freq <- function (x, w) {
  options(descr.plot=FALSE)
  return(list(orig=freq(x), adj=freq(x,w)))
}

print.compare.freq <- function (x,w) {
  print(compare.freq(x,w))
}

#generate the new weights derived from the propensity scores
print.compare.freq(ed.us$female, ed.us$pscore.weight)
print.compare.freq(ed.us$married, ed.us$pscore.weight)
print.compare.freq(ed.us$immigrant, ed.us$pscore.weight)
print.compare.freq(ed.us$edu.college, ed.us$pscore.weight)

print.compare.freq(ed.all$female, ed.all$pscore.weight)
print.compare.freq(ed.all$married, ed.all$pscore.weight)
print.compare.freq(ed.all$immigrant, ed.all$pscore.weight)
print.compare.freq(ed.all$edu.college, ed.all$pscore.weight)

## print tables
##########################################################################

## comparisons between wp survey and pew
# WP survey contains all respondents 18 & older
# Pew survey contains all respondents who use WP
##################################################

build.adjusted.rows <- function (x) {
  data.frame(var.name=x,
    pew.prob.us=prop.table(table(pew[,x]))["TRUE"],
    unm.prop.us=prop.table(table(unm.readers.us[,x]))["TRUE"],
#    odds.ratio=exp(coef(m.svy)[paste(x, "TRUE", sep="")]),
    editors.prop.us=prop.table(table(unm.editors.us[,x]))["TRUE"],
    editors.scaled.us=compare.freq(ed.us[,x],
    ed.us$pscore.weight)[["adj"]]["TRUE","Percent"]*0.01,
      editors.prob.raw=prop.table(table(unm.editors[,x]))["TRUE"],
    editors.scaled=compare.freq(ed.all[,x],
      ed.all$pscore.weight)[["adj"]]["TRUE","Percent"]*0.01)
}

variables.of.interest <- c("female", "married", "children", "immigrant",
                           "student")

library(xtable)

comp.tbl <- do.call("rbind", lapply(variables.of.interest, build.adjusted.rows))
rownames(comp.tbl) <- NULL
comp.tbl[,2:7] <-  comp.tbl[,2:7] * 100
colnames(comp.tbl) <- c("Variable", "Readers (Pew)", "Readers (UNM)",
                        "Editors US (UNM)", "Editors US Adj.",
                        "Editors (UNM)", "Editors Adj.")
print(xtable(comp.tbl, digits=c(NA,NA,1,1,1,1,1,1)),
      include.rownames=FALSE, floating=TRUE)

# Generate regression table
library(apsrtable)
apsrtable(m.svy)

## decide order of authors

set.seed(18)
sample(c("Aaron", "Mako"), 1)