#################################
## File 00_load_data.R ##########
#################################

setwd("~/wp-survey/")
library(foreign)

## load december 2008 pew data
##################################################################
# http://pewinternet.org/Data-Tools/Explore-Survey-Questions/Roper-Center.aspx?t=&sdate=mm/dd/yy&edate=mm/dd/yy&k=wikipedia
#
# Also, download the questionnaire file:
# http://www.pewinternet.org/~/media/Files/Data%20Sets/2008/December_2008_Health_Questionnaire.doc

pew.full <- read.spss("data/Pew_data/Pew-December_2008_Health.sav",
                      use.value.labels=TRUE, trim_values=TRUE,
                      use.missings=TRUE, to.data.frame=TRUE)

# recode pew.full data into forms that are more comparable
pew.full$female <- pew.full$sex == "Female"
pew.full$married <- pew.full$mar == "Married" | pew.full$mar == "Separated"
pew.full$children <- pew.full$par == "Yes"
pew.full$immigrant <- pew.full$imm1 != "In the U.S. (including Puerto Rico or other U.S. territory)"

pew.full$edu4 <- NA
pew.full$edu4[pew.full$educ == "None, or grades 1-8" |
              pew.full$educ == "High school incomplete (grades 9-11)"] <- "primary school"
pew.full$edu4[pew.full$educ == "High school graduate (grade 12 or GED certificate)" |
              pew.full$educ == "Technical, trade or vocational school AFTER high school"] <- "high school"
pew.full$edu4[pew.full$educ == "Some college, no 4-year degree (includes associate degree)" |
              pew.full$educ == "College graduate (B.S., B.A., or other 4-year degree)"] <- "college"
pew.full$edu4[pew.full$educ == "Post-graduate training/professional school after college (toward a Masters/Ph.D., Law or Medical school)"] <- "grad school"
pew.full$edu4[pew.full$educ == "Don't know/Refused"] <- NA
pew.full$edu4 <- ordered(pew.full$edu4,
                         c("primary school", "high school", "college", "grad school"))

# create student dichotomy
pew.full$student <- pew.full$stud %in% c("Yes, full-time", "Yes, part-time")

# These two questions allow us to select the subset of Wikipedia
# readers:
#
# act103a (asked to respondents who use the Internet but did not do so
# yesterday): Do you ever use the internet to -- Look for information
# on Wikipedia?
#
# act103b (asked to respondents who use the Internet and did so
# yesterday): Do you ever use the internet to -- Look for information
# on Wikipedia -- Did you happen to do this YESTERDAY, or not

# restrict to the dataset of people who answered either question
pew <- pew.full[!(is.na(pew.full$act103a) & is.na(pew.full$act103b)),]

# test number of positive readers
table(pew$act103a == "Yes, do this")
table(pew$act103b == "Yes, did this yesterday")
table(pew$act103b == "Yes, do this (but NOT yesterday)")

# restrict dataset to just people who have "looked for information on
# wikipedia" (we will compare them to "readers" in the unm data)
pew <- pew[(!is.na(pew$act103a) & pew$act103a == "Yes, do this") |
           (!is.na(pew$act103b) &
            (pew$act103b == "Yes, did this yesterday" |
             pew$act103b == "Yes, do this (but NOT yesterday)")),]

# print the total number of people in our sample of wikipedia readers
dim(pew)
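
# Illustrative sketch (not part of the original pipeline): the same reader
# filter expressed as a single logical vector, which makes the two-question
# restriction above easier to audit, plus the survey-weighted share of
# respondents who report looking for information on Wikipedia. This assumes
# the Pew file's "weight" column is the survey weight (it is used that way
# in 01_analysis.R below).
asked <- pew.full[!(is.na(pew.full$act103a) & is.na(pew.full$act103b)),]
wp.reader <- (!is.na(asked$act103a) & asked$act103a == "Yes, do this") |
             (!is.na(asked$act103b) &
              asked$act103b %in% c("Yes, did this yesterday",
                                   "Yes, do this (but NOT yesterday)"))
table(wp.reader)
weighted.mean(wp.reader, asked$weight, na.rm=TRUE)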

## load december 2008 WMF/UNU-MERIT data
##################################################################
#
# To get this data, you must email either the WMF or UNU-MERIT
# researchers for an anonymized dataset.

unm <- read.spss("data/wikipedia_survey_data.sav",
                 use.value.labels=TRUE, trim_values=TRUE,
                 use.missings=TRUE, to.data.frame=TRUE)

## CREATE WP SUBSET

# clean up the data
unm$A3_nationality[unm$A3_nationality == "no answer given"] <- NA
unm$A4_Residence[unm$A4_Residence == "no answer given"] <- NA

# mark as immigrant when nationality differs from country of residence
unm$immigrant <- (!(is.na(unm$A3_nationality) & is.na(unm$A4_Residence))) &
                 (as.character(unm$A3_nationality) != as.character(unm$A4_Residence))

# recode female, age, married, and children variables
unm$female <- unm$A1 == "female"

unm$age.group <- NA
unm$age.group[unm$A2 <= 12] <- "3-12"
unm$age.group[unm$A2 >= 13 & unm$A2 <= 17] <- "13-17"
unm$age.group[unm$A2 >= 18 & unm$A2 <= 34] <- "18-34"
unm$age.group[unm$A2 >= 35 & unm$A2 <= 49] <- "35-49"
unm$age.group[unm$A2 >= 50] <- "50+"
unm$age.group <- as.factor(unm$age.group)
unm$age <- unm$A2

unm$married <- NA
unm$married[unm$A9 == 1] <- TRUE
unm$married[unm$A9 == 2] <- FALSE

unm$children <- NA
unm$children[unm$A10 == 1] <- TRUE
unm$children[unm$A10 == 2] <- FALSE

# do a three way education split to compare to quantcast
unm$edu3 <- NA
unm$edu3[unm$A7 == 1 | unm$A7 == 2] <- "no college"
unm$edu3[unm$A7 == 3] <- "college"
unm$edu3[unm$A7 == 4 | unm$A7 == 5] <- "grad school"
unm$edu3 <- as.factor(unm$edu3)

# NOTE the UNU merit survey didn't allow you to select less than
# primary school completed!
unm$edu4 <- as.character(unm$edu3)
unm$edu4[unm$A7 == 1] <- "primary school"
unm$edu4[unm$A7 == 2] <- "high school"
unm$edu4 <- ordered(unm$edu4,
                    c("primary school", "high school", "college", "grad school"))

# student dichotomy
unm$student <- unm$A11 == 6

# save an intermediate file with the combined datasets
save(unm, pew, pew.full, file="data/combined_datasets.RData")


#################################
## File 01_analysis.R ###########
#################################

setwd("~/wp-survey/")

library(ggplot2)
library(descr)
library(dummies)
library(survey)  # for applying weights!

# load a file if a variable is missing
load.if.missing <- function (var.name, file.name) {
  if (!exists(var.name)) {
    load(file.name, parent.frame())

    # check to see if we're dealing with a data.table because, if we
    # are, we need to do some nasty back and forth
    if (class(eval(as.name(var.name)))[1] == "data.table") {
      library(data.table)
      # gnarly step that reloads and resorts things within the parent
      # frame to get around the bug in data.table
      assign(var.name,
             data.table(as.data.frame(eval(as.name(var.name))),
                        key=attr(eval(as.name(var.name)), "sorted")),
             parent.frame())
    }
  }
}

## load the combined dataset
##################################################################
load.if.missing("unm", "data/combined_datasets.RData")

## NEW ANALYSIS
##########################################################################

# create an appropriate subset of wp data
unm.readers <- unm[unm$activity_int == "readers",]
unm.editors <- unm[unm$activity_int == "contributors",]

unm.readers.us <- unm.readers[(!is.na(unm.readers$A4_Residence)) &
                              unm.readers$A4_Residence == "US",]
unm.editors.us <- unm.editors[(!is.na(unm.editors$A4_Residence)) &
                              unm.editors$A4_Residence == "US",]

# QS1. Are you under 18 years old, OR are you 18 or older?
# PEW ONLY CONTAINS 18+
unm.readers.us <- unm.readers.us[(!is.na(unm.readers.us$age)) &
                                 unm.readers.us$age >= 18,]
# Note - this means that we've subset the WP data as well.
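
# Quick sanity check (illustrative, not part of the original analysis):
# confirm the age restriction took effect and report the sizes of the
# four subsets defined above.
stopifnot(all(unm.readers.us$age >= 18))
sapply(list(unm.readers=unm.readers, unm.editors=unm.editors,
            unm.readers.us=unm.readers.us, unm.editors.us=unm.editors.us),
       nrow)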
variables.of.interest <- c("age", "female", "married", "children", "immigrant", "student", "edu4") u.sub <- unm.readers.us[,variables.of.interest] p.sub <- pew[,c(variables.of.interest, "weight")] table(complete.cases(u.sub)) table(complete.cases(p.sub)) # Here's where we set the weights to scale # # The UNU weights are kept at 1 and the Pew weights are multiplied by # 102138 (scaling roughly to 230 million) in order to scale those # responses to the full reference population of US # adults. Consequently, the regression models the likelihood of # volunteering for the UNU survey given the distribution of the # (weighted) covariates available from both studies. d <- rbind(cbind(u.sub, data.frame(survey=TRUE, weight=1)), cbind(p.sub[,!colnames(p.sub) == "weight"], data.frame(survey=FALSE, weight=102138*p.sub$weight))) d <- d[complete.cases(d),] dummify.edu4 <- function (x) { library(dummies) x.dummies <- as.data.frame(dummy(x$edu4)[,2:length(levels(x$edu4))]) colnames(x.dummies) <- paste("edu", levels(x$edu4)[2:length(levels(x$edu4))], sep=".") colnames(x.dummies) <- gsub(" ", ".", colnames(x.dummies)) cbind(x[,!colnames(x) == "edu4"], x.dummies) } d <- dummify.edu4(d) # original model without weights m <- glm(survey ~ age + female + married + children + immigrant + student + edu.high.school + edu.college + edu.grad.school, family=binomial("logit"), data=d) summary(m) m <- svyglm(survey ~ age + female + married + children + immigrant + student + edu.high.school + edu.college + edu.grad.school, family=binomial("logit"), design=d) summary(m) # model with weights following Valliant & Devers (2011) d.svy <- svydesign(ids=~1, data=d, weights=d$weight) m.svy <- svyglm(survey ~ age + female + married + children + immigrant + student + edu.high.school + edu.college + edu.grad.school, family=binomial("logit"), design=d.svy) summary(m.svy) gen.weighted.ed.sample <- function (x, m) { logistic <- function (x) { 1/(1+exp(-1 * x)) } x <- x[,variables.of.interest] x <- x[complete.cases(x),] x <- dummify.edu4(x) x$pred.prob <- logistic(apply(cbind(intercept=1, x), 1, function (y) {sum(y*coef(m))})) x$pscore.weight <- 1/x$pred.prob return(x) } ed.us <- gen.weighted.ed.sample(unm.editors.us, m.svy) ed.all <- gen.weighted.ed.sample(unm.editors, m.svy) compare.freq <- function (x, w) { options(descr.plot=FALSE) return(list(orig=freq(x), adj=freq(x,w))) } print.compare.freq <- function (x,w) { print(compare.freq(x,w)) } #generate the new weights derived from the propensity scores print.compare.freq(ed.us$female, ed.us$pscore.weight) print.compare.freq(ed.us$married, ed.us$pscore.weight) print.compare.freq(ed.us$immigrant, ed.us$pscore.weight) print.compare.freq(ed.us$edu.college, ed.us$pscore.weight) print.compare.freq(ed.all$female, ed.all$pscore.weight) print.compare.freq(ed.all$married, ed.all$pscore.weight) print.compare.freq(ed.all$immigrant, ed.all$pscore.weight) print.compare.freq(ed.all$edu.college, ed.all$pscore.weight) ## print tables ########################################################################## ## comparisons between wp survey and pew # WP survey contains all respondents 18 & older # Pew survey contains all respondents who use WP ################################################## build.adjusted.rows <- function (x) { data.frame(var.name=x, pew.prob.us=prop.table(table(pew[,x]))["TRUE"], unm.prop.us=prop.table(table(unm.readers.us[,x]))["TRUE"], # odds.ratio=exp(coef(m.svy)[paste(x, "TRUE", sep="")]), editors.prop.us=prop.table(table(unm.editors.us[,x]))["TRUE"], 

## print tables
##########################################################################

## comparisons between wp survey and pew
# WP survey contains all respondents 18 & older
# Pew survey contains all respondents who use WP
##################################################
build.adjusted.rows <- function (x) {
  data.frame(var.name=x,
             pew.prob.us=prop.table(table(pew[,x]))["TRUE"],
             unm.prop.us=prop.table(table(unm.readers.us[,x]))["TRUE"],
             # odds.ratio=exp(coef(m.svy)[paste(x, "TRUE", sep="")]),
             editors.prop.us=prop.table(table(unm.editors.us[,x]))["TRUE"],
             editors.scaled.us=compare.freq(ed.us[,x],
                                            ed.us$pscore.weight)[["adj"]]["TRUE","Percent"]*0.01,
             editors.prob.raw=prop.table(table(unm.editors[,x]))["TRUE"],
             editors.scaled=compare.freq(ed.all[,x],
                                         ed.all$pscore.weight)[["adj"]]["TRUE","Percent"]*0.01)
}

variables.of.interest <- c("female", "married", "children", "immigrant", "student")

library(xtable)

comp.tbl <- do.call("rbind", lapply(variables.of.interest, build.adjusted.rows))
rownames(comp.tbl) <- NULL
comp.tbl[,2:7] <- comp.tbl[,2:7] * 100
colnames(comp.tbl) <- c("Variable", "Readers (Pew)", "Readers (UNM)",
                        "Editors US (UNM)", "Editors US Adj.",
                        "Editors (UNM)", "Editors Adj.")

print(xtable(comp.tbl, digits=c(NA,NA,1,1,1,1,1,1)),
      include.rownames=FALSE, floating=TRUE)

# Generate regression table
library(apsrtable)
apsrtable(m.svy)

## decide order of authors
set.seed(18)
sample(c("Aaron", "Mako"), 1)