######################################################## # Script generating the statistics reported "The neurophysiology of language processing shapes the evolution of grammar: evidence from case-marking" # Written by Balthasar Bickel [2014-07-16] # Results are pasted in as comments ######################################################## ######################################################## # Libraries, helpers etc. install.packages('devtools') library(devtools) install_github("IVS-UZH/familybias", ref='793313ebe5441c80b1003bb9046d9a2c08c1bb80') # family bias estimates and extrapolations to small families library(familybias) library(parallel) # multicore processing (change mclapply below to lappy if not multicore processing is not required) library(ape) # used for tree manipulation library(geiger) # used for ML estimates on character evolution in trees library(vcd) # for plotting of results # function for creating pyhlo objects based on flat taxonomies: source_url('https://github.com/IVS-UZH/phylo-convert/raw/bf572f1e75aeff4b873793ec5d395e8e273d2f4f/phylo-convert.R') # small helper function for reading output from BayesTraits (last lines in each file) lastline <- function(filename) { out <- system(sprintf("wc -l %s",filename),intern=TRUE) n <- as.integer(sub(sprintf("[ ]*([0-9]+)[ ]%s",filename),"\\1",out)) scan(filename,what="",skip=n-1,nlines=1,sep="\n",quiet=TRUE) } ######################################################## # Data (without BibTeX keys) case.alignment.data.gg <- read.csv('bickeletal_S1_database.csv', header=T)[,-36] # some helpers for easy reference glottolog.taxa <- paste('Level',18:1, sep='') autotyp.taxa <- c("stock", "mbranch", "sbranch", "ssbranch", "lsbranch", "language") # summary counts of data length(unique(case.alignment.data.gg$LID)) #[1] 617 nrow(case.alignment.data.gg) #[1] 705 length(unique(case.alignment.data.gg$stock)) #[1] 156 length(unique(case.alignment.data.gg$Level18)) #[1] 144 ######################################################## # Estimates of biases within families # NOTE: results of estimations will always be a slightly different from each other because of random sampling #------------------------------------------------------- # 1. With Autotyp taxonomy: #------------------------------------------------------- # 1.1. with binomial tests (the default in familybias) #------------------------------------------------------- case.alignment.autotyp.fam <- familybias(df = case.alignment.data.gg, family.names = c(autotyp.taxa, 'subsystems'), r.name = 'acc', p.names = 'area', B = 10000, lapplyfunc = mclapply) # means of extrapolations: case.alignment.autotyp.fam.mean <- mean(case.alignment.autotyp.fam) # standard errors across extrapolations, saving them for later tabulation case.alignment.autotyp.fam.freqs <- lapply(case.alignment.autotyp.fam$extrapolations,function(x) {as.data.frame(xtabs(~area+majority.response,x))}) case.alignment.autotyp.fam.freqs.df <- do.call(cbind, lapply(case.alignment.autotyp.fam.freqs, "[", 3)) case.alignment.autotyp.fam.se <- data.frame(case.alignment.autotyp.fam.freqs[[1]][,1:2],SE=apply(case.alignment.autotyp.fam.freqs.df[,-c(1:2)],1,function(x) sd(x)/sqrt(length(x)))) case.alignment.autotyp.fam.mean.biases <- droplevels(subset(case.alignment.autotyp.fam.mean, !majority.response %in% 'diverse')) case.alignment.autotyp.fam.mean.biases$Freq <- round(case.alignment.autotyp.fam.mean.biases$Freq) # Loglinear Analysis: full <- glm(Freq~majority.response+area+majority.response:area, case.alignment.autotyp.fam.mean.biases, family='poisson') indep <- glm(Freq~majority.response+area, case.alignment.autotyp.fam.mean.biases, family='poisson') anova(full, indep, test='Chi') # Analysis of Deviance Table # # Model 1: Freq ~ majority.response + area + majority.response:area # Model 2: Freq ~ majority.response + area # Resid. Df Resid. Dev Df Deviance Pr(>Chi) # 1 0 0.000 # 2 4 15.809 -4 -15.809 0.003286 ** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 area.only <- glm(Freq~area, case.alignment.autotyp.fam.mean.biases, family='poisson') anova(area.only, full, test='Chi') # Analysis of Deviance Table # # Model 1: Freq ~ area # Model 2: Freq ~ majority.response + area + majority.response:area # Resid. Df Resid. Dev Df Deviance Pr(>Chi) # 1 5 121.84 # 2 0 0.00 5 121.84 < 2.2e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Factorial analysis, resolving the interaction by area: apply(xtabs(Freq~.,case.alignment.autotyp.fam.mean.biases), 1 , binom.test, alternative='gr') # $Africa # # Exact binomial test # # data: newX[, i] # number of successes = 29, number of trials = 29, p-value = 1.863e-09 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.9018554 1.0000000 # sample estimates: # probability of success # 1 # # # $Eurasia # # Exact binomial test # # data: newX[, i] # number of successes = 24, number of trials = 27, p-value = 2.462e-05 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.7372612 1.0000000 # sample estimates: # probability of success # 0.8888889 # # # $`N/C America` # # Exact binomial test # # data: newX[, i] # number of successes = 35, number of trials = 35, p-value = 2.91e-11 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.9179684 1.0000000 # sample estimates: # probability of success # 1 # # # $Pacific # # Exact binomial test # # data: newX[, i] # number of successes = 19, number of trials = 24, p-value = 0.003305 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.6108607 1.0000000 # sample estimates: # probability of success # 0.7916667 # # # $`S America` # # Exact binomial test # # data: newX[, i] # number of successes = 10, number of trials = 12, p-value = 0.01929 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.5618946 1.0000000 # sample estimates: # probability of success # 0.8333333 #------------------------------------------------------- # 1.2. estimating biases in large non-uniform families using ML as implemented in geiger::FitDiscrete and BayesTraits and using MCMC as implemented in BayesTraits rather than using a binomial test as implemented in familybias #------------------------------------------------------- # first, extract relevant families: case.alignment.autotyp.fam.big <- droplevels(subset(case.alignment.autotyp.fam$large.families.estimate[order(case.alignment.autotyp.fam$large.families.estimate$family.name), ], majority.prop<1 | is.na(majority.prop))) # NOTE familybias labels Finisterre and Binanderean as mbranches because there no other mbranches in their stocks, but they are the full data that we have for the larger stocks 'Finisterre-Huon' and 'Greater Binanderean' in our dataset. stocks <- c(as.character(subset(case.alignment.autotyp.fam$large.families.estimate[order(case.alignment.autotyp.fam$large.families.estimate$family.name), ], (majority.prop<1 | is.na(majority.prop)) & taxonomic.level %in% 'stock')$family.name), 'Finisterre-Huon','Greater Binanderean') # prepare trees, adding Oceanic which is not extracted by calling stocks above because it is an sbranch, not a stock big.case.alignment.autotyp.phylo <- as.phylo(case.alignment.data.gg[case.alignment.data.gg$stock %in% stocks | case.alignment.data.gg$sbranch %in% 'Oceanic', c(autotyp.taxa, 'subsystems')]) # Note that this conversion labels Oceanic as "Austronesian" because phylo keeps the entire node hierarchy in store. But the data is really just the subtree: plot(collapse.singles(big.case.alignment.autotyp.phylo[["Austronesian"]])) # # now prepare data: big.case.alignment.autotyp.data <- case.alignment.data.gg[case.alignment.data.gg$stock %in% stocks | case.alignment.data.gg$sbranch %in% 'Oceanic', c('subsystems', 'acc')] # get ML estimates with fitDiscrete big.case.autotyp.ml <- do.call(rbind, mclapply(big.case.alignment.autotyp.phylo, function(x) { current.tree <- multi2di(collapse.singles(x)) # clean up for fitDiscrete current.data <- sapply(current.tree$tip.label, function(i) big.case.alignment.autotyp.data$acc[big.case.alignment.autotyp.data$subsystems==i]) current.ard <- fitDiscrete(current.tree, current.data, model='ARD')$opt current.er <- fitDiscrete(current.tree, current.data, model='ER')$opt lr <- 2*(current.ard$lnL-current.er$lnL) p <- 1-pchisq(2*(current.ard$lnL-current.er$lnL),1) data.frame(ARD.qAE=current.ard$q12, ARD.qEA=current.ard$q21, ARD.lnL=current.ard$lnL, ER.q=current.er$q12, ER.lnL=current.er$lnL, lr.ml=lr, lr.ml.p=p) })) # write out the data and the trees for analysis by BayesTraits x <- lapply(big.case.alignment.autotyp.phylo, function(x) { fam <- gsub('\\s','',x$node.label[1]) current.tree <- collapse.singles(x) current.tree$node.label <- NULL # BayesTraits doesn't like node labels current.tree$tip.label <- gsub('\\s','',current.tree$tip.label) big.case.alignment.autotyp.data$subsystems2 <- gsub('\\s','',big.case.alignment.autotyp.data$subsystems) current.data <- sapply(current.tree$tip.label, function(i) big.case.alignment.autotyp.data$acc[big.case.alignment.autotyp.data$subsystems2==i]) current.data.df <-data.frame(names(current.data), current.data) write.nexus(current.tree, file=paste(fam, ".tree", sep = "")) write.table(current.data.df, file=paste(fam, ".data", sep = ""), row.names = F, quote = F, col.names = F, sep = "\t") }) # run as ./BayesTraits Austronesian.tree Austronesian.data < commands.ER.txt Austronesian.ER.txt etc as per the BayesTraits manual specs # # Read in results from BayesTraits ML: big.case.autotyp.ms.er.ml <- as.data.frame(do.call(rbind, sapply(dir(pattern='^[[:upper:]].*ER.log.txt'),function(l) { x <- lastline(l) rows <- strsplit(x, "\t") } ))) names(big.case.autotyp.ms.er.ml) <- c("TreeNo", "Lh", "qAE", "qEA", "RootP(A)", "RootP(E)") big.case.autotyp.ms.ard.ml <- as.data.frame(do.call(rbind, sapply(dir(pattern='^[[:upper:]].*ARD.log.txt'),function(l) { x <- lastline(l) rows <- strsplit(x, "\t") } ))) names(big.case.autotyp.ms.ard.ml) <- c("TreeNo", "Lh", "qAE", "qEA", "RootP(A)", "RootP(E)") # collect results in one dataframe for later tabulation big.case.autotyp.ms.ml <- data.frame(fam=gsub('\\.ER\\.txt','',rownames(big.case.autotyp.ms.er.ml)), lr.ml.ms=2*(as.numeric(as.character(big.case.autotyp.ms.ard.ml$Lh))-as.numeric(as.character(big.case.autotyp.ms.er.ml$Lh)))) big.case.autotyp.ms.ml$lr.ml.ms.p <- round(1-pchisq(big.case.autotyp.ms.ml$lr.ml.ms,1),5) # from BayesTraits MCMC: big.case.autotyp.er.mcmc <- as.data.frame(do.call(rbind, sapply(dir(pattern='^[[:upper:]].*ER.MCMC.log.txt'),function(l) { x <- secondlastline(l) rows <- strsplit(x, "\t") } ))) names(big.case.autotyp.er.mcmc) <- c("Iteration", "Lh", "HarmonicMean", "TreeNo", "qAE", "qEA", "RootP(A)", "RootP(E)") big.case.autotyp.ard.mcmc <- as.data.frame(do.call(rbind, sapply(dir(pattern='^[[:upper:]].*ARD.MCMC.log.txt'),function(l) { x <- secondlastline(l) rows <- strsplit(x, "\t") } ))) names(big.case.autotyp.ard.mcmc) <- c("Iteration", "Lh", "HarmonicMean", "TreeNo", "qAE", "qEA", "RootP(A)", "RootP(E)") big.case.autotyp.mcmc <- data.frame(fam=gsub('\\.ARD\\.MCMC\\.txt','',rownames(big.case.autotyp.ard.mcmc)), lr.mcmc=2*(as.numeric(as.character(big.case.autotyp.ard.mcmc$HarmonicMean))-as.numeric(as.character(big.case.autotyp.er.mcmc$HarmonicMean)))) # Summary table, for comparison of results: big.case.autotyp.summary <- data.frame(case.alignment.autotyp.fam.big, binom.p=round(sapply(case.alignment.autotyp.fam.big.p, function(x) x$p.value),5), apply(big.case.autotyp.ml,2,round,5)[,6:7], big.case.autotyp.ms.ml[,2:3],lr.mcmc=big.case.autotyp.mcmc[,2]) # From mcmc-lex-based trees for Austronesian and Indo-European (from Dunn et al. 2011) # Note for this we have to treat polymorphisms as uncertainties because trees are based on whole languages because # Matching between language names was done by hand as there were no ISO oder Glottolog Codes and sampling was very different. # Austronesian languages with data in both trees and our database (names from the trees, data from database): # language acc # ArakiSouthwestSanto A # Bali A # FutunaEast E # FutunaWest A # Hawaiian A # Ilokano A # Javanese A # Lamaholot A # Madurese A # Maori A # Mwotlap A # Nalik A # Nias E # Seimat A # Siar A # # Indo-European languages with data in both trees and our databases (names from the trees, plus ID from the tree): # language acc ID # Albanian_C A 3 # Armenian_Mod A 17 # Bulgarian A 42 # Catalan A 77 # Cornish A 92 # English_ST A 59 # French A 85 # German_ST A 75 # Greek_D A 20 # Hindi AE 31 # Hittite AE 0 # Italian A 88 # Marathi AE 37 # Ossetic A 15 # Persian_List A 12 # Provencal A 86 # Russian A 44 # Sardinian_C A 81 # Serbocroatian A 41 # Spanish A 80 # Tadzik A 11 # # final table with estimates from all tree-based methods, adding the results from the MCMC analysis on the lexical trees by hand: big.case.autotyp.summary$lex.mcmc <- NA big.case.autotyp.summary$lex.mcmc[big.case.autotyp.summary$family.name %in% 'Oceanic'] <- 5.262682 big.case.autotyp.summary$lex.mcmc[big.case.autotyp.summary$family.name %in% 'Indo-European'] <- 6.495412 # Now, we re-run extrapolations, based on the tree-based estimates. We do this by feeeding overrides into the extrapolation algorithm: # a template for this is bias.override.autotyp.template <- apply(case.alignment.autotyp.fam.big, 1, function(r) { mj.p <- as.numeric("["(r,'majority.prop')) names(mj.p) <- NULL bias <- "["(r,'majority.response') bias <- ifelse(bias %in% 'diverse', NA, bias) names(bias) <- NULL list(mj.prop=mj.p, bias.val=bias)}) # fill the template with the appropriate values. See big.case.autotyp.summary or Table S2. names(bias.override.autotyp.template) <- case.alignment.autotyp.fam.big$family.name bias.override.autotyp.ml.p <- bias.override.autotyp.template bias.override.autotyp.ml.p[['Oceanic']]$mj.prop <- NA bias.override.autotyp.ml.p[['Oceanic']]$bias.val <- NA bias.override.autotyp.ml.p[['Madang']]$mj.prop <- NA bias.override.autotyp.ml.p[['Madang']]$bias.val <- NA case.alignment.autotyp.fam.ml.p <- familybias(df = case.alignment.data.gg, family.names = c(autotyp.taxa, 'subsystems'), r.name = 'acc', p.names = 'area', B = 10000, lapplyfunc = mclapply, bias.override=bias.override.autotyp.ml.p) # case.alignment.autotyp.fam.ml.p$prior # area p.bias p.bias.95.cred p.deviate # 1 Africa 0.9285714 0.75, 1 0.00000000 # 2 Eurasia 0.7500000 0.48, 0.94 0.02871621 # 3 N/C America 0.8750000 0.59, 1 0.00000000 # 4 Pacific 0.3750000 0.16, 0.62 0.00000000 # 5 S America 0.5000000 0.15, 0.85 0.00000000 # mean across extrapolations case.alignment.autotyp.fam.ml.p.mean <- mean(case.alignment.autotyp.fam.ml.p) # std. error across extrapolations for later tabulation case.alignment.autotyp.fam.ml.p.freqs <- lapply(case.alignment.autotyp.fam.ml.p$extrapolations,function(x) {as.data.frame(xtabs(~area+majority.response,x))}) case.alignment.autotyp.fam.ml.p.freqs.df <- do.call(cbind, lapply(case.alignment.autotyp.fam.ml.p.freqs, "[", 3)) case.alignment.autotyp.fam.ml.p.se <- data.frame(case.alignment.autotyp.fam.ml.p.freqs[[1]][,1:2],SE=apply(case.alignment.autotyp.fam.ml.p.freqs.df[,-c(1:2)],1,function(x) sd(x)/sqrt(length(x)))) # loglinear analysis case.alignment.autotyp.fam.ml.p.mean.biases <- droplevels(subset(case.alignment.autotyp.fam.ml.p.mean, !majority.response %in% 'diverse')) case.alignment.autotyp.fam.ml.p.mean.biases$Freq <- round(case.alignment.autotyp.fam.ml.p.mean.biases$Freq) full <- glm(Freq~majority.response+area+majority.response:area, case.alignment.autotyp.fam.ml.p.mean.biases, family='poisson') indep <- glm(Freq~majority.response+area, case.alignment.autotyp.fam.ml.p.mean.biases, family='poisson') anova(full, indep, test='Chi') # Analysis of Deviance Table # # Model 1: Freq ~ majority.response + area + majority.response:area # Model 2: Freq ~ majority.response + area # Resid. Df Resid. Dev Df Deviance Pr(>Chi) # 1 0 0.000 # 2 4 15.368 -4 -15.368 0.003996 ** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 area.only <- glm(Freq~area, case.alignment.autotyp.fam.ml.p.mean.biases, family='poisson') anova(area.only, full, test='Chi') # Analysis of Deviance Table # # Model 1: Freq ~ area # Model 2: Freq ~ majority.response + area + majority.response:area # Resid. Df Resid. Dev Df Deviance Pr(>Chi) # 1 5 119.02 # 2 0 0.00 5 119.02 < 2.2e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Factorial analysis, resolving the interaction by area: apply(xtabs(Freq~.,case.alignment.autotyp.fam.ml.p.mean.biases), 1 , binom.test, alternative='gr') # $Africa # # Exact binomial test # # data: newX[, i] # number of successes = 29, number of trials = 29, p-value = 1.863e-09 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.9018554 1.0000000 # sample estimates: # probability of success # 1 # # # $Eurasia # # Exact binomial test # # data: newX[, i] # number of successes = 24, number of trials = 27, p-value = 2.462e-05 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.7372612 1.0000000 # sample estimates: # probability of success # 0.8888889 # # # $`N/C America` # # Exact binomial test # # data: newX[, i] # number of successes = 35, number of trials = 35, p-value = 2.91e-11 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.9179684 1.0000000 # sample estimates: # probability of success # 1 # # # $Pacific # # Exact binomial test # # data: newX[, i] # number of successes = 14, number of trials = 18, p-value = 0.01544 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.5611172 1.0000000 # sample estimates: # probability of success # 0.7777778 # # # $`S America` # # Exact binomial test # # data: newX[, i] # number of successes = 10, number of trials = 12, p-value = 0.01929 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.5618946 1.0000000 # sample estimates: # probability of success # 0.8333333 ######################################################## #------------------------------------------------------- # 2. With Glottolog taxonomy: #------------------------------------------------------- # 2.1. with binomial tests (the default in familybias) #------------------------------------------------------- case.alignment.glottolog.fam <- familybias(df = case.alignment.data.gg, family.names = c(glottolog.taxa, 'subsystems'), r.name = 'acc', p.names = 'area', B = 10000, lapplyfunc = mclapply) case.alignment.glottolog.fam.mean <- mean(case.alignment.glottolog.fam) case.alignment.glottolog.fam.freqs <- lapply(case.alignment.glottolog.fam$extrapolations,function(x) {as.data.frame(xtabs(~area+majority.response,x))}) case.alignment.glottolog.fam.freqs.df <- do.call(cbind, lapply(case.alignment.glottolog.fam.freqs, "[", 3)) # summaries for later case.alignment.glottolog.fam.se <- data.frame(case.alignment.glottolog.fam.freqs[[1]][,1:2],SE=apply(case.alignment.glottolog.fam.freqs.df[,-c(1:2)],1,function(x) sd(x)/sqrt(length(x)))) case.alignment.glottolog.fam.mean.biases <- droplevels(subset(case.alignment.glottolog.fam.mean, !majority.response %in% 'diverse')) case.alignment.glottolog.fam.mean.biases$Freq <- round(case.alignment.glottolog.fam.mean.biases$Freq) # loglinear analysis full <- glm(Freq~majority.response+area+majority.response:area, case.alignment.glottolog.fam.mean.biases, family='poisson') indep <- glm(Freq~majority.response+area, case.alignment.glottolog.fam.mean.biases, family='poisson') anova(full, indep, test='Chi') # Analysis of Deviance Table # # Model 1: Freq ~ majority.response + area + majority.response:area # Model 2: Freq ~ majority.response + area # Resid. Df Resid. Dev Df Deviance Pr(>Chi) # 1 0 0.000 # 2 4 21.296 -4 -21.296 0.0002766 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 area.only <- glm(Freq~area, case.alignment.glottolog.fam.mean.biases, family='poisson') anova(area.only, full, test='Chi') # Analysis of Deviance Table # # Model 1: Freq ~ area # Model 2: Freq ~ majority.response + area + majority.response:area # Resid. Df Resid. Dev Df Deviance Pr(>Chi) # 1 5 139.53 # 2 0 0.00 5 139.53 < 2.2e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Factorial analysis, resolving the interaction by areas: apply(xtabs(Freq~.,case.alignment.glottolog.fam.mean.biases), 1 , binom.test, alternative='gr') # $Eurasia # # Exact binomial test # # data: newX[, i] # number of successes = 24, number of trials = 27, p-value = 2.462e-05 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.7372612 1.0000000 # sample estimates: # probability of success # 0.8888889 # # # $Pacific # # Exact binomial test # # data: newX[, i] # number of successes = 21, number of trials = 27, p-value = 0.002962 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.6079016 1.0000000 # sample estimates: # probability of success # 0.7777778 # # # $Africa # # Exact binomial test # # data: newX[, i] # number of successes = 42, number of trials = 42, p-value = 2.274e-13 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.9311574 1.0000000 # sample estimates: # probability of success # 1 # # # $`N/C America` # # Exact binomial test # # data: newX[, i] # number of successes = 36, number of trials = 36, p-value = 1.455e-11 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.9201535 1.0000000 # sample estimates: # probability of success # 1 # # # $`S America` # # Exact binomial test # # data: newX[, i] # number of successes = 10, number of trials = 13, p-value = 0.04614 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.5053503 1.0000000 # sample estimates: # probability of success # 0.7692308 #------------------------------------------------------- # 1.2. estimating biases in large non-uniform families using ML as implemented in geiger::FitDiscrete and BayesTraits and using MCMC as implemented in BayesTraits rather than using a binomial test as implemented in familybias #------------------------------------------------------- # same procedure as above, but now with glottolog taxonomy case.alignment.glottolog.fam.big <- droplevels(subset(case.alignment.glottolog.fam$large.families.estimate[order(case.alignment.glottolog.fam$large.families.estimate$family.name), ], majority.prop<1 | is.na(majority.prop))) # fix taxonomic levels level18.glot <- subset(case.alignment.glottolog.fam.big, taxonomic.level %in% 'Level18')$family.name # Prepare trees, adding Central-Eastern Malayo-Polynesian: big.case.alignment.glottolog.phylo <- as.phylo(case.alignment.data.gg[case.alignment.data.gg$Level15 %in% "'Central-Eastern Malayo-Polynesian [cent2237]'"| case.alignment.data.gg$Level18 %in% level18.glot, c(glottolog.taxa, 'subsystems')]) # Note that this conversion gets Central-Eastern Malayo-Polynesian with name "Austronesian" because phylo keeps the entire node hierarchy in store. But the data is really just the subtree: plot(collapse.singles(big.case.alignment.glottolog.phylo[[10]])) # prepare data: big.case.alignment.glottolog.data <- case.alignment.data.gg[case.alignment.data.gg$Level15 %in% "'Central-Eastern Malayo-Polynesian [cent2237]'"| case.alignment.data.gg$Level18 %in% level18.glot, c('subsystems', 'acc')] # ML estimates using FitDiscrete: big.case.glottolog.ml <- do.call(rbind, mclapply(big.case.alignment.glottolog.phylo, function(x) { current.tree <- multi2di(collapse.singles(x)) # clean up for fitDiscrete current.data <- sapply(current.tree$tip.label, function(i) big.case.alignment.glottolog.data$acc[big.case.alignment.glottolog.data$subsystems==i]) current.ard <- fitDiscrete(current.tree, current.data, model='ARD')$opt current.er <- fitDiscrete(current.tree, current.data, model='ER')$opt lr <- 2*(current.ard$lnL-current.er$lnL) p <- 1-pchisq(2*(current.ard$lnL-current.er$lnL),1) data.frame(ARD.qAE=current.ard$q12, ARD.qEA=current.ard$q21, ARD.lnL=current.ard$lnL, ER.q=current.er$q12, ER.lnL=current.er$lnL, lr.ml=lr, lr.ml.p=p) })) # export for BayesTraits x <- mclapply(big.case.alignment.glottolog.phylo, function(x) { fam <- gsub("\\'|\\s","",x$node.label[1]) # BayesTraits doesn't like quotation marks or spaces in names and the brackets are a nuisance as well. fam <- gsub("\\[|\\]",".",fam) current.tree <- collapse.singles(x) current.tree$node.label <- NULL # BayesTraits doesn't like node labels current.tree$tip.label <- gsub('\\s','',current.tree$tip.label) big.case.alignment.glottolog.data$subsystems.clean <- gsub('\\s','',big.case.alignment.glottolog.data$subsystems) current.data <- sapply(current.tree$tip.label, function(i) big.case.alignment.glottolog.data$acc[big.case.alignment.glottolog.data$subsystems.clean==i]) current.data.df <-data.frame(names(current.data), current.data) write.nexus(current.tree, file=paste(fam, "tree", sep = "")) write.table(current.data.df, file=paste(fam, "data", sep = ""), row.names = F, quote = F, col.names = F, sep = "\t") }) # read results big.case.glottolog.ms.er.ml <- as.data.frame(do.call(rbind, sapply(dir(pattern='.*ER.log.txt'),function(l) { x <- lastline(l) rows <- strsplit(x, "\t") } ))) names(big.case.glottolog.ms.er.ml) <- c("TreeNo", "Lh", "qAE", "qEA", "RootP(A)", "RootP(E)") big.case.glottolog.ms.ard.ml <- as.data.frame(do.call(rbind, sapply(dir(pattern='.*ARD.log.txt'),function(l) { x <- lastline(l) rows <- strsplit(x, "\t") } ))) names(big.case.glottolog.ms.ard.ml) <- c("TreeNo", "Lh", "qAE", "qEA", "RootP(A)", "RootP(E)") big.case.glottolog.ms.ml <- data.frame(fam=gsub('\\.ER\\.out\\.txt','',rownames(big.case.glottolog.ms.er.ml)), lr.ml.ms=2*(as.numeric(as.character(big.case.glottolog.ms.ard.ml$Lh))-as.numeric(as.character(big.case.glottolog.ms.er.ml$Lh)))) big.case.glottolog.ms.ml$lr.ml.ms.p <- round(1-pchisq(big.case.glottolog.ms.ml$lr.ml.ms,1),5) # read results from from BayesTraits MCMC: big.case.glottolog.er.mcmc <- as.data.frame(do.call(rbind, sapply(dir(pattern='.*ER.MCMC.log.txt'),function(l) { x <- secondlastline(l) rows <- strsplit(x, "\t") } ))) names(big.case.glottolog.er.mcmc) <- c("Iteration", "Lh", "HarmonicMean", "TreeNo", "qAE", "qEA", "RootP(A)", "RootP(E)") big.case.glottolog.ard.mcmc <- as.data.frame(do.call(rbind, sapply(dir(pattern='.*ARD.MCMC.log.txt'),function(l) { x <- secondlastline(l) rows <- strsplit(x, "\t") } ))) names(big.case.glottolog.ard.mcmc) <- c("Iteration", "Lh", "HarmonicMean", "TreeNo", "qAE", "qEA", "RootP(A)", "RootP(E)") # collect in a table: big.case.glottolog.mcmc <- data.frame(fam=gsub('\\.ARD\\.MCMC\\.out.txt','',rownames(big.case.glottolog.ard.mcmc)), lr.mcmc=2*(as.numeric(as.character(big.case.glottolog.ard.mcmc$HarmonicMean))-as.numeric(as.character(big.case.glottolog.er.mcmc$HarmonicMean)))) # Summary table, for comparison of results: big.case.glottolog.summary <- data.frame(case.alignment.glottolog.fam.big, binom.p=round(sapply(case.alignment.glottolog.fam.big.p, function(x) x$p.value),5), apply(big.case.glottolog.ml,2,round,5)[order(rownames(big.case.glottolog.ml)),6:7], big.case.glottolog.ms.ml[,2:3], lr.mcmc=big.case.glottolog.mcmc[,2]) # final table: big.case.glottolog.summary$lex.mcmc <- NA big.case.glottolog.summary$lex.mcmc[big.case.glottolog.summary$family.name %in% "'Central-Eastern Malayo-Polynesian [cent2237]'"] <- 5.262682 big.case.glottolog.summary$lex.mcmc[big.case.glottolog.summary$family.name %in% "'Indo-European [indo1319]'"] <- 6.495412 # re-run extrapolations. Again, the MCMC-based estimates agree with the binomial tests (See big.case.glottolog.summary and Table S3), so we perform override analysis only for the ML-based estimates, where Malayo-Polynesian doesn't show an effect at an alpha=.05 level bias.override.glottolog.template <- apply(case.alignment.glottolog.fam.big, 1, function(r) { mj.p <- as.numeric("["(r,'majority.prop')) names(mj.p) <- NULL bias <- "["(r,'majority.response') bias <- ifelse(bias %in% 'diverse', NA, bias) names(bias) <- NULL list(mj.prop=mj.p, bias.val=bias)}) names(bias.override.glottolog.template) <- case.alignment.glottolog.fam.big$family.name bias.override.glottolog.ml.p <- bias.override.glottolog.template bias.override.glottolog.ml.p[["'Central-Eastern Malayo-Polynesian [cent2237]'"]]$mj.prop <- NA bias.override.glottolog.ml.p[["'Central-Eastern Malayo-Polynesian [cent2237]'"]]$mj.val <- NA # re-run familybias with these estimates: case.alignment.glottolog.fam.ml.p <- familybias(df = case.alignment.data.gg, family.names = c(glottolog.taxa, 'subsystems'), r.name = 'acc', p.names = 'area', B = 10000, lapplyfunc = mclapply, bias.override=bias.override.glottolog.ml.p) # summaries case.alignment.glottolog.fam.ml.p.mean <- mean(case.alignment.glottolog.fam.ml.p) case.alignment.glottolog.fam.ml.p.freqs <- lapply(case.alignment.glottolog.fam.ml.p$extrapolations,function(x) {as.data.frame(xtabs(~area+majority.response,x))}) case.alignment.glottolog.fam.ml.p.freqs.df <- do.call(cbind, lapply(case.alignment.glottolog.fam.ml.p.freqs, "[", 3)) case.alignment.glottolog.fam.ml.p.se <- data.frame(case.alignment.glottolog.fam.ml.p.freqs[[1]][,1:2],SE=apply(case.alignment.glottolog.fam.ml.p.freqs.df[,-c(1:2)],1,function(x) sd(x)/sqrt(length(x)))) # loglinear analysis case.alignment.glottolog.fam.ml.p.mean.biases <- droplevels(subset(case.alignment.glottolog.fam.ml.p.mean, !majority.response %in% 'diverse')) case.alignment.glottolog.fam.ml.p.mean.biases$Freq <- round(case.alignment.glottolog.fam.ml.p.mean.biases$Freq) full <- glm(Freq~majority.response+area+majority.response:area, case.alignment.glottolog.fam.ml.p.mean.biases, family='poisson') indep <- glm(Freq~majority.response+area, case.alignment.glottolog.fam.ml.p.mean.biases, family='poisson') anova(full, indep, test='Chi') # Analysis of Deviance Table # # Model 1: Freq ~ majority.response + area + majority.response:area # Model 2: Freq ~ majority.response + area # Resid. Df Resid. Dev Df Deviance Pr(>Chi) # 1 0 0.000 # 2 4 21.296 -4 -21.296 0.0002766 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 area.only <- glm(Freq~area, case.alignment.glottolog.fam.ml.p.mean.biases, family='poisson') anova(area.only, full, test='Chi') # Analysis of Deviance Table # # Model 1: Freq ~ area # Model 2: Freq ~ majority.response + area + majority.response:area # Resid. Df Resid. Dev Df Deviance Pr(>Chi) # 1 5 139.53 # 2 0 0.00 5 139.53 < 2.2e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Factorial analysis, resolving interaction by areas: apply(xtabs(Freq~.,case.alignment.glottolog.fam.ml.p.mean.biases), 1 , binom.test, alternative='gr') # $Eurasia # # Exact binomial test # # data: newX[, i] # number of successes = 24, number of trials = 27, p-value = 2.462e-05 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.7372612 1.0000000 # sample estimates: # probability of success # 0.8888889 # # # $Pacific # # Exact binomial test # # data: newX[, i] # number of successes = 21, number of trials = 27, p-value = 0.002962 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.6079016 1.0000000 # sample estimates: # probability of success # 0.7777778 # # # $Africa # # Exact binomial test # # data: newX[, i] # number of successes = 42, number of trials = 42, p-value = 2.274e-13 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.9311574 1.0000000 # sample estimates: # probability of success # 1 # # # $`N/C America` # # Exact binomial test # # data: newX[, i] # number of successes = 36, number of trials = 36, p-value = 1.455e-11 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.9201535 1.0000000 # sample estimates: # probability of success # 1 # # # $`S America` # # Exact binomial test # # data: newX[, i] # number of successes = 10, number of trials = 13, p-value = 0.04614 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.5053503 1.0000000 # sample estimates: # probability of success # 0.7692308 # ################################ # Grand summaries, tables and plot # Much here is just about proper ordering of results # Large family estimates area.order <- c('Africa','Eurasia','Pacific','N/C America','S America') at <- case.alignment.autotyp.fam$prior[order(factor(case.alignment.autotyp.fam$prior$area, levels=area.order)),] at.ml <- case.alignment.autotyp.fam.ml.p$prior[order(factor(case.alignment.autotyp.fam.ml.p$prior$area, levels=area.order)),] gl <- case.alignment.glottolog.fam$prior[order(factor(case.alignment.glottolog.fam$prior$area, levels=area.order)),] gl.ml <- case.alignment.glottolog.fam.ml.p$prior[order(factor(case.alignment.glottolog.fam.ml.p$prior$area, levels=area.order)),] all.priors <- data.frame(area=at$area, autotyp.bias=gsub('c', ' ', paste(round(at$p.bias,2),at$p.bias.95.cred, sep='')), autotyp.ml.bias=gsub('c', ' ', paste(round(at.ml$p.bias,2),at.ml$p.bias.95.cred, sep='')), glottolog.bias=gsub('c', ' ', paste(round(gl$p.bias,2),gl$p.bias.95.cred, sep='')), glottolog.ml.bias=gsub('c', ' ', paste(round(gl.ml$p.bias,2),gl.ml$p.bias.95.cred, sep=''))) # deviation probabilities: all.dev <- data.frame(area=at$area, autotyp=at$p.deviate, autotyp.ml=at.ml$p.deviate, glottolog=gl$p.deviate, glottolog.ml=gl.ml$p.deviate) # Analysis of the mean results across methods and taxonomic assumptions: all.estimates <- data.frame(case.alignment.autotyp.fam.mean[order(factor(case.alignment.autotyp.fam.mean$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America'))),1:2], autotyp.binom.and.mcmc=case.alignment.autotyp.fam.mean$Freq[order(factor(case.alignment.glottolog.fam.mean$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')))], autotyp.binom.and.mcmc.SE=case.alignment.autotyp.fam.se$SE[order(factor(case.alignment.autotyp.fam.se$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')),factor(case.alignment.autotyp.fam.se$majority.response, levels=c('diverse','A','E')))], autotyp.ml=case.alignment.autotyp.fam.ml.p.mean$Freq[order(factor(case.alignment.autotyp.fam.ml.p.mean$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')))], autotyp.ml.SE=case.alignment.autotyp.fam.ml.p.se$SE[order(factor(case.alignment.autotyp.fam.ml.p.se$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')),factor(case.alignment.autotyp.fam.ml.p.se$majority.response, levels=c('diverse','A','E')))], glottolog.binom.and.mcmc=case.alignment.glottolog.fam.mean$Freq[order(factor(case.alignment.glottolog.fam.mean$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')))], glottolog.binom.and.mcmc.SE=case.alignment.glottolog.fam.se$SE[order(factor(case.alignment.glottolog.fam.se$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')),factor(case.alignment.glottolog.fam.se$majority.response, levels=c('diverse','A','E')))], glottolog.ml=case.alignment.glottolog.fam.ml.p.mean$Freq[order(factor(case.alignment.glottolog.fam.ml.p.mean$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')))], glottolog.ml.SE=case.alignment.glottolog.fam.ml.p.se$SE[order(factor(case.alignment.glottolog.fam.ml.p.se$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')),factor(case.alignment.glottolog.fam.ml.p.se$majority.response, levels=c('diverse','A','E')))] ) # stats on grand totals (as reported in the main text): all.estimates$Freq <- rowMeans(all.estimates[,c('autotyp.binom.and.mcmc','autotyp.ml','glottolog.binom.and.mcmc','glottolog.ml')]) addmargins(xtabs(Freq~majority.response,all.estimates)) # majority.response # diverse A E Sum # 47.67018 123.21173 10.61810 181.50000 all.estimates.biases <- droplevels(subset(all.estimates, !majority.response %in% 'diverse', c('area', 'majority.response','Freq'))) all.estimates.biases$Freq <- round(all.estimates.biases$Freq) full <- glm(Freq~majority.response+area+majority.response:area, all.estimates.biases, family='poisson') indep <- glm(Freq~majority.response+area, all.estimates.biases, family='poisson') anova(full, indep, test='Chi') # Analysis of Deviance Table # # Model 1: Freq ~ majority.response + area + majority.response:area # Model 2: Freq ~ majority.response + area # Resid. Df Resid. Dev Df Deviance Pr(>Chi) # 1 0 0.000 # 2 4 11.992 -4 -11.992 0.01741 * # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 no.bias <- glm(Freq~area, all.estimates.biases, family='poisson') anova(no.bias, full, test='Chi') # Analysis of Deviance Table # # Model 1: Freq ~ area # Model 2: Freq ~ majority.response + area + majority.response:area # Resid. Df Resid. Dev Df Deviance Pr(>Chi) # 1 5 125.39 # 2 0 0.00 5 125.39 < 2.2e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 apply(xtabs(Freq~.,all.estimates.biases), 1 , binom.test, alternative='gr') # $Africa # # Exact binomial test # # data: newX[, i] # number of successes = 37, number of trials = 37, p-value = 7.276e-12 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.9222253 1.0000000 # sample estimates: # probability of success # 1 # # # $Eurasia # # Exact binomial test # # data: newX[, i] # number of successes = 25, number of trials = 27, p-value = 2.824e-06 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.7847 1.0000 # sample estimates: # probability of success # 0.9259259 # # # $`N/C America` # # Exact binomial test # # data: newX[, i] # number of successes = 31, number of trials = 32, p-value = 7.683e-09 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.8601505 1.0000000 # sample estimates: # probability of success # 0.96875 # # # $Pacific # # Exact binomial test # # data: newX[, i] # number of successes = 20, number of trials = 25, p-value = 0.002039 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.6245949 1.0000000 # sample estimates: # probability of success # 0.8 # # # $`S America` # # Exact binomial test # # data: newX[, i] # number of successes = 10, number of trials = 12, p-value = 0.01929 # alternative hypothesis: true probability of success is greater than 0.5 # 95 percent confidence interval: # 0.5618946 1.0000000 # sample estimates: # probability of success # 0.8333333 # tabulate and plot results all.estimates$area.ordered <- factor(all.estimates$area, levels=c('Africa','Eurasia','Pacific','S America', 'N/C America'), ordered=T, labels=c('Africa','Eurasia','Pacific','South\nAmerica', 'Rest of\nthe Americas')) all.estimates$majority.response.ordered <- factor(all.estimates$majority.response, levels=c('diverse','E','A'), ordered=T, labels=c("no detectable bias\nin language change", "bias for\nergatives", "bias against\nergatives")) # all.estimates[order(all.estimates$area.ordered),1:10] # area majority.response autotyp.binom.and.mcmc autotyp.binom.and.mcmc.SE autotyp.ml # 1 Africa diverse 4.1690 0.010991155 1.2644 # 2 Africa A 34.8310 0.010991155 28.7356 # 3 Africa E 0.0000 0.000000000 0.0000 # 4 Eurasia diverse 1.2929 0.022037639 8.4936 # 5 Eurasia A 28.7071 0.020946781 24.1191 # 6 Eurasia E 0.0000 0.011271666 3.3873 # 10 Pacific diverse 8.4984 0.029175254 30.2091 # 11 Pacific A 24.1061 0.025163646 14.0153 # 12 Pacific E 3.3955 0.017229440 3.7756 # 13 S America diverse 12.0036 0.022464877 12.0090 # 14 S America A 9.9870 0.019976024 9.9840 # 15 S America E 2.0094 0.009987043 2.0070 # 7 N/C America diverse 24.0279 0.019122749 4.1184 # 8 N/C America A 18.7633 0.019122749 34.8816 # 9 N/C America E 5.2088 0.000000000 0.0000 # autotyp.ml.SE glottolog.binom.and.mcmc glottolog.binom.and.mcmc.SE glottolog.ml glottolog.ml.SE # 1 0.01081461 3.2727 0.01729936 3.2742 0.01720914 # 2 0.01081461 41.7273 0.01729936 41.7258 0.01720914 # 3 0.00000000 0.0000 0.00000000 0.0000 0.00000000 # 4 0.02188221 7.3028 0.02012292 7.3204 0.02027798 # 5 0.02098460 23.6868 0.01958010 23.6662 0.01977560 # 6 0.01113762 3.0104 0.01079942 3.0134 0.01086051 # 10 0.02855904 14.9984 0.02770855 14.9785 0.02776240 # 11 0.02372212 21.1418 0.02563274 21.2348 0.02527117 # 12 0.01569983 5.8598 0.01762736 5.7867 0.01733249 # 13 0.02217605 12.4704 0.02287692 12.4801 0.02301172 # 14 0.01994199 10.0275 0.01994584 10.0035 0.02003021 # 15 0.01000578 2.5021 0.01132249 2.5164 0.01120495 # 7 0.01906239 4.2655 0.01918124 4.2314 0.01902630 # 8 0.01906239 35.7345 0.01918124 35.7686 0.01902630 # 9 0.00000000 0.0000 0.00000000 0.0000 0.00000000 apply(all.estimates[,c(4,6,8,10)],2, range) # autotyp.binom.and.mcmc.SE autotyp.ml.SE glottolog.binom.and.mcmc.SE glottolog.ml.SE # [1,] 0.00000000 0.00000000 0.00000000 0.0000000 # [2,] 0.02917525 0.02855904 0.02770855 0.0277624 mosaic(Freq~area.ordered+majority.response.ordered, all.estimates, split_vertical=c(T,F), boxes=c(F,F), labeling_args=list(gp_labels=gpar(cex=1, lineheight=.8, fontfamily="Helvetica", fontsize=20), varnames=F), gp=gpar(fill=c('grey','white','black')), rot_labels=c(0,0), just_labels=c("center", "left"), offset_labels=c(0,-.7,.5,0),margins=c(1,13,3,3), tl_labels=F, keep_aspect_ratio=F, spacing=spacing_increase(start = unit(0.1, "lines"),rate=4))