########################################################
# Script generating the statistics reported "The neurophysiology of language processing shapes the evolution of grammar: evidence from case-marking" 
# Written by Balthasar Bickel [2014-07-16]
# Results are pasted in as comments
########################################################

########################################################
# Libraries, helpers etc.
install.packages('devtools') 
library(devtools)
install_github("IVS-UZH/familybias", ref='793313ebe5441c80b1003bb9046d9a2c08c1bb80') # family bias estimates and extrapolations to small families
library(familybias) 
library(parallel) # multicore processing (change mclapply below to lappy if not multicore processing is not required)
library(ape) # used for tree manipulation
library(geiger) # used for ML estimates on character evolution in trees
library(vcd) # for plotting of results
# function for creating pyhlo objects based on flat taxonomies:
source_url('https://github.com/IVS-UZH/phylo-convert/raw/bf572f1e75aeff4b873793ec5d395e8e273d2f4f/phylo-convert.R')
# small helper function for reading output from BayesTraits (last lines in each file)
lastline <- function(filename) {
  out <- system(sprintf("wc -l %s",filename),intern=TRUE)
  n <- as.integer(sub(sprintf("[ ]*([0-9]+)[ ]%s",filename),"\\1",out))
  scan(filename,what="",skip=n-1,nlines=1,sep="\n",quiet=TRUE)
}

########################################################
#  Data  (without BibTeX keys)
case.alignment.data.gg <- read.csv('bickeletal_S1_database.csv', header=T)[,-36]
# some helpers for easy reference
glottolog.taxa <- paste('Level',18:1, sep='') 
autotyp.taxa <- c("stock", "mbranch", "sbranch", "ssbranch", "lsbranch", "language")
# summary counts of data
length(unique(case.alignment.data.gg$LID))
#[1] 617
nrow(case.alignment.data.gg)
#[1] 705
length(unique(case.alignment.data.gg$stock))
#[1] 156
length(unique(case.alignment.data.gg$Level18))
#[1] 144

########################################################
#  Estimates of biases within families
#  NOTE: results of estimations will always be a slightly different from each other because of random sampling
#-------------------------------------------------------
# 1. With Autotyp taxonomy: 
#-------------------------------------------------------
# 1.1. with binomial tests (the default in familybias)
#-------------------------------------------------------
case.alignment.autotyp.fam <- familybias(df = case.alignment.data.gg, family.names = c(autotyp.taxa, 'subsystems'), r.name = 'acc', p.names = 'area', B = 10000, lapplyfunc = mclapply)
# means of extrapolations:
case.alignment.autotyp.fam.mean <- mean(case.alignment.autotyp.fam)
# standard errors across extrapolations, saving them for later tabulation 
case.alignment.autotyp.fam.freqs <- lapply(case.alignment.autotyp.fam$extrapolations,function(x) {as.data.frame(xtabs(~area+majority.response,x))})
case.alignment.autotyp.fam.freqs.df <- do.call(cbind, lapply(case.alignment.autotyp.fam.freqs, "[", 3))
case.alignment.autotyp.fam.se <- data.frame(case.alignment.autotyp.fam.freqs[[1]][,1:2],SE=apply(case.alignment.autotyp.fam.freqs.df[,-c(1:2)],1,function(x) sd(x)/sqrt(length(x))))
case.alignment.autotyp.fam.mean.biases <- droplevels(subset(case.alignment.autotyp.fam.mean, !majority.response %in% 'diverse'))
case.alignment.autotyp.fam.mean.biases$Freq <- round(case.alignment.autotyp.fam.mean.biases$Freq)
# Loglinear Analysis:
full <- glm(Freq~majority.response+area+majority.response:area, case.alignment.autotyp.fam.mean.biases, family='poisson')
indep <- glm(Freq~majority.response+area, case.alignment.autotyp.fam.mean.biases, family='poisson')
anova(full, indep, test='Chi')
# Analysis of Deviance Table
#
# Model 1: Freq ~ majority.response + area + majority.response:area
# Model 2: Freq ~ majority.response + area
#   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
# 1         0      0.000
# 2         4     15.809 -4  -15.809 0.003286 **
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
area.only <- glm(Freq~area, case.alignment.autotyp.fam.mean.biases, family='poisson')
anova(area.only, full, test='Chi')
# Analysis of Deviance Table
#
# Model 1: Freq ~ area
# Model 2: Freq ~ majority.response + area + majority.response:area
#   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)
# 1         5     121.84
# 2         0       0.00  5   121.84 < 2.2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Factorial analysis, resolving the interaction by area:

apply(xtabs(Freq~.,case.alignment.autotyp.fam.mean.biases), 1 , binom.test, alternative='gr') 

# $Africa
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 29, number of trials = 29, p-value = 1.863e-09
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.9018554 1.0000000
# sample estimates:
# probability of success
#                      1
#
#
# $Eurasia
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 24, number of trials = 27, p-value = 2.462e-05
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.7372612 1.0000000
# sample estimates:
# probability of success
#              0.8888889
#
#
# $`N/C America`
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 35, number of trials = 35, p-value = 2.91e-11
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.9179684 1.0000000
# sample estimates:
# probability of success
#                      1
#
#
# $Pacific
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 19, number of trials = 24, p-value = 0.003305
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.6108607 1.0000000
# sample estimates:
# probability of success
#              0.7916667
#
#
# $`S America`
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 10, number of trials = 12, p-value = 0.01929
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.5618946 1.0000000
# sample estimates:
# probability of success
#              0.8333333


#-------------------------------------------------------
# 1.2. estimating biases in large non-uniform families using ML as implemented in geiger::FitDiscrete and BayesTraits and using MCMC as implemented in BayesTraits rather than using a binomial test as implemented in familybias
#-------------------------------------------------------
# first, extract relevant families:
case.alignment.autotyp.fam.big <- droplevels(subset(case.alignment.autotyp.fam$large.families.estimate[order(case.alignment.autotyp.fam$large.families.estimate$family.name), ], majority.prop<1 | is.na(majority.prop)))
# NOTE familybias labels Finisterre and Binanderean as mbranches because there no other mbranches in their stocks, but they are the full data that we have for the larger stocks 'Finisterre-Huon' and 'Greater Binanderean' in our dataset.
stocks <- c(as.character(subset(case.alignment.autotyp.fam$large.families.estimate[order(case.alignment.autotyp.fam$large.families.estimate$family.name), ], (majority.prop<1 | is.na(majority.prop)) & taxonomic.level %in% 'stock')$family.name), 'Finisterre-Huon','Greater Binanderean')
# prepare trees, adding Oceanic which is not extracted by calling stocks above because it is an sbranch, not a stock
big.case.alignment.autotyp.phylo <- as.phylo(case.alignment.data.gg[case.alignment.data.gg$stock %in% stocks | case.alignment.data.gg$sbranch %in% 'Oceanic', c(autotyp.taxa, 'subsystems')])
# Note that this conversion labels Oceanic as "Austronesian" because phylo keeps the entire node hierarchy in store. But the data is really just the subtree: plot(collapse.singles(big.case.alignment.autotyp.phylo[["Austronesian"]]))
# 
# now prepare data:
big.case.alignment.autotyp.data <- case.alignment.data.gg[case.alignment.data.gg$stock %in% stocks | case.alignment.data.gg$sbranch %in% 'Oceanic', c('subsystems', 'acc')]
# get ML estimates with fitDiscrete 
big.case.autotyp.ml <- do.call(rbind, mclapply(big.case.alignment.autotyp.phylo, function(x) {
current.tree <- multi2di(collapse.singles(x)) # clean up for fitDiscrete
current.data <- sapply(current.tree$tip.label, function(i) big.case.alignment.autotyp.data$acc[big.case.alignment.autotyp.data$subsystems==i])
current.ard <- fitDiscrete(current.tree, current.data, model='ARD')$opt
current.er <- fitDiscrete(current.tree, current.data, model='ER')$opt
lr <- 2*(current.ard$lnL-current.er$lnL)
p <- 1-pchisq(2*(current.ard$lnL-current.er$lnL),1)
data.frame(ARD.qAE=current.ard$q12, ARD.qEA=current.ard$q21, ARD.lnL=current.ard$lnL, ER.q=current.er$q12, ER.lnL=current.er$lnL, lr.ml=lr, lr.ml.p=p)
}))

# write out the data and the trees for analysis by BayesTraits
x <- lapply(big.case.alignment.autotyp.phylo, function(x) {
fam <- gsub('\\s','',x$node.label[1])
current.tree <- collapse.singles(x)
current.tree$node.label <- NULL # BayesTraits doesn't like node labels
current.tree$tip.label <- gsub('\\s','',current.tree$tip.label)
big.case.alignment.autotyp.data$subsystems2 <- gsub('\\s','',big.case.alignment.autotyp.data$subsystems)
current.data <- sapply(current.tree$tip.label, function(i) big.case.alignment.autotyp.data$acc[big.case.alignment.autotyp.data$subsystems2==i])
current.data.df <-data.frame(names(current.data), current.data)
write.nexus(current.tree, file=paste(fam, ".tree", sep = ""))
write.table(current.data.df, file=paste(fam, ".data", sep = ""), row.names = F, quote = F, col.names = F, sep = "\t")
})
# run as ./BayesTraits Austronesian.tree Austronesian.data < commands.ER.txt Austronesian.ER.txt etc as per the BayesTraits manual specs
#
# Read in results from BayesTraits ML:  
big.case.autotyp.ms.er.ml <- as.data.frame(do.call(rbind, sapply(dir(pattern='^[[:upper:]].*ER.log.txt'),function(l) {
x <- lastline(l)
rows <- strsplit(x, "\t")
}
)))
names(big.case.autotyp.ms.er.ml) <- c("TreeNo", "Lh", "qAE", "qEA", "RootP(A)", "RootP(E)")
 
big.case.autotyp.ms.ard.ml <- as.data.frame(do.call(rbind, sapply(dir(pattern='^[[:upper:]].*ARD.log.txt'),function(l) {
x <- lastline(l)
rows <- strsplit(x, "\t")
}
)))
names(big.case.autotyp.ms.ard.ml) <- c("TreeNo", "Lh", "qAE", "qEA", "RootP(A)", "RootP(E)")
 
# collect results in one dataframe for later tabulation 
big.case.autotyp.ms.ml <- data.frame(fam=gsub('\\.ER\\.txt','',rownames(big.case.autotyp.ms.er.ml)), lr.ml.ms=2*(as.numeric(as.character(big.case.autotyp.ms.ard.ml$Lh))-as.numeric(as.character(big.case.autotyp.ms.er.ml$Lh))))
big.case.autotyp.ms.ml$lr.ml.ms.p <- round(1-pchisq(big.case.autotyp.ms.ml$lr.ml.ms,1),5)

# from BayesTraits MCMC:
big.case.autotyp.er.mcmc <- as.data.frame(do.call(rbind, sapply(dir(pattern='^[[:upper:]].*ER.MCMC.log.txt'),function(l) {
x <- secondlastline(l)
rows <- strsplit(x, "\t")
}
)))
names(big.case.autotyp.er.mcmc) <- c("Iteration", "Lh", "HarmonicMean", "TreeNo", "qAE", "qEA", "RootP(A)", "RootP(E)")
big.case.autotyp.ard.mcmc <- as.data.frame(do.call(rbind, sapply(dir(pattern='^[[:upper:]].*ARD.MCMC.log.txt'),function(l) {
x <- secondlastline(l)
rows <- strsplit(x, "\t")
}
)))
names(big.case.autotyp.ard.mcmc) <- c("Iteration", "Lh", "HarmonicMean", "TreeNo", "qAE", "qEA", "RootP(A)", "RootP(E)")
big.case.autotyp.mcmc <- data.frame(fam=gsub('\\.ARD\\.MCMC\\.txt','',rownames(big.case.autotyp.ard.mcmc)), lr.mcmc=2*(as.numeric(as.character(big.case.autotyp.ard.mcmc$HarmonicMean))-as.numeric(as.character(big.case.autotyp.er.mcmc$HarmonicMean))))

# Summary table, for comparison of results:
big.case.autotyp.summary <- data.frame(case.alignment.autotyp.fam.big, binom.p=round(sapply(case.alignment.autotyp.fam.big.p, function(x) x$p.value),5), apply(big.case.autotyp.ml,2,round,5)[,6:7], big.case.autotyp.ms.ml[,2:3],lr.mcmc=big.case.autotyp.mcmc[,2])

# From mcmc-lex-based trees for Austronesian and Indo-European (from Dunn et al. 2011)
# Note for this we have to treat polymorphisms as uncertainties because trees are based on whole languages because 
# Matching between language names was done by hand as there were no ISO oder Glottolog Codes and sampling was very different.
# Austronesian languages with data in both trees and our database (names from the trees, data from database):
# language		acc
# ArakiSouthwestSanto	A
# Bali			A
# FutunaEast		E
# FutunaWest		A
# Hawaiian		A
# Ilokano		A
# Javanese		A
# Lamaholot		A
# Madurese		A
# Maori			A
# Mwotlap		A
# Nalik			A
# Nias			E
# Seimat		A
# Siar			A
#
# Indo-European languages with data in both trees and our databases (names from the trees, plus ID from the tree):
# language 	acc	ID
# Albanian_C	A	3
# Armenian_Mod	A	17
# Bulgarian	A	42
# Catalan	A	77
# Cornish	A	92
# English_ST	A	59
# French	A	85
# German_ST	A	75
# Greek_D	A	20
# Hindi		AE	31
# Hittite	AE	0
# Italian	A	88
# Marathi	AE	37
# Ossetic	A	15
# Persian_List	A	12
# Provencal	A	86
# Russian	A	44
# Sardinian_C	A	81
# Serbocroatian	A	41
# Spanish	A	80
# Tadzik	A	11
# 

# final table with estimates from all tree-based methods, adding the results from the MCMC analysis on the lexical trees by hand:
big.case.autotyp.summary$lex.mcmc <- NA
big.case.autotyp.summary$lex.mcmc[big.case.autotyp.summary$family.name %in% 'Oceanic'] <- 5.262682
big.case.autotyp.summary$lex.mcmc[big.case.autotyp.summary$family.name %in% 'Indo-European'] <- 6.495412

# Now, we re-run extrapolations, based on the tree-based estimates. We do this by feeeding overrides into the extrapolation algorithm: 
# a template for this is
bias.override.autotyp.template <- apply(case.alignment.autotyp.fam.big, 1,  function(r) {
mj.p <- as.numeric("["(r,'majority.prop'))
names(mj.p) <- NULL
bias <- "["(r,'majority.response')
bias <- ifelse(bias %in% 'diverse', NA, bias)
names(bias) <- NULL
list(mj.prop=mj.p, bias.val=bias)})

# fill the template with the appropriate values. See big.case.autotyp.summary or Table S2. 
names(bias.override.autotyp.template) <- case.alignment.autotyp.fam.big$family.name
bias.override.autotyp.ml.p <- bias.override.autotyp.template
bias.override.autotyp.ml.p[['Oceanic']]$mj.prop <- NA
bias.override.autotyp.ml.p[['Oceanic']]$bias.val <- NA
bias.override.autotyp.ml.p[['Madang']]$mj.prop <- NA
bias.override.autotyp.ml.p[['Madang']]$bias.val <- NA

case.alignment.autotyp.fam.ml.p <- familybias(df = case.alignment.data.gg, family.names = c(autotyp.taxa, 'subsystems'), r.name = 'acc', p.names = 'area', B = 10000, lapplyfunc = mclapply, bias.override=bias.override.autotyp.ml.p)

# case.alignment.autotyp.fam.ml.p$prior
#          area    p.bias p.bias.95.cred  p.deviate
# 1      Africa 0.9285714        0.75, 1 0.00000000
# 2     Eurasia 0.7500000     0.48, 0.94 0.02871621
# 3 N/C America 0.8750000        0.59, 1 0.00000000
# 4     Pacific 0.3750000     0.16, 0.62 0.00000000
# 5   S America 0.5000000     0.15, 0.85 0.00000000

# mean across extrapolations
case.alignment.autotyp.fam.ml.p.mean <- mean(case.alignment.autotyp.fam.ml.p)

# std. error across extrapolations for later tabulation
case.alignment.autotyp.fam.ml.p.freqs <- lapply(case.alignment.autotyp.fam.ml.p$extrapolations,function(x) {as.data.frame(xtabs(~area+majority.response,x))})
case.alignment.autotyp.fam.ml.p.freqs.df <- do.call(cbind, lapply(case.alignment.autotyp.fam.ml.p.freqs, "[", 3))
case.alignment.autotyp.fam.ml.p.se <- data.frame(case.alignment.autotyp.fam.ml.p.freqs[[1]][,1:2],SE=apply(case.alignment.autotyp.fam.ml.p.freqs.df[,-c(1:2)],1,function(x) sd(x)/sqrt(length(x))))

# loglinear analysis
case.alignment.autotyp.fam.ml.p.mean.biases <- droplevels(subset(case.alignment.autotyp.fam.ml.p.mean, !majority.response %in% 'diverse'))
case.alignment.autotyp.fam.ml.p.mean.biases$Freq <- round(case.alignment.autotyp.fam.ml.p.mean.biases$Freq)
full <- glm(Freq~majority.response+area+majority.response:area, case.alignment.autotyp.fam.ml.p.mean.biases, family='poisson')
indep <- glm(Freq~majority.response+area, case.alignment.autotyp.fam.ml.p.mean.biases, family='poisson')
anova(full, indep, test='Chi')
# Analysis of Deviance Table
#
# Model 1: Freq ~ majority.response + area + majority.response:area
# Model 2: Freq ~ majority.response + area
#   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
# 1         0      0.000
# 2         4     15.368 -4  -15.368 0.003996 **
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
area.only <- glm(Freq~area, case.alignment.autotyp.fam.ml.p.mean.biases, family='poisson')
anova(area.only, full, test='Chi')
# Analysis of Deviance Table
#
# Model 1: Freq ~ area
# Model 2: Freq ~ majority.response + area + majority.response:area
#   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)
# 1         5     119.02
# 2         0       0.00  5   119.02 < 2.2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Factorial analysis, resolving the interaction by area:

apply(xtabs(Freq~.,case.alignment.autotyp.fam.ml.p.mean.biases), 1 , binom.test, alternative='gr') 
# $Africa
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 29, number of trials = 29, p-value = 1.863e-09
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.9018554 1.0000000
# sample estimates:
# probability of success
#                      1
#
#
# $Eurasia
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 24, number of trials = 27, p-value = 2.462e-05
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.7372612 1.0000000
# sample estimates:
# probability of success
#              0.8888889
#
#
# $`N/C America`
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 35, number of trials = 35, p-value = 2.91e-11
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.9179684 1.0000000
# sample estimates:
# probability of success
#                      1
#
#
# $Pacific
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 14, number of trials = 18, p-value = 0.01544
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.5611172 1.0000000
# sample estimates:
# probability of success
#              0.7777778
#
#
# $`S America`
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 10, number of trials = 12, p-value = 0.01929
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.5618946 1.0000000
# sample estimates:
# probability of success
#              0.8333333


########################################################
#-------------------------------------------------------
# 2. With Glottolog taxonomy: 
#-------------------------------------------------------
# 2.1. with binomial tests (the default in familybias)
#-------------------------------------------------------
case.alignment.glottolog.fam <- familybias(df = case.alignment.data.gg, family.names = c(glottolog.taxa, 'subsystems'), r.name = 'acc', p.names = 'area', B = 10000, lapplyfunc = mclapply)
case.alignment.glottolog.fam.mean <- mean(case.alignment.glottolog.fam)
case.alignment.glottolog.fam.freqs <- lapply(case.alignment.glottolog.fam$extrapolations,function(x) {as.data.frame(xtabs(~area+majority.response,x))})
case.alignment.glottolog.fam.freqs.df <- do.call(cbind, lapply(case.alignment.glottolog.fam.freqs, "[", 3))

# summaries for later
case.alignment.glottolog.fam.se <- data.frame(case.alignment.glottolog.fam.freqs[[1]][,1:2],SE=apply(case.alignment.glottolog.fam.freqs.df[,-c(1:2)],1,function(x) sd(x)/sqrt(length(x))))
case.alignment.glottolog.fam.mean.biases <- droplevels(subset(case.alignment.glottolog.fam.mean, !majority.response %in% 'diverse'))
case.alignment.glottolog.fam.mean.biases$Freq <- round(case.alignment.glottolog.fam.mean.biases$Freq)

# loglinear analysis
full <- glm(Freq~majority.response+area+majority.response:area, case.alignment.glottolog.fam.mean.biases, family='poisson')
indep <- glm(Freq~majority.response+area, case.alignment.glottolog.fam.mean.biases, family='poisson')
anova(full, indep, test='Chi')
# Analysis of Deviance Table
#
# Model 1: Freq ~ majority.response + area + majority.response:area
# Model 2: Freq ~ majority.response + area
#   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)
# 1         0      0.000
# 2         4     21.296 -4  -21.296 0.0002766 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
area.only <- glm(Freq~area, case.alignment.glottolog.fam.mean.biases, family='poisson')
anova(area.only, full, test='Chi')
# Analysis of Deviance Table
#
# Model 1: Freq ~ area
# Model 2: Freq ~ majority.response + area + majority.response:area
#   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)
# 1         5     139.53
# 2         0       0.00  5   139.53 < 2.2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Factorial analysis, resolving the interaction by areas:
apply(xtabs(Freq~.,case.alignment.glottolog.fam.mean.biases), 1 , binom.test, alternative='gr') 
# $Eurasia
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 24, number of trials = 27, p-value = 2.462e-05
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.7372612 1.0000000
# sample estimates:
# probability of success
#              0.8888889
#
#
# $Pacific
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 21, number of trials = 27, p-value = 0.002962
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.6079016 1.0000000
# sample estimates:
# probability of success
#              0.7777778
#
#
# $Africa
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 42, number of trials = 42, p-value = 2.274e-13
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.9311574 1.0000000
# sample estimates:
# probability of success
#                      1
#
#
# $`N/C America`
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 36, number of trials = 36, p-value = 1.455e-11
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.9201535 1.0000000
# sample estimates:
# probability of success
#                      1
#
#
# $`S America`
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 10, number of trials = 13, p-value = 0.04614
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.5053503 1.0000000
# sample estimates:
# probability of success
#              0.7692308


#-------------------------------------------------------
# 1.2. estimating biases in large non-uniform families using ML as implemented in geiger::FitDiscrete and BayesTraits and using MCMC as implemented in BayesTraits rather than using a binomial test as implemented in familybias
#-------------------------------------------------------
# same procedure as above, but now with glottolog taxonomy
case.alignment.glottolog.fam.big <- droplevels(subset(case.alignment.glottolog.fam$large.families.estimate[order(case.alignment.glottolog.fam$large.families.estimate$family.name), ], majority.prop<1 | is.na(majority.prop)))

# fix taxonomic levels
level18.glot <- subset(case.alignment.glottolog.fam.big, taxonomic.level %in% 'Level18')$family.name
# Prepare trees, adding Central-Eastern Malayo-Polynesian:
big.case.alignment.glottolog.phylo <- as.phylo(case.alignment.data.gg[case.alignment.data.gg$Level15 %in% "'Central-Eastern Malayo-Polynesian [cent2237]'"| case.alignment.data.gg$Level18 %in% level18.glot, c(glottolog.taxa, 'subsystems')])
# Note that this conversion gets Central-Eastern Malayo-Polynesian with name "Austronesian" because phylo keeps the entire node hierarchy in store. But the data is really just the subtree: plot(collapse.singles(big.case.alignment.glottolog.phylo[[10]]))

# prepare data:
big.case.alignment.glottolog.data <- case.alignment.data.gg[case.alignment.data.gg$Level15 %in% "'Central-Eastern Malayo-Polynesian [cent2237]'"| case.alignment.data.gg$Level18 %in% level18.glot, c('subsystems', 'acc')]

# ML estimates using FitDiscrete:
big.case.glottolog.ml <- do.call(rbind, mclapply(big.case.alignment.glottolog.phylo, function(x) {
current.tree <- multi2di(collapse.singles(x)) # clean up for fitDiscrete
current.data <- sapply(current.tree$tip.label, function(i) big.case.alignment.glottolog.data$acc[big.case.alignment.glottolog.data$subsystems==i])
current.ard <- fitDiscrete(current.tree, current.data, model='ARD')$opt
current.er <- fitDiscrete(current.tree, current.data, model='ER')$opt
lr <- 2*(current.ard$lnL-current.er$lnL)
p <- 1-pchisq(2*(current.ard$lnL-current.er$lnL),1)
data.frame(ARD.qAE=current.ard$q12, ARD.qEA=current.ard$q21, ARD.lnL=current.ard$lnL, ER.q=current.er$q12, ER.lnL=current.er$lnL, lr.ml=lr, lr.ml.p=p)
}))

# export for BayesTraits
x <- mclapply(big.case.alignment.glottolog.phylo, function(x) {
fam <- gsub("\\'|\\s","",x$node.label[1]) # BayesTraits doesn't like quotation marks or spaces in names and the brackets are a nuisance as well.
fam <- gsub("\\[|\\]",".",fam)
current.tree <- collapse.singles(x)
current.tree$node.label <- NULL # BayesTraits doesn't like node labels
current.tree$tip.label <- gsub('\\s','',current.tree$tip.label)
big.case.alignment.glottolog.data$subsystems.clean <- gsub('\\s','',big.case.alignment.glottolog.data$subsystems)
current.data <- sapply(current.tree$tip.label, function(i) big.case.alignment.glottolog.data$acc[big.case.alignment.glottolog.data$subsystems.clean==i])
current.data.df <-data.frame(names(current.data), current.data)
write.nexus(current.tree, file=paste(fam, "tree", sep = ""))
write.table(current.data.df, file=paste(fam, "data", sep = ""), row.names = F, quote = F, col.names = F, sep = "\t")
})
# read results  
big.case.glottolog.ms.er.ml <- as.data.frame(do.call(rbind, sapply(dir(pattern='.*ER.log.txt'),function(l) {
x <- lastline(l)
rows <- strsplit(x, "\t")
}
)))
names(big.case.glottolog.ms.er.ml) <- c("TreeNo", "Lh", "qAE", "qEA", "RootP(A)", "RootP(E)")
big.case.glottolog.ms.ard.ml <- as.data.frame(do.call(rbind, sapply(dir(pattern='.*ARD.log.txt'),function(l) {
x <- lastline(l)
rows <- strsplit(x, "\t")
}
)))
names(big.case.glottolog.ms.ard.ml) <- c("TreeNo", "Lh", "qAE", "qEA", "RootP(A)", "RootP(E)")
big.case.glottolog.ms.ml <- data.frame(fam=gsub('\\.ER\\.out\\.txt','',rownames(big.case.glottolog.ms.er.ml)), lr.ml.ms=2*(as.numeric(as.character(big.case.glottolog.ms.ard.ml$Lh))-as.numeric(as.character(big.case.glottolog.ms.er.ml$Lh))))
big.case.glottolog.ms.ml$lr.ml.ms.p <- round(1-pchisq(big.case.glottolog.ms.ml$lr.ml.ms,1),5)
# read results from from BayesTraits MCMC:
big.case.glottolog.er.mcmc <- as.data.frame(do.call(rbind, sapply(dir(pattern='.*ER.MCMC.log.txt'),function(l) {
	x <- secondlastline(l)
	rows <- strsplit(x, "\t")
	}
	)))
names(big.case.glottolog.er.mcmc) <- c("Iteration", "Lh", "HarmonicMean", "TreeNo", "qAE", "qEA", "RootP(A)", "RootP(E)")
big.case.glottolog.ard.mcmc <- as.data.frame(do.call(rbind, sapply(dir(pattern='.*ARD.MCMC.log.txt'),function(l) {
	x <- secondlastline(l)
	rows <- strsplit(x, "\t")
	}
	)))
names(big.case.glottolog.ard.mcmc) <- c("Iteration", "Lh", "HarmonicMean", "TreeNo", "qAE", "qEA", "RootP(A)", "RootP(E)")
  
# collect in a table:  
big.case.glottolog.mcmc <- data.frame(fam=gsub('\\.ARD\\.MCMC\\.out.txt','',rownames(big.case.glottolog.ard.mcmc)), lr.mcmc=2*(as.numeric(as.character(big.case.glottolog.ard.mcmc$HarmonicMean))-as.numeric(as.character(big.case.glottolog.er.mcmc$HarmonicMean))))
# Summary table, for comparison of results:
big.case.glottolog.summary <- data.frame(case.alignment.glottolog.fam.big, binom.p=round(sapply(case.alignment.glottolog.fam.big.p, function(x) x$p.value),5), apply(big.case.glottolog.ml,2,round,5)[order(rownames(big.case.glottolog.ml)),6:7], big.case.glottolog.ms.ml[,2:3], lr.mcmc=big.case.glottolog.mcmc[,2])
# final table:
big.case.glottolog.summary$lex.mcmc <- NA
big.case.glottolog.summary$lex.mcmc[big.case.glottolog.summary$family.name %in% "'Central-Eastern Malayo-Polynesian [cent2237]'"] <- 5.262682
big.case.glottolog.summary$lex.mcmc[big.case.glottolog.summary$family.name %in% "'Indo-European [indo1319]'"] <- 6.495412

# re-run extrapolations. Again, the MCMC-based estimates agree with the binomial tests (See big.case.glottolog.summary and Table S3), so we perform override analysis only for the ML-based estimates, where Malayo-Polynesian doesn't show an effect at an alpha=.05 level
bias.override.glottolog.template <- apply(case.alignment.glottolog.fam.big, 1,  function(r) {
	mj.p <- as.numeric("["(r,'majority.prop'))
	names(mj.p) <- NULL
	bias <- "["(r,'majority.response')
	bias <- ifelse(bias %in% 'diverse', NA, bias)
	names(bias) <- NULL
	list(mj.prop=mj.p, bias.val=bias)})
names(bias.override.glottolog.template) <- case.alignment.glottolog.fam.big$family.name

bias.override.glottolog.ml.p <- bias.override.glottolog.template
bias.override.glottolog.ml.p[["'Central-Eastern Malayo-Polynesian [cent2237]'"]]$mj.prop <- NA
bias.override.glottolog.ml.p[["'Central-Eastern Malayo-Polynesian [cent2237]'"]]$mj.val <- NA

# re-run familybias with these estimates:
case.alignment.glottolog.fam.ml.p <- familybias(df = case.alignment.data.gg, family.names = c(glottolog.taxa, 'subsystems'), r.name = 'acc', p.names = 'area', B = 10000, lapplyfunc = mclapply, bias.override=bias.override.glottolog.ml.p)

# summaries
case.alignment.glottolog.fam.ml.p.mean <- mean(case.alignment.glottolog.fam.ml.p)
case.alignment.glottolog.fam.ml.p.freqs <- lapply(case.alignment.glottolog.fam.ml.p$extrapolations,function(x) {as.data.frame(xtabs(~area+majority.response,x))})
case.alignment.glottolog.fam.ml.p.freqs.df <- do.call(cbind, lapply(case.alignment.glottolog.fam.ml.p.freqs, "[", 3))
case.alignment.glottolog.fam.ml.p.se <- data.frame(case.alignment.glottolog.fam.ml.p.freqs[[1]][,1:2],SE=apply(case.alignment.glottolog.fam.ml.p.freqs.df[,-c(1:2)],1,function(x) sd(x)/sqrt(length(x))))

# loglinear analysis
case.alignment.glottolog.fam.ml.p.mean.biases <- droplevels(subset(case.alignment.glottolog.fam.ml.p.mean, !majority.response %in% 'diverse'))
case.alignment.glottolog.fam.ml.p.mean.biases$Freq <- round(case.alignment.glottolog.fam.ml.p.mean.biases$Freq)

full <- glm(Freq~majority.response+area+majority.response:area, case.alignment.glottolog.fam.ml.p.mean.biases, family='poisson')
indep <- glm(Freq~majority.response+area, case.alignment.glottolog.fam.ml.p.mean.biases, family='poisson')
anova(full, indep, test='Chi')
# Analysis of Deviance Table
#
# Model 1: Freq ~ majority.response + area + majority.response:area
# Model 2: Freq ~ majority.response + area
#   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)
# 1         0      0.000
# 2         4     21.296 -4  -21.296 0.0002766 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
area.only <- glm(Freq~area, case.alignment.glottolog.fam.ml.p.mean.biases, family='poisson')
anova(area.only, full, test='Chi')
# Analysis of Deviance Table
#
# Model 1: Freq ~ area
# Model 2: Freq ~ majority.response + area + majority.response:area
#   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)
# 1         5     139.53
# 2         0       0.00  5   139.53 < 2.2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Factorial analysis, resolving interaction by areas:
apply(xtabs(Freq~.,case.alignment.glottolog.fam.ml.p.mean.biases), 1 , binom.test, alternative='gr')
# $Eurasia
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 24, number of trials = 27, p-value = 2.462e-05
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.7372612 1.0000000
# sample estimates:
# probability of success
#              0.8888889
#
#
# $Pacific
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 21, number of trials = 27, p-value = 0.002962
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.6079016 1.0000000
# sample estimates:
# probability of success
#              0.7777778
#
#
# $Africa
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 42, number of trials = 42, p-value = 2.274e-13
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.9311574 1.0000000
# sample estimates:
# probability of success
#                      1
#
#
# $`N/C America`
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 36, number of trials = 36, p-value = 1.455e-11
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.9201535 1.0000000
# sample estimates:
# probability of success
#                      1
#
#
# $`S America`
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 10, number of trials = 13, p-value = 0.04614
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.5053503 1.0000000
# sample estimates:
# probability of success
#              0.7692308


# 
################################
# Grand summaries, tables and plot
# Much here is just about proper ordering of results

# Large family estimates
area.order <- c('Africa','Eurasia','Pacific','N/C America','S America')
at <- case.alignment.autotyp.fam$prior[order(factor(case.alignment.autotyp.fam$prior$area, levels=area.order)),]
at.ml <- case.alignment.autotyp.fam.ml.p$prior[order(factor(case.alignment.autotyp.fam.ml.p$prior$area, levels=area.order)),]
gl <- case.alignment.glottolog.fam$prior[order(factor(case.alignment.glottolog.fam$prior$area, levels=area.order)),]
gl.ml <- case.alignment.glottolog.fam.ml.p$prior[order(factor(case.alignment.glottolog.fam.ml.p$prior$area, levels=area.order)),]
all.priors <- data.frame(area=at$area, autotyp.bias=gsub('c', ' ', paste(round(at$p.bias,2),at$p.bias.95.cred, sep='')), autotyp.ml.bias=gsub('c', ' ', paste(round(at.ml$p.bias,2),at.ml$p.bias.95.cred, sep='')), glottolog.bias=gsub('c', ' ', paste(round(gl$p.bias,2),gl$p.bias.95.cred, sep='')), glottolog.ml.bias=gsub('c', ' ', paste(round(gl.ml$p.bias,2),gl.ml$p.bias.95.cred, sep='')))

# deviation probabilities:
all.dev <- data.frame(area=at$area, autotyp=at$p.deviate, autotyp.ml=at.ml$p.deviate, glottolog=gl$p.deviate, glottolog.ml=gl.ml$p.deviate)

# Analysis of the mean results across methods and taxonomic assumptions:
all.estimates <- data.frame(case.alignment.autotyp.fam.mean[order(factor(case.alignment.autotyp.fam.mean$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America'))),1:2],
	 autotyp.binom.and.mcmc=case.alignment.autotyp.fam.mean$Freq[order(factor(case.alignment.glottolog.fam.mean$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')))],
	 autotyp.binom.and.mcmc.SE=case.alignment.autotyp.fam.se$SE[order(factor(case.alignment.autotyp.fam.se$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')),factor(case.alignment.autotyp.fam.se$majority.response, levels=c('diverse','A','E')))],
	 autotyp.ml=case.alignment.autotyp.fam.ml.p.mean$Freq[order(factor(case.alignment.autotyp.fam.ml.p.mean$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')))],
	 autotyp.ml.SE=case.alignment.autotyp.fam.ml.p.se$SE[order(factor(case.alignment.autotyp.fam.ml.p.se$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')),factor(case.alignment.autotyp.fam.ml.p.se$majority.response, levels=c('diverse','A','E')))],
	 glottolog.binom.and.mcmc=case.alignment.glottolog.fam.mean$Freq[order(factor(case.alignment.glottolog.fam.mean$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')))],
	 glottolog.binom.and.mcmc.SE=case.alignment.glottolog.fam.se$SE[order(factor(case.alignment.glottolog.fam.se$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')),factor(case.alignment.glottolog.fam.se$majority.response, levels=c('diverse','A','E')))],
	 glottolog.ml=case.alignment.glottolog.fam.ml.p.mean$Freq[order(factor(case.alignment.glottolog.fam.ml.p.mean$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')))],
	 glottolog.ml.SE=case.alignment.glottolog.fam.ml.p.se$SE[order(factor(case.alignment.glottolog.fam.ml.p.se$area, levels=c('Africa','Eurasia','Pacific','N/C America','S America')),factor(case.alignment.glottolog.fam.ml.p.se$majority.response, levels=c('diverse','A','E')))]
 	)

# stats on grand totals (as reported in the main text):
all.estimates$Freq <- rowMeans(all.estimates[,c('autotyp.binom.and.mcmc','autotyp.ml','glottolog.binom.and.mcmc','glottolog.ml')])
addmargins(xtabs(Freq~majority.response,all.estimates))
# majority.response
#   diverse         A         E       Sum
#  47.67018 123.21173  10.61810 181.50000

all.estimates.biases <- droplevels(subset(all.estimates, !majority.response %in% 'diverse', c('area', 'majority.response','Freq')))
all.estimates.biases$Freq <- round(all.estimates.biases$Freq)
full <- glm(Freq~majority.response+area+majority.response:area, all.estimates.biases, family='poisson')
indep <- glm(Freq~majority.response+area, all.estimates.biases, family='poisson')
anova(full, indep, test='Chi')
# Analysis of Deviance Table
#
# Model 1: Freq ~ majority.response + area + majority.response:area
# Model 2: Freq ~ majority.response + area
#   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
# 1         0      0.000
# 2         4     11.992 -4  -11.992  0.01741 *
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
no.bias <- glm(Freq~area, all.estimates.biases, family='poisson')
anova(no.bias, full, test='Chi')
# Analysis of Deviance Table
#
# Model 1: Freq ~ area
# Model 2: Freq ~ majority.response + area + majority.response:area
#   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)
# 1         5     125.39
# 2         0       0.00  5   125.39 < 2.2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

apply(xtabs(Freq~.,all.estimates.biases), 1 , binom.test, alternative='gr')
# $Africa
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 37, number of trials = 37, p-value = 7.276e-12
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.9222253 1.0000000
# sample estimates:
# probability of success
#                      1
#
#
# $Eurasia
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 25, number of trials = 27, p-value = 2.824e-06
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.7847 1.0000
# sample estimates:
# probability of success
#              0.9259259
#
#
# $`N/C America`
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 31, number of trials = 32, p-value = 7.683e-09
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.8601505 1.0000000
# sample estimates:
# probability of success
#                0.96875
#
#
# $Pacific
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 20, number of trials = 25, p-value = 0.002039
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.6245949 1.0000000
# sample estimates:
# probability of success
#                    0.8
#
#
# $`S America`
#
# 	Exact binomial test
#
# data:  newX[, i]
# number of successes = 10, number of trials = 12, p-value = 0.01929
# alternative hypothesis: true probability of success is greater than 0.5
# 95 percent confidence interval:
#  0.5618946 1.0000000
# sample estimates:
# probability of success
#              0.8333333


# tabulate and plot results
all.estimates$area.ordered <- factor(all.estimates$area, levels=c('Africa','Eurasia','Pacific','S America', 'N/C America'), ordered=T, labels=c('Africa','Eurasia','Pacific','South\nAmerica', 'Rest of\nthe Americas'))
all.estimates$majority.response.ordered <- factor(all.estimates$majority.response, levels=c('diverse','E','A'), ordered=T, labels=c("no detectable bias\nin language change", "bias for\nergatives", "bias against\nergatives"))

# all.estimates[order(all.estimates$area.ordered),1:10]
#           area majority.response autotyp.binom.and.mcmc autotyp.binom.and.mcmc.SE autotyp.ml
# 1       Africa           diverse                 4.1690               0.010991155     1.2644
# 2       Africa                 A                34.8310               0.010991155    28.7356
# 3       Africa                 E                 0.0000               0.000000000     0.0000
# 4      Eurasia           diverse                 1.2929               0.022037639     8.4936
# 5      Eurasia                 A                28.7071               0.020946781    24.1191
# 6      Eurasia                 E                 0.0000               0.011271666     3.3873
# 10     Pacific           diverse                 8.4984               0.029175254    30.2091
# 11     Pacific                 A                24.1061               0.025163646    14.0153
# 12     Pacific                 E                 3.3955               0.017229440     3.7756
# 13   S America           diverse                12.0036               0.022464877    12.0090
# 14   S America                 A                 9.9870               0.019976024     9.9840
# 15   S America                 E                 2.0094               0.009987043     2.0070
# 7  N/C America           diverse                24.0279               0.019122749     4.1184
# 8  N/C America                 A                18.7633               0.019122749    34.8816
# 9  N/C America                 E                 5.2088               0.000000000     0.0000
#    autotyp.ml.SE glottolog.binom.and.mcmc glottolog.binom.and.mcmc.SE glottolog.ml glottolog.ml.SE
# 1     0.01081461                   3.2727                  0.01729936       3.2742      0.01720914
# 2     0.01081461                  41.7273                  0.01729936      41.7258      0.01720914
# 3     0.00000000                   0.0000                  0.00000000       0.0000      0.00000000
# 4     0.02188221                   7.3028                  0.02012292       7.3204      0.02027798
# 5     0.02098460                  23.6868                  0.01958010      23.6662      0.01977560
# 6     0.01113762                   3.0104                  0.01079942       3.0134      0.01086051
# 10    0.02855904                  14.9984                  0.02770855      14.9785      0.02776240
# 11    0.02372212                  21.1418                  0.02563274      21.2348      0.02527117
# 12    0.01569983                   5.8598                  0.01762736       5.7867      0.01733249
# 13    0.02217605                  12.4704                  0.02287692      12.4801      0.02301172
# 14    0.01994199                  10.0275                  0.01994584      10.0035      0.02003021
# 15    0.01000578                   2.5021                  0.01132249       2.5164      0.01120495
# 7     0.01906239                   4.2655                  0.01918124       4.2314      0.01902630
# 8     0.01906239                  35.7345                  0.01918124      35.7686      0.01902630
# 9     0.00000000                   0.0000                  0.00000000       0.0000      0.00000000

apply(all.estimates[,c(4,6,8,10)],2, range)
#      autotyp.binom.and.mcmc.SE autotyp.ml.SE glottolog.binom.and.mcmc.SE glottolog.ml.SE
# [1,]                0.00000000    0.00000000                  0.00000000       0.0000000
# [2,]                0.02917525    0.02855904                  0.02770855       0.0277624

mosaic(Freq~area.ordered+majority.response.ordered, all.estimates, split_vertical=c(T,F), boxes=c(F,F), labeling_args=list(gp_labels=gpar(cex=1, lineheight=.8, fontfamily="Helvetica", fontsize=20), varnames=F), gp=gpar(fill=c('grey','white','black')), rot_labels=c(0,0), just_labels=c("center", "left"), offset_labels=c(0,-.7,.5,0),margins=c(1,13,3,3), tl_labels=F, keep_aspect_ratio=F, spacing=spacing_increase(start = unit(0.1, "lines"),rate=4))