############################################################################### # Estimate and generalize Elena Maslova's "transition probabilities" # # Copyright (C) 2008-2012 Michael Cysouw & Dan Dediu # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # ############################################################################### # The working path: working.path <- "./"; # Load the original WALS data as used in Dediu, D. & Cysouw, M. Some Structural Aspects of Language are More Stable than Others: A Comparison of Seven Methods. (please make sure you have extracted it here): source( paste( working.path, "/WALS_R/wals.r", sep="" ), chdir=TRUE); ## Please note that currently the WALS data is released in a slightly different format (see http://wals.info/export) and in order to import it you need to use instead something of the form: ## but we did not test the script with this new format and it might require some tweaking. #langs <- read.table("languages.tab",sep="\t",header=T) #mat <- read.table("datapoints.tab",sep="\t",header=T,row.names=1) # Select genera that have more than one language coded for a feature # For example, "length(getgenera(83))" gives "197" # Constrain by macroarea (if != NA) and/or family (if != NA): getgenera <- function( feature, macroarea=NA, lgfamily=NA ) { # Select those languages in the given macroarea (if any): langs <- names( na.omit( mat[,feature] ) ); if( !is.na( macroarea ) ) { langs <- langs[ lgs$macroarea[ rownames(lgs) %in% langs ] == macroarea ]; } if( !is.na( lgfamily ) ) { langs <- langs[ lgs$family[ rownames(lgs) %in% langs ] == lgfamily ]; } g <- listGenera( codes=langs ) # list all genera if( !is.na( macroarea ) & !is.na( lgfamily ) ) # both constraints { count <- sapply( g, function(x){ length( na.omit( mat[ rownames( lgs[ lgs$genus==x & lgs$macroarea == macroarea & lgs$family == lgfamily, ] ), feature ] ) ) } ) # get genera that are coded more than once } else if( !is.na( macroarea ) ) # only macroarea constraint { count <- sapply( g, function(x){ length( na.omit( mat[ rownames( lgs[lgs$genus==x & lgs$macroarea == macroarea, ] ), feature ] ) ) } ) # get genera that are coded more than once } else if( !is.na( lgfamily ) ) # only faimly constraint { count <- sapply( g, function(x){ length( na.omit( mat[ rownames( lgs[ lgs$genus==x & lgs$family == lgfamily, ] ), feature ] ) ) } ) # get genera that are coded more than once } else # no constraints at all: { count<-sapply(g,function(x){length(na.omit(mat[rownames(lgs[lgs$genus==x,]),feature]))}) # get genera that are coded more than once } return( g[ count>1 ] ) } # Get estimates for P(D) and P(A) for a set of genera g. # First, select a pair of languages from each genus, then compute P(D) and P(A). # The value can consist either of a single value (A), in which case all the others are considered as (non-A); a list of values (A) and all the others are considered (non-A); or as a list of values (A) and the non.values contains the list of non-A values, with all the others ignored [not yet implemented] getprobs <- function( g, feature, value, macroarea=NA, lgfamily=NA, non.value=NA ) { # Get a random pair of languages from the same genus, for each genus: if( !is.na( macroarea ) & !is.na( lgfamily ) ) # Both constraints: { pairs <- lapply( g, function(x){ sample( na.omit( mat[ rownames( lgs[ lgs$genus==x & lgs$macroarea==macroarea & lgs$family==lgfamily, ] ), feature ] ), 2 ) } ) } else if( !is.na( macroarea ) ) # only macorarea constraint: { pairs <- lapply( g, function(x){ sample( na.omit( mat[ rownames( lgs[ lgs$genus==x & lgs$macroarea==macroarea, ] ), feature ] ), 2 ) } ) } else if( !is.na( lgfamily ) ) # only faimly constraint: { pairs <- lapply( g, function(x){ sample( na.omit( mat[ rownames( lgs[ lgs$genus==x & lgs$family==lgfamily, ] ), feature ] ), 2 ) } ) } else # no contraints: { pairs <- lapply( g, function(x){ sample( na.omit( mat[ rownames( lgs[lgs$genus==x,]), feature ] ), 2 ) } ) } # The pA and pD probabilities: pA <- NA; pD <- NA; # Compute pD & pA for these pairs: if( is.na(non.value) ) { pD <- 1 - ( sum( as.numeric( lapply( pairs, function(x){ ( (x[1] %in% value) & (x[2] %in% value) ) | ( !(x[1] %in% value) & !(x[2] %in% value) ) } ) ) ) / length( pairs ) ); pA <- sum( as.matrix( as.data.frame( lapply( pairs, function(x){ x %in% value } ) ) ) ) / ( 2 * length( pairs ) ); } else { stop( "No yet implemented!\n" ); } return( data.frame( "pA"=pA, "pD"=pD ) ); } # Get estimates for P(D) and P(A) multiple times, and put the results in a table: getcounts <- function( feature, value, cases=50, samplesize=.5, macroarea=NA, lgfamily=NA, non.value=NA ) { result <- data.frame( "pA"=numeric(cases), "pD"=numeric(cases) ); g <- getgenera( feature, macroarea, lgfamily ); for( i in 1:cases ) { s <- sample( length(g), floor( samplesize*length(g) ) ); result[i,] <- getprobs( g[s], feature, value, macroarea, lgfamily, non.value ); } return( result ); } # Compute the transition probabilities from the output of "getcounts", ie. from the table with the values of P(D) and P(A): estimates <- function( test, no.simulations=1000, return.summaries=TRUE ) { # Regress pD an pA: l <- lm( pD ~ pA, data=test ); # The regression goodness was tested before using Pearson's r, so don't do it again here! # Get the estimated regression coefficients: they have a mean and a standard distribution: l.summary <- summary( l ); if( nrow( l.summary$coefficients ) < 2 ) { a.estimate <- NA; a.sd <- NA; b.estimate <- NA; b.sd <- NA; } else { a.estimate <- as.numeric( l.summary$coefficients[2,1] ); a.sd <- as.numeric( l.summary$coefficients[2,2] ); b.estimate <- as.numeric( l.summary$coefficients[1,1] ); b.sd <- as.numeric( l.summary$coefficients[1,2] ); } # Given the complexity of the formaulas for pAB and pBA, I cannot derive a formula for their sd, so simulate it: a.simulated <- rnorm( no.simulations, mean=a.estimate, sd=a.sd) / 2; b.simulated <- rnorm( no.simulations, mean=b.estimate, sd=b.sd) / 2; # And compute the probabilities: pAB.simulated <- (1 + a.simulated - sqrt((1 - a.simulated)^2 - 4*b.simulated))/2; pBA.simulated <- (1 - a.simulated - sqrt((1 - a.simulated)^2 - 4*b.simulated))/2; stableA.simulated <- pBA.simulated / (pAB.simulated + pBA.simulated); stability.simulated <- 1 - ((pAB.simulated + pBA.simulated)/2); # get correlations for estimates correlation <- cor.test( test[,1], test[,2] ) # Return the results: if( return.summaries == TRUE ) { return( list( pApD.cor = as.numeric(correlation$estimate), pApD.sig = correlation$p.value, pAB.mean = mean( pAB.simulated, na.rm=TRUE ), pBA.mean = mean( pBA.simulated, na.rm=TRUE ), stableA.mean = mean( stableA.simulated, na.rm=TRUE ), stability.mean = mean( stability.simulated, na.rm=TRUE ), pAB.sd = ifelse( sum( !is.na( pAB.simulated ) ) <= 1, NA, sd( pAB.simulated, na.rm=TRUE ) ), pBA.sd = ifelse( sum( !is.na( pBA.simulated ) ) <= 1, NA, sd( pBA.simulated, na.rm=TRUE ) ), stableA.sd = ifelse ( sum( !is.na( stableA.simulated ) ) <= 1, NA, sd( stableA.simulated, na.rm=TRUE ) ), stability.sd = ifelse( sum( !is.na( stability.simulated ) ) <= 1, NA, sd( stability.simulated, na.rm=TRUE ) ) ) ); } else { return ( data.frame( pAB=pAB.simulated, pBA=pBA.simulated, stableA=stableA.simulated, stability=stability.simulated ) ); } } # Compute the observed frequency of the feature's value in the given macroarea and language family: observed.freq <- function( feature, value, macroarea=NA, lgfamily=NA, non.value=NA ) { # Select those languages in the given macroarea (if any): langs <- names( na.omit( mat[ , feature ] ) ); if( !is.na( macroarea ) ) { langs <- langs[ lgs$macroarea[ rownames(lgs) %in% langs ] == macroarea ]; } if( !is.na( lgfamily ) ) { langs <- langs[ lgs$family[ rownames(lgs) %in% langs ] == lgfamily ]; } values <- na.omit( mat[ rownames(lgs) %in% langs,feature ] ); obs.freq <- sum( values %in% value ) / length( values ); obs.freq; } ##################################################################################### # # Compute the stabilities of WALS features using the above estimation techniques # ##################################################################################### # Features above 138 don't work: feature.selection <- 1:138 value.selection <- unlist ( sapply( feature.selection, function(x) { which( featValues[,"feat"] == x ) } ) ) # Get the statistics for all features, using 200 samples. The list 'results' is a list for all 653 values of the features 1 to 138: results <- list() for( i in value.selection ) { results[[i]] <- getcounts( featValues[i,1], featValues[i,2], 200 ) } # Get the estimates for all results: all.estimates <- list() for( i in value.selection ) { all.estimates[[i]] <- estimates( results[[i]] ) } # Extract the list of pAB transition probabilities: pAB <- unlist( sapply( all.estimates, function(x){ x[3] } ) ) # Get the fraction of occurence of all values per feature: to be used in computing a weighted average of the stabilities of each value. # This also nicely gets rid of the nasty problem of values that are only found in very few languages: Maslova's approach cannot estimate such groups. # However, because such groups are small, they are getting a very low weight anyway now, so we don't have to worry about their influence: proportion <- c() for( i in feature.selection ) { t <- table(mat[,i]) proportion <- c( proportion, t/sum(t) ) } # Then we define the stability of a *value* as 1-pAB (i.e. the probability that a value to *not* change). # To derive the stability of a *feature* we take the weighted average over these value-stabilities: frequent values count more than those with low frequencies: stability <- xtabs( ( (1-pAB) * proportion ) ~ featValues[ value.selection, 1 ] ) # Save this list of feature stavilities to file for later use: write.table( data.frame( "Feat"=as.character( names(stability) ), "Stability"=as.numeric(stability) ), "Maslova-stabilities.csv", sep="\t", quote=FALSE, row.names=FALSE );