###############################################################################
#    Estimate and generalize Elena Maslova's "transition probabilities"
#        
#    Copyright (C) 2008-2012  Michael Cysouw & Dan Dediu
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.   See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.   If not, see <http://www.gnu.org/licenses/>.
#
###############################################################################

# The working path:
working.path <-  "./";

# Load the original WALS data as used in Dediu, D. & Cysouw, M. Some Structural Aspects of Language are More Stable than Others: A Comparison of Seven Methods. (please make sure you have extracted it here):
source( paste( working.path,  "/WALS_R/wals.r", sep=""  ), chdir=TRUE);
## Please note that currently the WALS data is released in a slightly different format (see http://wals.info/export) and in order to import it you need to use instead something of the form:
## but we did not test the script with this new format and it might require some tweaking.
#langs <- read.table("languages.tab",sep="\t",header=T)
#mat <- read.table("datapoints.tab",sep="\t",header=T,row.names=1)


# Select genera that have more than one language coded for a feature
# For example,  "length(getgenera(83))" gives "197"
# Constrain by macroarea (if != NA) and/or family (if != NA):
getgenera <- function( feature, macroarea=NA, lgfamily=NA )
{  
  # Select those languages in the given macroarea (if any):
  langs <- names( na.omit( mat[,feature]  )  );
  if(  !is.na( macroarea )  )
  {
    langs <- langs[ lgs$macroarea[ rownames(lgs)  %in% langs ]  == macroarea ];
  }
  if(  !is.na( lgfamily )  )
  {
    langs <- langs[ lgs$family[ rownames(lgs)  %in% langs ]  == lgfamily ];
  }

  g <- listGenera( codes=langs )  # list all genera
  if(  !is.na( macroarea )  &  !is.na( lgfamily )  )  # both constraints
  {
    count <- sapply( g, function(x){ length( na.omit( mat[ rownames( lgs[ lgs$genus==x & lgs$macroarea == macroarea & lgs$family == lgfamily,  ]  ), feature ]  )  )  }  )  # get genera that are coded more than once
  } else if(  !is.na( macroarea )  )  # only macroarea constraint
  {
    count <- sapply( g, function(x){ length( na.omit( mat[ rownames( lgs[lgs$genus==x & lgs$macroarea == macroarea,  ]  ), feature ]  )  )  }  )  # get genera that are coded more than once
  } else if(  !is.na( lgfamily )  )  # only faimly constraint
  {
    count <- sapply( g, function(x){ length( na.omit( mat[ rownames( lgs[ lgs$genus==x & lgs$family == lgfamily,  ]  ), feature ]  )  )  }  )  # get genera that are coded more than once
  } else # no constraints at all:
  {
    count<-sapply(g,function(x){length(na.omit(mat[rownames(lgs[lgs$genus==x,]),feature]))})  # get genera that are coded more than once
  }
  return( g[ count>1 ]  )
}

# Get estimates for P(D) and P(A) for a set of genera g.  
# First, select a pair of languages from each genus, then compute P(D) and P(A).
# The value can consist either of a single value (A), in which case all the others are considered as (non-A); a list of values (A) and all the others are considered (non-A); or as a list of values (A) and the non.values contains the list of non-A values, with all the others ignored [not yet implemented]
getprobs <- function( g, feature, value, macroarea=NA, lgfamily=NA, non.value=NA )
{  
  # Get a random pair of languages from the same genus, for each genus:
  if(  !is.na( macroarea )  &  !is.na( lgfamily )  )  # Both constraints:
  {
    pairs <- lapply( g, function(x){ sample( na.omit( mat[ rownames( lgs[ lgs$genus==x & lgs$macroarea==macroarea & lgs$family==lgfamily,  ]  ), feature ]  ), 2 )  }  )      
  } else if(  !is.na( macroarea )  )  # only macorarea constraint:
  {
    pairs <- lapply( g, function(x){ sample( na.omit( mat[ rownames( lgs[ lgs$genus==x & lgs$macroarea==macroarea,  ]  ), feature ]  ), 2 )  }  )    
  } else if(  !is.na( lgfamily )  )  # only faimly constraint:
  {
    pairs <- lapply( g, function(x){ sample( na.omit( mat[ rownames( lgs[ lgs$genus==x & lgs$family==lgfamily,  ]  ), feature ]  ), 2 )  }  )      
  } else # no contraints:
  {
    pairs <- lapply( g, function(x){ sample( na.omit( mat[ rownames( lgs[lgs$genus==x,]), feature ]  ), 2 )  }  )    
  }

  # The pA and pD probabilities:
  pA <- NA;
  pD <- NA;

  # Compute pD & pA for these pairs:
  if( is.na(non.value)  )
  {
    pD <- 1 -  ( sum( as.numeric( lapply( pairs, function(x){  (  (x[1]  %in% value)  &  (x[2]  %in% value)  )  |  (  !(x[1]  %in% value)  &  !(x[2]  %in% value)  )  }  )  )  )  / length( pairs )  );
    pA <- sum( as.matrix( as.data.frame( lapply( pairs, function(x){ x %in% value }  )  )  )  )  /  ( 2 * length( pairs )  );
  } else
  {
     stop(  "No yet implemented!\n"  );
  }
  return( data.frame(  "pA"=pA,  "pD"=pD )  );
}  

# Get estimates for P(D) and P(A) multiple times, and put the results in a table:
getcounts <- function( feature, value, cases=50, samplesize=.5, macroarea=NA, lgfamily=NA, non.value=NA )
{  
  result <- data.frame(  "pA"=numeric(cases),  "pD"=numeric(cases)  );
  g <- getgenera( feature, macroarea, lgfamily );

  for( i in 1:cases )
  {  
    s <- sample( length(g), floor( samplesize*length(g)  )  );
    result[i,]  <- getprobs( g[s], feature, value, macroarea, lgfamily, non.value );
  }
  return( result );
}

# Compute the transition probabilities from the output of "getcounts", ie. from the table with the values of P(D) and P(A):
estimates <- function( test, no.simulations=1000, return.summaries=TRUE )  
{  
  # Regress pD an pA:
  l <- lm( pD ~ pA, data=test );  # The regression goodness was tested before using Pearson's r, so don't do it again here!

  # Get the estimated regression coefficients: they have a mean and a standard distribution:
  l.summary <- summary( l );
  if( nrow( l.summary$coefficients )  < 2 )
  {
    a.estimate <- NA;
    a.sd <- NA;
    b.estimate <- NA;
    b.sd <- NA;
  } else
  {
    a.estimate <- as.numeric( l.summary$coefficients[2,1]  );
    a.sd <- as.numeric( l.summary$coefficients[2,2]  );
    b.estimate <- as.numeric( l.summary$coefficients[1,1]  );
    b.sd <- as.numeric( l.summary$coefficients[1,2]  );
  }

  # Given the complexity of the formaulas for pAB and pBA, I cannot derive a formula for their sd, so simulate it:
  a.simulated <- rnorm( no.simulations, mean=a.estimate, sd=a.sd)  / 2;
  b.simulated <- rnorm( no.simulations, mean=b.estimate, sd=b.sd)  / 2;

  # And compute the probabilities:
  pAB.simulated <-  (1 + a.simulated - sqrt((1 - a.simulated)^2 - 4*b.simulated))/2;
  pBA.simulated <-  (1 - a.simulated - sqrt((1 - a.simulated)^2 - 4*b.simulated))/2;
  stableA.simulated   <- pBA.simulated /  (pAB.simulated + pBA.simulated);
  stability.simulated <- 1 -  ((pAB.simulated + pBA.simulated)/2);

  # get correlations for estimates
  correlation <- cor.test( test[,1], test[,2]  )

  # Return the results:
  if( return.summaries == TRUE )
  {
    return( list(
                  pApD.cor = as.numeric(correlation$estimate),
                  pApD.sig = correlation$p.value,
                  pAB.mean = mean( pAB.simulated, na.rm=TRUE ),
                  pBA.mean = mean( pBA.simulated, na.rm=TRUE ),
                  stableA.mean = mean( stableA.simulated, na.rm=TRUE ),
                  stability.mean = mean( stability.simulated, na.rm=TRUE ),
                  pAB.sd = ifelse( sum(  !is.na( pAB.simulated )  )  <= 1, NA, sd( pAB.simulated, na.rm=TRUE )  ),
                  pBA.sd = ifelse( sum(  !is.na( pBA.simulated )  )  <= 1, NA, sd( pBA.simulated, na.rm=TRUE )  ),
                  stableA.sd = ifelse ( sum(  !is.na( stableA.simulated )  )  <= 1, NA, sd( stableA.simulated, na.rm=TRUE )  ),
                  stability.sd = ifelse( sum(  !is.na( stability.simulated )  )  <= 1, NA, sd( stability.simulated, na.rm=TRUE )  )
                ) );
  } else
  {
    return ( data.frame( pAB=pAB.simulated,   pBA=pBA.simulated,     stableA=stableA.simulated,     stability=stability.simulated )  );
  }    
}

# Compute the observed frequency of the feature's value in the given macroarea and language family:
observed.freq <- function( feature, value, macroarea=NA, lgfamily=NA, non.value=NA )
{
  # Select those languages in the given macroarea (if any):
  langs <- names( na.omit( mat[  , feature ]  )  );
  if(  !is.na( macroarea )  )
  {
    langs <- langs[ lgs$macroarea[ rownames(lgs)  %in% langs ]  == macroarea ];
  }
  if(  !is.na( lgfamily )  )
  {
    langs <- langs[ lgs$family[ rownames(lgs)  %in% langs ]  == lgfamily ];
  }

  values <- na.omit( mat[ rownames(lgs)  %in% langs,feature ]  );
  obs.freq <- sum( values %in% value )  / length( values );
  obs.freq;
}

#####################################################################################
#
#  Compute the stabilities of WALS features using the above estimation techniques
#
#####################################################################################

# Features above 138 don't work:
feature.selection <- 1:138
value.selection <- unlist ( sapply( feature.selection, function(x)  { which( featValues[,"feat"]  == x )  }  )  )

# Get the statistics for all features, using 200 samples. The list 'results' is a list for all 653 values of the features 1 to 138:
results <- list()
for( i in value.selection )
{
  results[[i]]  <- getcounts( featValues[i,1], featValues[i,2], 200 )
}

# Get the estimates for all results:
all.estimates <- list()
for( i in value.selection )
{
  all.estimates[[i]]  <- estimates( results[[i]]  )
}

# Extract the list of pAB transition probabilities:
pAB <- unlist( sapply( all.estimates, function(x){ x[3]  }  )  )

# Get the fraction of occurence of all values per feature: to be used in computing a weighted average of the stabilities of each value.  
# This also nicely gets rid of the nasty problem of values that are only found in very few languages: Maslova's approach cannot estimate such groups.  
# However, because such groups are small, they are getting a very low weight anyway now, so we don't have to worry about their influence:
proportion <- c()
for( i in feature.selection )  
{
  t <- table(mat[,i])
  proportion <- c( proportion, t/sum(t)  )
}

# Then we define the stability of a *value* as 1-pAB (i.e. the probability that a value to *not* change).
# To derive the stability of a *feature* we take the weighted average over these value-stabilities: frequent values count more than those with low frequencies:
stability <- xtabs(  (  (1-pAB)  * proportion )  ~ featValues[ value.selection, 1 ]  )

# Save this list of feature stavilities to file for later use:
write.table( data.frame(  "Feat"=as.character( names(stability)  ),  "Stability"=as.numeric(stability)  ),  "Maslova-stabilities.csv", sep="\t", quote=FALSE, row.names=FALSE );