# R code taken from MoreTreeTools master branch commit 141ba75
# https://github.com/DomBennett/MoreTreeTools
# Unpublished R package by Dom Bennett, provided here for comparative purposes

pinNames <- function (tree, names, lineages, min.ages, max.ages,
                      fuzzy=TRUE, datasource=4,
                      iterations=1, resolve.list=NULL) {
  # SAFETY CHECK
  if (iterations < 1) {
    stop ('Iterations must be >=1')
  }
  if (!is.vector (names) && length (names) <= 1) {
    stop ('Names must be a vector of more than 1 character string')
  }
  if (!class (tree) %in%  c ('phylo', 'multiPhylo')) {
    stop ('Tree must be phylo or multiPhylo')
  }
  #TODO update addTip to handle node.label, for now drop node.label
  # INIT
  # drop underscores, check for branch lengths and drop node labels
  tree <- .mnClean (tree)
  names <- gsub ('_', ' ', names)
  # find matching and non-matching names
  tip.labels <- .mnGetNames (tree)
  matching.names <- names[names %in% tip.labels]
  nonmatching.names <- names[!names %in% tip.labels]
  # stop now if all names match names in tree or fuzzy is false
  if (!fuzzy || length (matching.names) == length (names)) {
    return (.mnEarlyReturn (tree, names, iterations))
  }
  # VARIABLES AND ENVIRONMENTS
  # place all parameters in an environment to save arguments
  paraenv <- new.env (parent=emptyenv ())
  if (class (tree) == 'multiPhylo') {
    paraenv$trees <- tree
    paraenv$start.tree <- tree[[sample (1:length (tree), 1)]]
  } else {
    paraenv$start.tree <- tree
  }
  paraenv$grow.tree <- paraenv$start.tree
  paraenv$datasource <- datasource
  paraenv$matching.names <- matching.names
  paraenv$names <- names
  paraenv$by.age <- TRUE
  # get query and subject resolved names
  qrylist <- list ()
  qrylist$lineages <- lineages
  qrylist$resolved <- data.frame (search.name=names,
                                  min.age=min.ages,
                                  max.age=max.ages)
  if (!is.null (resolve.list)) {
    # if resolve.list provided, no need to do any searches
    # unpack resolved names into sbjctenv
    sbjctenv <- new.env (parent=emptyenv ())
    sbjctenv$resolved <- resolve.list$resolved
    sbjctenv$lineages <- resolve.list$lineages
    paraenv$deja.vues <- tree$tip.label
  } else {
    # hold subject name resolution results in a single env
    sbjctenv <- new.env (parent=emptyenv ())
    .mnResolveUpdate (paraenv=paraenv, sbjctenv=sbjctenv)
  }
  # hold all resulting trees and stats in a single env
  resenv <- new.env (parent=emptyenv ())
  resenv$trees <- list ()
  # ITERATE
  # loop through for each iteration, write results to resenv
  m_ply (.data=data.frame(iteration=1:iterations), .fun=.mnMap,
         resenv=resenv, qrylist=qrylist, sbjctenv=sbjctenv,
         paraenv=paraenv)
  # RETURN
  if (length (resenv$trees) > 1) {
    trees <- resenv$trees
    class (trees) <- 'multiPhylo'
  } else {
    trees <- resenv$trees[[1]]
  }
  return (trees)
}

mapNames <- function (tree, names, fuzzy=TRUE, datasource=4,
                      iterations=1, resolve.list=NULL) {
  # SAFETY CHECK
  if (iterations < 1) {
    stop ('Iterations must be >=1')
  }
  if (!is.vector (names) && length (names) <= 1) {
    stop ('Names must be a vector of more than 1 character string')
  }
  if (!class (tree) %in%  c ('phylo', 'multiPhylo')) {
    stop ('Tree must be phylo or multiPhylo')
  }
  #TODO update addTip to handle node.label, for now drop node.label
  # INIT
  # drop underscores, check for branch lengths and drop node labels
  tree <- .mnClean (tree)
  names <- gsub ('_', ' ', names)
  # find matching and non-matching names
  tip.labels <- .mnGetNames (tree)
  matching.names <- names[names %in% tip.labels]
  nonmatching.names <- names[!names %in% tip.labels]
  # stop now if all names match names in tree or fuzzy is false
  if (!fuzzy || length (matching.names) == length (names)) {
    return (.mnEarlyReturn (tree, names, iterations))
  }
  # VARIABLES AND ENVIRONMENTS
  # place all parameters in an environment to save arguments
  paraenv <- new.env (parent=emptyenv ())
  if (class (tree) == 'multiPhylo') {
    paraenv$trees <- tree
    paraenv$start.tree <- tree[[sample (1:length (tree), 1)]]
  } else {
    paraenv$start.tree <- tree
  }
  paraenv$grow.tree <- paraenv$start.tree
  paraenv$datasource <- datasource
  paraenv$matching.names <- matching.names
  paraenv$names <- names
  paraenv$by.age <- FALSE
  # get query and subject resolved names
  if (!is.null (resolve.list)) {
    # if resolve.list provided, no need to do any searches
    # unpack resolved names into qrylist and sbjctenv
    pull <- resolve.list$resolved$search.name %in% nonmatching.names
    if (sum (pull) < 1) {
      return (.mnEarlyReturn (tree, names, iterations))
    }
    qrylist <- list ()
    qrylist$resolved <- resolve.list$resolved[pull, ]
    qrylist$lineages <- resolve.list$lineages[pull]
    sbjctenv <- new.env (parent=emptyenv ())
    sbjctenv$resolved <- resolve.list$resolved
    sbjctenv$lineages <- resolve.list$lineages
    paraenv$deja.vues <- tree$tip.label
  } else {
    # hold query name resolution results in a list
    qrylist <- .mnResolve (names=nonmatching.names, paraenv=paraenv)
    if (!nrow (qrylist$resolved) > 0) {  # stop now if none resolved
      return (.mnEarlyReturn (tree, names, iterations))
    }
    # hold subject name resolution results in a single env
    sbjctenv <- new.env (parent=emptyenv ())
    .mnResolveUpdate (paraenv=paraenv, sbjctenv=sbjctenv)
    # add qrylist results to sbjctenv
    pull <- !as.vector (qrylist$resolved$search.name) %in%
      as.vector (sbjctenv$resolved$search.name)
    sbjctenv$resolved <- rbind (sbjctenv$resolved, qrylist$resolved[pull, ])
    sbjctenv$lineages <- c (sbjctenv$lineages, qrylist$lineages[pull])
  }
  # hold all resulting trees and stats in a single env
  resenv <- new.env (parent=emptyenv ())
  resenv$trees <- list ()
  # ITERATE
  # loop through for each iteration, write results to resenv
  m_ply (.data=data.frame(iteration=1:iterations), .fun=.mnMap,
         resenv=resenv, qrylist=qrylist, sbjctenv=sbjctenv,
         paraenv=paraenv)
  # RETURN
  if (length (resenv$trees) > 1) {
    trees <- resenv$trees
    class (trees) <- 'multiPhylo'
  } else {
    trees <- resenv$trees[[1]]
  }
  return (trees)
}
# HIDDEN mapNames functions
.mnMap <- function (iteration, resenv, qrylist, sbjctenv, paraenv) {
  # INIT
  # randomise order to prevent order bias
  randomised <- sample (1:nrow (qrylist$resolved))
  qrylist$resolved <- qrylist$resolved[randomised, ]
  qrylist$lineages <- qrylist$lineages[randomised]
  if (!is.binary.tree (paraenv$start.tree)) {
    paraenv$grow.tree <- multi2di (paraenv$start.tree)
  } else {
    paraenv$grow.tree <- paraenv$start.tree
  }
  sbjctlist <- .mnTemporise(record=sbjctenv, tree=paraenv$grow.tree)  # convert to list
  # LOOP until qrylist in grow.tree or sbjct names exhausted
  while (TRUE) {
    if (nrow (sbjctlist$resolved) > 0) {
      not.in.tree <- which (!as.vector (qrylist$resolved$search.name) %in%
                              paraenv$grow.tree$tip.label)
      # loop through each in qrylist$resolved and match to sbjctlist$resolved
      for (i in not.in.tree) {
        # get lineage of the resovled qry name
        lineage <- qrylist$lineages[[i]]
        # match lineage to sbjct lineages
        lsrs <- .mnGetLSR (qry=lineage, sbjcts=sbjctlist$lineages)
        # of the most closely related lineages get the min dist -- calculated as
        # the smallest lsrs, this makes sure that they do not
        # form an in-group to the exclusion of the qry lineage
        possibles <- as.vector (which (lsrs == max (lsrs)))
        best.sbjcts <- sbjctlist$lineages[possibles]
        if (length (best.sbjcts) < length (sbjctlist$lineages)) {
          # if the length of best is the length of all, then no resolution withing sbjcts
          if (paraenv$by.age) {
            paraenv$grow.tree <- .mnAddTipWAge (tree=paraenv$grow.tree,
                                                tip.is=sbjctlist$resolved$tip.i[possibles],
                                                new.name=as.character (
                                                  qrylist$resolved$search.name[i]),
                                                max.age=qrylist$resolved$max.age[i],
                                                min.age=qrylist$resolved$min.age[i])
          } else {
              paraenv$grow.tree <- .mnAddTip (tree=paraenv$grow.tree,
                                              tip.is=sbjctlist$resolved$tip.i[possibles],
                                              new.name=as.character (
                                                qrylist$resolved$search.name[i]))
          }
          # update tip.i in sbjctlist
          sbjctlist <- .mnTemporise(record=sbjctlist, tree=paraenv$grow.tree)
        }
      }
    }
    # if all qry names in grow.tree or all names in tree sampled
    if (all (as.vector (qrylist$resolved$search.name) %in% paraenv$grow.tree$tip.label) |
          all (paraenv$start.tree$tip.label %in% paraenv$deja.vues)) {
      break
    }
    # if haven't broken out, update sbjctenv and extract new sbjctlist
    .mnResolveUpdate (paraenv=paraenv, sbjctenv=sbjctenv)
    sbjctlist <- .mnTemporise(record=sbjctenv, tree=paraenv$grow.tree)
  }
  # save results to resenv
  if (paraenv$by.age) {
    resenv$trees <- c (resenv$trees, list (paraenv$grow.tree))
  } else {
    tree <- .mnExtract (tree=paraenv$grow.tree, names=paraenv$names)
    resenv$trees <- c (resenv$trees, list (tree))
  }
  # choose new random tree
  if (!is.null (paraenv$trees)) {
    paraenv$start.tree <- paraenv$trees[[sample (1:length (paraenv$trees), 1)]]
  }
}
.mnResolve <- function (names, paraenv) {
  # Resolve names using taxaResolve, return list of taxaResolve
  #  dataframe and a list of lineages for each name
  res <- list ()
  res['resolved'] <- list (taxaResolve (names=names,
                                        datasource=paraenv$datasource))
  # drop NAs
  res$resolved <- res$resolved[!is.na (res$resolved$name.string), ]
  # drop those w/o lineage
  res$resolved <- res$resolved[res$resolved$lineage != '', ]
  # separate lineages
  if (nrow (res$resolved) > 0) {
    res['lineages'] <- list (strsplit (as.vector (res$resolved$lineage),
                                       '\\|'))
  }
  return (res)
}
.mnGetLSR <- function (qry, sbjcts) {
  # get lowest shared ranks between qry lineage and subject lineages
  tds <- rep (NA, length (sbjcts))
  for (i in 1:length (sbjcts)) {
    tds[i] <- max (which (qry %in% sbjcts[[i]]))
  }
  tds
}
.mnSample <- function (paraenv) {
  # sample names from a tree in a way to reduce searching
  tree <- paraenv$grow.tree
  if (length (paraenv$matching.names) > 1) {
    tip <- sample (paraenv$matching.names, 1)
  } else {
    tip <- sample (tree$tip.label, 1)
  }
  node <- tree$edge[tree$edge[ ,2] == which (tip == tree$tip.label), 1]
  while (TRUE) {
    children <- getChildren (tree, node=node)
    if (any (!children %in% paraenv$deja.vues)) {
      # exclude names already searched
      children <- children[!children %in% paraenv$deja.vues]
      return (children)
    }
    if (node == getSize (tree) + 1) {
      break
    }
    node <- getParent (tree, node=node)
  }
  vector ()
}
.mnResolveUpdate <- function (paraenv, sbjctenv) {
  # Update sbjenv -- by only searching when needed, reduce number of searches
  # get sample of names
  names <- .mnSample (paraenv)
  if (length (names) > 0) {
    # add names to deja.vues
    paraenv$deja.vues <- c (paraenv$deja.vues, names)
    # resolve these names
    res <- .mnResolve (names, paraenv)
    # stick to previous results
    if (!is.null (sbjctenv$resolved) & !is.null (sbjctenv$lineages)) {
      sbjctenv$resolved <- rbind (sbjctenv$resolved, res$resolved)
      sbjctenv$lineages <- c (sbjctenv$lineages, res$lineages)
    } else {
      sbjctenv$resolved <- res$resolved
      sbjctenv$lineages <- res$lineages
    }
  }
}
.mnTemporise <- function (record, tree) {
  # return a temporary record for name mapping
  # create a copy
  res <- list ()
  res$resolved <- record$resolved
  res$lineages <- record$lineages
  # add tip.i info
  tip.i <- match (res$resolved$search.name, tree$tip.label)
  not.in.tree <- is.na (tip.i)
  res$resolved$tip.i <- tip.i
  res$resolved <- res$resolved[!not.in.tree, ]
  res$lineages <- res$lineages[!not.in.tree]
  return (res)
}
.mnAddTipWAge <- function (tree, tip.is, new.name,
                           min.age, max.age) {
  print(new.name)
  # add new tip
  if (length (tip.is) == 1) {
    edge <- which (tip.is == tree$edge[ ,2])
    edge.age <- getAge (tree, edge=edge)
    pull <- edge.age$max.age >= max.age
    if (!pull) {
      edge <- NA
    }
  } else {
    # randomly map new tip to any edge in the clade
    # represented by the matching tip.is
    # find the parent node of all the matching tips
    children <- tree$tip.label[tip.is]
    parent.node <- getParent (tree, tips=children)
    # get all descending
    edges <- getEdges(tree, node=parent.node)
    # INCLUDING supporting edge!!!
    edges <- c (edges, which (tree$edge[ ,2] ==parent.node))
    # check ages
    edge.ages <- getAge (tree, edge=edges)
    pull <- edge.ages$max.age >= max.age
    edges <- edges[pull]
    # choose edge at random
    if (length (edges) > 1) {
      edge <- sample (edges, size=1)
    } else {
      edge <- edges[1]
    }
  }
  # exit if no suitable edge found
  if (!is.numeric (edge)) {
    return (tree)
  }
  # random tip and node age
  age.range <- getAge (tree, edge=edge)
  node.age <- runif (n=1, min=age.range[1, 'max.age'],
                     max=age.range[1, 'max.age'])
  tip.age <- runif (n=1, min=min.age, max=node.age)
  tree <- addTip (tree, edge=edge, tip.name=new.name, node.age=node.age,
                  tip.age=tip.age)
  return (tree)
}
.mnAddTip <- function (tree, tip.is, new.name) {
  # add new tip
  if (length (tip.is) == 1) {
    # add to the pendant edge at a random time point
    edge <- which (tip.is == tree$edge[ ,2])
  } else {
    # randomly map new tip to any edge in the clade
    # represented by the matching tip.is
    # find the parent node of all the matching tips
    children <- tree$tip.label[tip.is]
    parent.node <- getParent (tree, tips=children)
    # get descending and supporting edges
    edges <- which (tree$edge[ ,1] == parent.node)
    edges <- c (edges, which (tree$edge[ ,2] == parent.node))
    # choose edge at random, bias sample based on branch length
    edge <- sample (edges, size=1, prob=tree$edge.length[edges])
  }
  # random node age somewhere on edge
  age.range <- getAge (tree, edge=edge)
  node.age <- runif (n=1, min=age.range[1, 'min.age'],
                     max=age.range[1, 'max.age'])
  # random tip age if not ultrametric
  if (is.ultrametric (tree)) {
    tip.age <- 0
  } else {
    # use random tip age in shared clade
    possibles <- getAge (tree, node=tip.is)$age
    possibles <- possibles[possibles < node.age]
    tip.age <- runif (n=1, min=min(possibles), max=max(possibles))
  }
  tree <- addTip (tree, edge=edge, tip.name=new.name, node.age=node.age,
                  tip.age=tip.age)
  return (tree)
}
.mnExtract <- function (tree, names) {
  # return tree representing names
  if (class (tree) == 'multiPhylo') {
    tree <- tree[[sample (1:length (tree), 1)]]
  }
  if (sum (names %in% tree$tip.label) > 1) {
    res.tree <- drop.tip (tree, tip = tree$tip.label[!tree$tip.label %in% names])
    return (res.tree)
  } else {
    warning ('Too few names could be mapped to tree')
    return (NA)
  }
}
.mnEarlyReturn <- function (tree, names, iterations) {
  # if mapNames returns tree early, return an object
  #  that was expect i.e. phylo or multiphylo
  tree <- .mnExtract (tree, names)
  if (iterations == 1) {
    return (tree)
  } else {
    trees <- list ()
    for (i in 1:iterations) {
      trees <- c (trees, list (tree))
    }
    class (trees) <- 'multiPhylo'
    return (trees)
  }
}
.mnClean <- function (trees) {
  # drop _ in names of trees in a multiphylo
  # remove node labels
  # set branch lengths to 1 if no branch lengths
  if (class (trees) == 'multiPhylo') {
    .drop <- function (i) {
      tree <- trees[[i]]
      tree$tip.label <- gsub ('_', ' ', tree$tip.label)
      tree$node.label <- NULL
      if (is.null (tree$edge.length)) {
        tree$edge.length <- rep (1, nrow (tree$edge))
      }
      res <<- c (res, list (tree))
    }
    res <- list ()
    m_ply (.data=data.frame (i=1:length (trees)), .fun=.drop)
    class (res) <- 'multiPhylo'
    return (res)
  } else {
    trees$tip.label <- gsub ('_', ' ', trees$tip.label)
    trees$node.label <- NULL
    if (is.null (trees$edge.length)) {
      trees$edge.length <- rep (1, nrow (trees$edge))
    }
    return (trees)
  }
}
.mnGetNames <- function(trees) {
  # get tip names from a multiphylo
  if (class (trees) == 'multiPhylo') {
    .get <- function (i) {
      res <<- c (res, trees[[i]]$tip.label)
    }
    res <- NULL
    m_ply (.data=data.frame (i=1:length (trees)), .fun=.get)
    res <- unique (res)
  } else {
    res <- trees$tip.label
  }
  return (res)
}

mapNamesPreDownload <- function (names, datasource=4) {
  paraenv <- list ('datasource'=datasource)
  # parse names
  names <- unique (names)
  names <- gsub ('_', ' ', names)
  # search names
  resolved.list <- .mnResolve (names, paraenv)
  return (resolved.list)
}