#!/usr/local/bin/Rscript
#
# This R script analyzes the phylogenetic distribution of 16S GCNs and performs hidden state prediction of 16S GCNs on the SILVA phylogenetic tree.
# Note that the script will automatically attempt to install any required packages.
# Tested on Mac OS X 10.12, using R 3.4.0
# Required external files: See the global variables "INPUT_TREE" and "INPUT_TRAIT_TABLE" below.
# All output will be automatically saved to the directory "output". Existing files will be overwritten without warning.
# Please also consult the copyright notice and disclaimer below.
#
# This is supplementary code for the following paper:
#   Stilianos Louca, Michael Doebeli and Laura Parfrey. Correcting for 16S rRNA gene copy numbers in microbiome surveys remains an unsolved problem (in review as of September 2017)
#
# Execute this script in your terminal as:
#   Rscript reconstruct_16S_gene_counts.R
#
#
###################################
# Copyright (c) 2017, Stilianos Louca
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in
#       the documentation and/or other materials provided with the distribution.
#     * Neither the name of the original author (Stilianos Louca), nor the names
#       of any other contributors may be used to endorse or promote products derived
#       from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
###################################
#
# Stilianos Louca
# September 20, 2017

###################################
# OPTIONS

INPUT_TREE="input/SILVA_NR99_FastTree_16S_free_living.tre"	# SILVA phylogenetic tree in Newick format, with tips labelled according to SILVA OTU IDs
INPUT_TRAIT_TABLE="input/16S_counts_per_SILVA_core.tsv"		# table listing 16S GCNs for a subset of SILVA OTUs
TRAIT_TABLE_TIP_NAMES_IN_COLUMN="OTU"	# column name listing tip labels in trait table
TRAIT_TABLE_FOCAL_TRAIT_COLUMN=2		# column index (1-based) of focal trait in INPUT_TRAIT_TABLE (also counting the row-names column)
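
# For orientation, a minimal sketch of the layout expected for INPUT_TRAIT_TABLE
# (tab-separated; the OTU IDs, the trait column name and the counts below are purely
# hypothetical examples, not actual data):
#
#   OTU	16S_gene_count
#   SOME_SILVA_OTU_A	1
#   SOME_SILVA_OTU_B	4
#
# Rows whose OTU is not a tip label in INPUT_TREE are discarded further below, and the
# focal trait column is selected via TRAIT_TABLE_FOCAL_TRAIT_COLUMN.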
TREE_EDGE_LENGTH_EPSILON=NULL	# "tiny" edge length to use as a substitute for zero-length edges. Set to NULL to adjust automatically
REQUIRED_PACKAGES=c("castor", "ape", "rncl")
NUMBER_OF_PARALLEL_THREADS=22

HSP_INCLUDE_CROSS_VALIDATION=TRUE
HSP_CROSS_VALIDATION_COUNT=50
HSP_CROSS_VALIDATION_TEST_FRACTION=0.02	# fraction of known_tips to be used as a test set
HSP_CROSS_VALIDATION_CLUSTER_TEST_SET=TRUE	# if TRUE, then the training set is guaranteed to have certain distances from the test set. If FALSE, then the test set is chosen randomly, the training set is the full complement, and test OTUs are then binned into NSTD intervals

# NSTD bins for HSP
HSP_NSTD_BIN_COUNT = 8
HSP_NSTD_MIN = 0;
HSP_NSTD_MAX = (HSP_NSTD_BIN_COUNT-1)*0.1;

TREE_PLOTS_DPI=72
TREE_PLOTS_INCH_PER_CHAR=0.05 * 72/TREE_PLOTS_DPI
TREE_PLOTS_INCH_PER_LEVEL=0.3 * 72/TREE_PLOTS_DPI
DEFAULT_PLOT_WIDTH=4
DEFAULT_PLOT_HEIGHT=4

INCLUDE_HSP=TRUE
INCLUDE_ACF=TRUE
INCLUDE_NSTD=TRUE
INCLUDE_COARSE_ESP=FALSE


###############################
# AUXILIARY FUNCTIONS & MACROS

options(expressions=100000)	# increase recursion depth

# print warnings as they occur
options(warn=1)


check_output_file = function(file_path, force_replace, verbose, verbose_prefix){
	if(file.exists(file_path)){
		if(force_replace){
			cat(sprintf("%sNote: Replacing output file '%s'.\n",verbose_prefix,file_path))
			file.remove(file_path);
		}else{
			stop(sprintf("Output file '%s' already exists. Cowardly refusing to continue.",file_path), call.=FALSE)
		}
	}
	dir.create(dirname(file_path), showWarnings = FALSE, recursive=TRUE);
}


get_trait_colors = function(N){
	if(N<=3){
		trait_colors = c("red","green","blue")[1:N]
	}else if(N<=6){
		trait_colors = c("red","orange","yellow","green","blue","purple")[1:N]
	}else{
		trait_colors = colorRampPalette(c("red","orange","yellow","green","blue","purple"), bias=1, space="rgb", interpolate="linear")(N)
	}
	return(trait_colors);
}


population_variance = function(X, na.rm=FALSE){
	if(na.rm) X = X[!is.na(X)];
	return(sum((X-mean(X))**2)/length(X));
}


get_complement_of_integer_set = function(N, set){
	keep = rep(TRUE,times=N)
	keep[set] = FALSE
	return(which(keep))
}


save_and_plot_stats_curve = function(	X,
										Y,
										N,
										xlabel,
										ylabel,
										title,
										output_basename,
										ylim,
										as_barplot,		# if false, stats are plotted as a curve
										Nmin,
										reference){		# either NA, or a numeric value for drawing a horizontal reference line
	valids = which((!is.na(X)) & (!is.nan(X)) & (!is.na(Y)) & (!is.nan(Y)) & (!is.na(N)) & (!is.nan(N)) & (N>=Nmin))
	X = X[valids]
	Y = Y[valids]
	N = N[valids]
	if(is.null(ylim)) ylim = c(1.1*min(0,min(Y)),max(Y)*1.2)

	cat(sprintf("Saving %s..\n",title))
	output_path = sprintf("output/%s.tsv",output_basename)
	check_output_file(output_path,TRUE,TRUE," ");
	cat(sprintf("# %s\n# %s\t%s\tN\n",title,xlabel,ylabel), file=output_path, append=FALSE);
	write.table(x=data.frame(X,Y,N), file=output_path, append=TRUE, sep="\t", row.names=FALSE, col.names=FALSE, quote = FALSE);

	cat(sprintf("Plotting %s..\n",title))
	plot_file=sprintf("output/%s.pdf",output_basename)
	check_output_file(plot_file,TRUE,TRUE," ")
	pdf(file=plot_file, width=DEFAULT_PLOT_WIDTH, height=DEFAULT_PLOT_HEIGHT);
	if(as_barplot){
		barplot(Y, names.arg=X, main=title, xlab=xlabel, ylab=ylabel, ylim=ylim, border="black", col="#D6D6D6", las=1)
		box()
	}else{
		plot(x=X, y=Y, type="l", col="black", pch=1, cex=0.6, las=1, lwd=1.5, xlab=xlabel, ylab=ylabel, ylim=ylim, main=title);
	}
	if((!is.na(reference)) && (reference>ylim[1]) && (reference<ylim[2])) abline(h=reference, col="red", lty=2)
	invisible(dev.off());
}


###################################
# MAIN PROGRAM
# (the package-loading and tree-loading lines below are a minimal reconstruction; the corresponding lines were missing from this copy of the script)

# load required packages, installing them first if needed
for(required_package in REQUIRED_PACKAGES){
	if(!requireNamespace(required_package, quietly=TRUE)) install.packages(required_package)
	library(required_package, character.only=TRUE)
}

# load the SILVA tree
cat(sprintf("Loading tree..\n"))
tree   = rncl::read_newick_phylo(file=INPUT_TREE)
Ntips  = length(tree$tip.label)
Nnodes = tree$Nnode
cat(sprintf(" --> Tree has %d nodes, %d tips and %d edges\n",Nnodes,Ntips,nrow(tree$edge)));

# create internal node labels if needed
if(is.null(tree$node.label)){
	cat(sprintf("Adding node labels to full tree..\n"))
	tree$node.label = paste("node.", 1:Nnodes, sep = "")	# don't use underscores, because some tree readers (e.g. rncl) interpret them as spaces
}

# avoid zero-length edges
if(any(tree$edge.length==0)){
	if(is.null(TREE_EDGE_LENGTH_EPSILON)) TREE_EDGE_LENGTH_EPSILON = 0.1*min(tree$edge.length[tree$edge.length>0])
	cat(sprintf("Note: Some edges have length zero, which may break some of the HSP routines. Replacing zero-lengths with a tiny positive length (%g)..\n",TREE_EDGE_LENGTH_EPSILON))
	tree$edge.length[tree$edge.length==0] = TREE_EDGE_LENGTH_EPSILON
}
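
# Optional sanity check (not part of the original pipeline): after the zero-length
# substitution, all edge lengths should be strictly positive. To verify, one could
# uncomment the following line:
#	stopifnot(all(tree$edge.length>0))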

##############################
# LOAD TRAIT TABLE

cat(sprintf("Reading trait table..\n"))
known_trait_table = read.table(	file             = INPUT_TRAIT_TABLE,
								header           = TRUE,
								sep              = "\t",
								quote            = "\"",
								dec              = ".",
								row.names        = TRAIT_TABLE_TIP_NAMES_IN_COLUMN,
								strip.white      = TRUE,
								blank.lines.skip = TRUE,
								na.strings       = "NA",
								check.names      = FALSE,
								comment.char     = "#",
								colClasses       = "character",
								stringsAsFactors = TRUE);

# filter out rows not corresponding to tips in the full tree
row2tip = match(rownames(known_trait_table), tree$tip.label)
known_trait_table = known_trait_table[!is.na(row2tip),,drop=FALSE]
raw_known_tip_states = known_trait_table[,TRAIT_TABLE_FOCAL_TRAIT_COLUMN-1,drop=TRUE];
trait_name = colnames(known_trait_table)[TRAIT_TABLE_FOCAL_TRAIT_COLUMN-1]
cat(sprintf(" --> Loaded %d tip states (%d unique states) for trait '%s'\n",nrow(known_trait_table),length(unique(raw_known_tip_states)),trait_name));

# get subtree with known tip states
cat(sprintf("Extracting subtree comprising tips with known state..\n"))
known_tree = castor::get_subtree_with_tips(tree, omit_tips=which(is.na(match(tree$tip.label,rownames(known_trait_table)))), collapse_monofurcations=TRUE, force_keep_root=TRUE)$subtree;
cat(sprintf(" --> known_tree has %d nodes, %d tips and %d edges\n",known_tree$Nnode,length(known_tree$tip.label),nrow(known_tree$edge)))

# synchronize tip_states & known_tree & tree
cat(sprintf("Synchronizing tree & known sub-tree & states..\n"))
Nknown_tips     = length(known_tree$tip.label)
Nknown_nodes    = known_tree$Nnode;
known_tip_names = known_tree$tip.label
known_tip2tip   = match(known_tip_names, tree$tip.label)
known_tip2row_in_known_trait_table = match(known_tip_names, rownames(known_trait_table))
known_tip_states = as.numeric(raw_known_tip_states[known_tip2row_in_known_trait_table])
remove(known_trait_table)
remove(raw_known_tip_states)
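
# Optional sanity check (not part of the original pipeline): report how sparsely the
# known 16S GCNs cover the full tree, e.g. by uncommenting:
#	cat(sprintf("Known %s available for %d out of %d tree tips (%.3g%%)\n",trait_name,Nknown_tips,Ntips,100*Nknown_tips/Ntips))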

###################################
# HIDDEN STATE PREDICTION FOR TRAIT

if(INCLUDE_HSP){
	mapping = castor::map_to_state_space(known_tip_states, fill_gaps=FALSE, sort_order="natural", include_state_values=TRUE);
	tip_states = setNames(rep(NA, times=Ntips), tree$tip.label)
	tip_states[known_tip2tip] = mapping$mapped_states;
	if(Nknown_tips<=3) HSP_INCLUDE_CROSS_VALIDATION = FALSE

	# the following arrays must be synchronized
	METHODS           = c("SA", "EP", "PIC", "WSCP", "MPR", "MPR", "MPR", "Mk", "Mk")
	RATE_MODELS       = c("", "", "", "", "exponential", "proportional", "all_equal", "ER", "SUEDE")
	METHOD_TAGS_SHORT = c("SA", "EP", "PIC", "WSCP", "MPRex", "MPRpr", "MPRae", "Mk_ER", "Mk_SUEDE")
	METHOD_INCLUDE    = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE)

	# filter methods
	METHODS           = METHODS[METHOD_INCLUDE]
	RATE_MODELS       = RATE_MODELS[METHOD_INCLUDE]
	METHOD_TAGS_SHORT = METHOD_TAGS_SHORT[METHOD_INCLUDE]

	# define the workhorse function for HSP
	estimate_all_state_values = function(method, rate_model, focal_tree, focal_tip_states, verbose){
		transition_matrix = NULL
		if(verbose) tip_state_comment = sprintf("%d out of %d tips have known state",sum(!is.na(focal_tip_states)),length(focal_tip_states))
		if(method=="Mk"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s (%s) (%s)..\n", method, rate_model, tip_state_comment))
			hsp = castor::hsp_mk_model(	focal_tree,
										focal_tip_states,
										Nstates            = mapping$Nstates,
										tip_priors         = NULL,
										rate_model         = rate_model,
										transition_matrix  = NULL,
										root_prior         = "empirical",
										Ntrials            = max(10,NUMBER_OF_PARALLEL_THREADS),
										optim_algorithm    = "nlminb",
										store_exponentials = TRUE,
										check_input        = TRUE,
										Nthreads           = NUMBER_OF_PARALLEL_THREADS);
			method_summary = sprintf("HSP via Mk rerooting, with rate model '%s'. Log-likelihood = %.10g",rate_model,hsp$loglikelihood)
			method_tag     = sprintf("%s_%s",method,rate_model)
		}else if(method=="MPR"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s %s (%s)..\n",method, rate_model, tip_state_comment))
			hsp = castor::hsp_max_parsimony(focal_tree,
											focal_tip_states,
											Nstates             = mapping$Nstates,
											transition_costs    = rate_model,
											edge_exponent       = 0.0,
											weight_by_scenarios = TRUE,
											check_input         = TRUE);
			method_summary = sprintf("HSP via maximum parsimony, transition costs = '%s'",rate_model)
			method_tag     = sprintf("%s_%s",method,rate_model)
		}else if(method=="WSCP"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s (%s)..\n",method,tip_state_comment))
			hsp = castor::hsp_squared_change_parsimony(focal_tree, tip_states = mapping$state_values[focal_tip_states], weighted = TRUE, check_input = TRUE);
			method_summary = sprintf("HSP via weighted squared-change parsimony. total_sum_of_squared_changes = %.10g",hsp$total_sum_of_squared_changes)
			method_tag     = method
		}else if(method=="SA"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s (%s)..\n",method,tip_state_comment))
			hsp = castor::hsp_subtree_averaging(focal_tree, tip_states = mapping$state_values[focal_tip_states], check_input = TRUE);
			method_summary = sprintf("HSP via subtree averaging")
			method_tag     = method
		}else if(method=="EP"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s (%s)..\n",method,tip_state_comment))
			hsp = castor::hsp_empirical_probabilities(focal_tree, tip_states = focal_tip_states, Nstates = mapping$Nstates, check_input = TRUE);
			method_summary = sprintf("HSP via empirical probabilities")
			method_tag     = method
		}else if(method=="PIC"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s (%s)..\n",method,tip_state_comment))
			hsp = castor::hsp_independent_contrasts(focal_tree, tip_states = mapping$state_values[focal_tip_states], weighted = TRUE, check_input = TRUE);
			method_summary = sprintf("HSP via phylogenetic independent contrasts. total_sum_of_squared_changes = %.10g",hsp$total_sum_of_squared_changes)
			method_tag     = method
		}else if(method=="ESP"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s (%s)..\n",method,tip_state_comment))
			hsp = castor::hsp_empirical_probabilities(focal_tree, focal_tip_states, Nstates = mapping$Nstates, check_input = TRUE);
			method_summary = sprintf("HSP via empirical state probabilities")
			method_tag     = method
		}else{
			stop(sprintf("ERROR: Unknown HSP method '%s'",method))
		}
		if((!is.null(hsp$success)) && (!hsp$success)) stop(sprintf("ERROR: HSP failed for method '%s': %s",method_tag,hsp$error))

		# convert estimated states (indices) to state_values (numeric values)
		if(method %in% c("WSCP","PIC","SA")){
			estimated_state_values = round(hsp$states);	# round estimated numeric values to nearest integer (since fractional trait values don't make sense for 16S GCNs)
		}else{
			# find maximum likelihood states based on posterior probabilities
			estimated_state_values = mapping$state_values[max.col(hsp$likelihoods, ties.method="first")]	# map max-likelihood states back to numeric value
		}
		return(list(method_summary         = method_summary,
					estimated_state_values = estimated_state_values,
					transition_matrix      = hsp$transition_matrix,
					loglikelihood          = hsp$loglikelihood,	# NULL for non-Mk methods; needed below when saving the fitted Mk model
					method_tag             = method_tag));
	}

	# calculate expected FCPcv if using the most-frequent trait values as an estimate for all tips
	most_frequent_state = as.integer(names(sort(table(mapping$mapped_states),decreasing=TRUE))[1])
	FCP_baseline = mean(mapping$mapped_states == most_frequent_state)
	cat(sprintf("Baseline FCP (if using most-frequent trait value as an estimator) = %.5g\n",FCP_baseline))

	# prepare data structures for cross-validation
	if(HSP_INCLUDE_CROSS_VALIDATION){
		R2cv_per_method  = rep(NA, length(METHODS))
		FCPcv_per_method = rep(NA, length(METHODS))
		NSTD_step  = (HSP_NSTD_MAX-HSP_NSTD_MIN)/(HSP_NSTD_BIN_COUNT-1);
		NSTD_lbins = HSP_NSTD_MIN + NSTD_step * (0:(HSP_NSTD_BIN_COUNT-1));
	}

	# perform HSP using various methods
	method_tags = rep(NA, length(METHODS))
	for(m in 1:length(METHODS)){
		method     = METHODS[m];
		rate_model = RATE_MODELS[m];

		# perform a single HSP using the full information available
		hsp = estimate_all_state_values(method, rate_model, tree, tip_states, verbose=TRUE)
		method_tags[m] = hsp$method_tag

		# save fitted transition matrix to file
		if(method=="Mk"){
			cat(sprintf(" Saving ML-fitted transition matrix..\n"))
			output_table=sprintf("output/HSP_%s_transition_matrix.tsv",hsp$method_tag)
			check_output_file(output_table,TRUE,TRUE," ")
			rownames(hsp$transition_matrix) = mapping$state_names;
			colnames(hsp$transition_matrix) = mapping$state_names;
			cat(sprintf("# Estimation of Markov transition rate matrix (as part of hidden state prediction) of trait '%s' in phylogenetic tree: %s\n# %d out of %d tips had known state to begin with\n# Rate model: %s\n# Log-likelihood: %.10g\n# Entry in row i and column j is transition rate i-->j\n# state\t%s\n",trait_name,INPUT_TREE,length(known_tip_names),Ntips,rate_model,hsp$loglikelihood,paste(mapping$state_names,collapse="\t")), file=output_table, append=FALSE)
			write.table(x=hsp$transition_matrix, file=output_table, append=TRUE, sep="\t", row.names=TRUE, col.names=FALSE, quote=FALSE);
		}

		# save estimated tip states to file
		cat(sprintf(" Saving estimated states for all tips to table..\n"))
		output_table=sprintf("output/HSP_%s_for_all_tips.tsv",hsp$method_tag)
		check_output_file(output_table,TRUE,TRUE," ")
		estimated_tip_state_values = matrix(hsp$estimated_state_values[1:Ntips], ncol=1);
		rownames(estimated_tip_state_values) = tree$tip.label;
		cat(sprintf("# Hidden state prediction of trait '%s' on tips in phylogenetic tree: %s\n# %d out of %d tips had known state to begin with\n# %s\n# OTU\t%s\n",trait_name,INPUT_TREE,length(known_tip_names),Ntips,hsp$method_summary,trait_name), file=output_table, append=FALSE)
		write.table(x=estimated_tip_state_values, file=output_table, append=TRUE, sep="\t", row.names=TRUE, col.names=FALSE, quote=FALSE);

		# also save estimations together with NSTDs
		cat(sprintf(" Saving estimated states + NSTDs for all tips to table..\n"))
		NSTDs = castor::find_nearest_tips(tree, only_descending_tips=FALSE, target_tips=known_tip2tip)$nearest_distance_per_tip
		output_table=sprintf("output/HSP_%s_and_NSTDs_for_all_tips.tsv",hsp$method_tag)
		check_output_file(output_table,TRUE,TRUE," ")
		estimated_tip_state_values = matrix(c(hsp$estimated_state_values[1:Ntips],NSTDs), ncol=2);
		rownames(estimated_tip_state_values) = tree$tip.label;
		cat(sprintf("# Hidden state prediction of trait '%s' on tips in phylogenetic tree: %s\n# %d out of %d tips had known state to begin with\n# %s\n# Mean NSTD = %g\n# OTU\t%s\tNSTD\n",trait_name,INPUT_TREE,length(known_tip_names),Ntips,hsp$method_summary,mean(NSTDs),trait_name), file=output_table, append=FALSE)
		write.table(x=estimated_tip_state_values, file=output_table, append=TRUE, sep="\t", row.names=TRUE, col.names=FALSE, quote=FALSE);

		# perform cross-validation if needed
		if(HSP_INCLUDE_CROSS_VALIDATION){
			R2cv  = 0.0;
			FCPcv = 0.0;
			R2cv_per_NSTD_bin  = rep(0, times=HSP_NSTD_BIN_COUNT);
			FCPcv_per_NSTD_bin = rep(0, times=HSP_NSTD_BIN_COUNT);
			N_per_NSTD_bin     = rep(0, times=HSP_NSTD_BIN_COUNT);
			trait_variance = population_variance(mapping$state_values[mapping$mapped_states]);
			for(r in 1:HSP_CROSS_VALIDATION_COUNT){
				cat(sprintf(" Performing cross-validation # %d..\n",r))
				# pick random tip test set, and hide known states
				if(HSP_CROSS_VALIDATION_CLUSTER_TEST_SET){
					# pick test OTUs non-independently (i.e. group at various radii)
					for(b in 1:HSP_NSTD_BIN_COUNT){
						# evaluate HSP accuracy for this clustering bin: pick training set to be beyond a certain distance from the test set
						cluster_radius = HSP_NSTD_MIN+NSTD_step*(b-1)
						test_set = sample.int(n=Nknown_tips, size=min(Nknown_tips-2,max(1,HSP_CROSS_VALIDATION_TEST_FRACTION * Nknown_tips)), replace=FALSE)
						dist_to_test_set = castor::find_nearest_tips(known_tree, only_descending_tips=FALSE, target_tips=test_set)$nearest_distance_per_tip
						training_set = which(dist_to_test_set>cluster_radius)	# only use tips further than the cluster_radius from the test_set, as training set
						#test_set = known_tip2tip[known_test_set]
						#dist_to_test_set = castor::find_nearest_tips(tree, only_descending_tips=FALSE, target_tips=test_set)$nearest_distance_per_tip
						#training_set = intersect(known_tip2tip, which(dist_to_test_set>cluster_radius))	# only use tips further than the cluster_radius from the test_set, as training set

						# perform HSP using training set
						sub_tip_states = rep(NA,Nknown_tips);
						sub_tip_states[training_set] = mapping$mapped_states[training_set];
						hsp = estimate_all_state_values(method, rate_model, known_tree, sub_tip_states, verbose=FALSE)
						# evaluate HSP accuracy w.r.t. test set
						estimated_test_values = hsp$estimated_state_values[test_set];
						true_test_values      = mapping$state_values[mapping$mapped_states[test_set]];
						R2cv_per_NSTD_bin[b]  = R2cv_per_NSTD_bin[b] + 1.0 - mean((estimated_test_values - true_test_values)**2)/trait_variance;
						FCPcv_per_NSTD_bin[b] = FCPcv_per_NSTD_bin[b] + mean(estimated_test_values == true_test_values);
						N_per_NSTD_bin[b]     = N_per_NSTD_bin[b] + length(test_set);
					}
				}else{
					# pick test OTUs randomly & independently
					test_set     = sample.int(n=Nknown_tips, size=min(Nknown_tips-2,max(1,HSP_CROSS_VALIDATION_TEST_FRACTION * Nknown_tips)), replace=FALSE)
					training_set = get_complement_of_integer_set(Nknown_tips,test_set)

					# perform HSP using training set
					sub_tip_states = rep(NA,Nknown_tips);
					sub_tip_states[training_set] = mapping$mapped_states[training_set];
					hsp = estimate_all_state_values(method, rate_model, known_tree, sub_tip_states, verbose=FALSE)

					# evaluate HSP accuracy w.r.t. test set
					estimated_test_values = hsp$estimated_state_values[test_set];
					true_test_values      = mapping$state_values[mapping$mapped_states[test_set]];
					FCP = mean(estimated_test_values == true_test_values);
					R2  = 1.0 - mean((estimated_test_values - true_test_values)**2)/trait_variance;
					cat(sprintf(" --> FCP = %.4g, R2 = %.4g\n",FCP,R2))
					FCPcv = FCPcv + FCP;
					R2cv  = R2cv + R2;

					# split test set into NSTD bins and calculate conditional accuracies (i.e. R2cv & FCPcv for each NSTD-bin)
					test_NSTDs = castor::find_nearest_tips(known_tree, only_descending_tips=FALSE, target_tips=training_set, as_edge_counts=FALSE, check_input=TRUE)$nearest_distance_per_tip[test_set]
					for(b in 1:HSP_NSTD_BIN_COUNT){
						bin_subset = which((test_NSTDs<HSP_NSTD_MIN+b*NSTD_step) & (test_NSTDs>=HSP_NSTD_MIN+(b-1)*NSTD_step))
						R2cv_per_NSTD_bin[b]  = R2cv_per_NSTD_bin[b] + 1.0 - mean((estimated_test_values[bin_subset] - true_test_values[bin_subset])**2)/trait_variance;
						FCPcv_per_NSTD_bin[b] = FCPcv_per_NSTD_bin[b] + mean(estimated_test_values[bin_subset] == true_test_values[bin_subset]);
						N_per_NSTD_bin[b]     = N_per_NSTD_bin[b] + length(bin_subset);
					}
				}
			}
			if(!HSP_CROSS_VALIDATION_CLUSTER_TEST_SET){
				R2cv_per_method[m]  = R2cv/HSP_CROSS_VALIDATION_COUNT;
				FCPcv_per_method[m] = FCPcv/HSP_CROSS_VALIDATION_COUNT;
			}
			R2cv_per_NSTD_bin  = R2cv_per_NSTD_bin/HSP_CROSS_VALIDATION_COUNT;
			FCPcv_per_NSTD_bin = FCPcv_per_NSTD_bin/HSP_CROSS_VALIDATION_COUNT;
			N_per_NSTD_bin     = N_per_NSTD_bin/HSP_CROSS_VALIDATION_COUNT;

			# save conditional accuracies over NSTD
			save_and_plot_stats_curve(	X               = NSTD_lbins,
										Y               = R2cv_per_NSTD_bin,
										N               = N_per_NSTD_bin,
										xlabel          = "NSTD",
										ylabel          = "R2cv",
										title           = sprintf("HSP R2cv over NSTD (%s)",method_tags[m]),
										output_basename = sprintf("HSP_%s_R2cv_over_NSTD",method_tags[m]),
										ylim            = NULL,
										as_barplot      = TRUE,
										Nmin            = 10,
										reference       = 0);
			save_and_plot_stats_curve(	X               = NSTD_lbins,
										Y               = FCPcv_per_NSTD_bin,
										N               = N_per_NSTD_bin,
										xlabel          = "NSTD",
										ylabel          = "FCPcv",
										title           = sprintf("HSP FCPcv over NSTD (%s)",method_tags[m]),
										output_basename = sprintf("HSP_%s_FCPcv_over_NSTD",method_tags[m]),
										ylim            = c(0,1),
										as_barplot      = TRUE,
										Nmin            = 10,
										reference       = FCP_baseline);
		}
	}
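
	# Summary of the cross-validation metrics used above: for each method,
	# R2cv = 1 - MSE(predicted vs. true test-tip values)/Var(known trait values) and
	# FCPcv = fraction of test tips whose value was predicted exactly, each averaged
	# over HSP_CROSS_VALIDATION_COUNT repeats; FCP_baseline is the FCP achieved by
	# always guessing the most frequent known trait value.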
	# save & plot cross-validation summaries (one R2cv per method, one FCPcv per method)
	if(HSP_INCLUDE_CROSS_VALIDATION && (!HSP_CROSS_VALIDATION_CLUSTER_TEST_SET)){
		cat(sprintf("Saving HSP cross-validation summaries for all methods..\n"))
		output_table=sprintf("output/HSP_cross_validation.tsv")
		check_output_file(output_table,TRUE,TRUE," ")
		cat(sprintf("# Cross-validation of hidden state prediction for trait '%s'\n# Number of cross-validation repeats: %d\n# CV test fraction: %g\n# Baseline FCP (if using most-frequent trait value as an estimator) = %.5g\n# method\tR2cv\tFCPcv\n",trait_name,HSP_CROSS_VALIDATION_COUNT,HSP_CROSS_VALIDATION_TEST_FRACTION,FCP_baseline), file=output_table, append=FALSE)
		write.table(x=data.frame(method_tags,R2cv_per_method,FCPcv_per_method), file=output_table, append=TRUE, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE);

		cat(sprintf("Plotting HSP cross-validation summaries..\n"))
		plot_file=sprintf("output/HSP_cross_validation.pdf")
		check_output_file(plot_file,TRUE,TRUE," ")
		pdf(file=plot_file, width=DEFAULT_PLOT_WIDTH, height=DEFAULT_PLOT_HEIGHT);
		metric_colors = c("#D6D6D6", "#919191")
		metric_names  = c("R2cv", "FCPcv")
		barplot(t(matrix(c(R2cv_per_method,FCPcv_per_method),ncol=2)), main="HSP cross-validation", xlab="method", ylab=paste(metric_names,collapse=" & "), ylim=c(0,1), names.arg=METHOD_TAGS_SHORT, border="black", col=metric_colors, beside=TRUE, las=1, cex.names = 0.75)
		legend('topleft', legend=metric_names, col=metric_colors, box.col="#404040", title.adj=c(0), xjust=0, pch=15, pt.cex=1.5)
		box()
		invisible(dev.off());
	}
}


###################################
# AUTOCORRELATION FUNCTION OF TRAIT

if(INCLUDE_ACF){
	cat(sprintf("Calculating ACF and MRD of trait '%s'..\n",trait_name))
	acf_results = castor::get_trait_acf(known_tree, known_tip_states, Npairs=1e8, Nbins=100)

	# save stats to file
	cat(sprintf("Saving autocorrelation function to TSV file..\n"))
	output_path = sprintf("output/ACF.tsv")
	check_output_file(output_path,TRUE,TRUE," ");
	cat(sprintf("# Phylogenetic autocorrelation function (ACF) and mean absolute difference (MAD), of '%s' across tips\n# ACF(x) = correlation between the states of two random tips at distance x from each other\n# distance\tautocorrelation\tmean_absolute_deviation\tmean_relative_deviation\tNpairs\n",trait_name), file=output_path, append=FALSE);
	write.table(x=data.frame(acf_results$distances, acf_results$autocorrelations, acf_results$mean_abs_differences, acf_results$mean_rel_differences, acf_results$Npairs_per_distance), file=output_path, append=TRUE, sep="\t", row.names=FALSE, col.names=FALSE, quote = FALSE);

	# plot ACF
	cat(sprintf("Plotting ACF..\n"))
	plot_file=sprintf("output/ACF.pdf")
	check_output_file(plot_file,TRUE,TRUE," ")
	pdf(file=plot_file, width=DEFAULT_PLOT_WIDTH, height=DEFAULT_PLOT_HEIGHT);
	par(las=1);
	plot(x=100*acf_results$distances, y=acf_results$autocorrelations, type="l", col="black", pch=1, cex=0.6, las=1, lwd=1.5, xlab="phylogenetic distance (% substitutions per site)", ylab="autocorrelation", xlim=c(0,80), ylim=c(-0.1,max(acf_results$autocorrelations[acf_results$distances<=1],na.rm=TRUE)));
	invisible(dev.off());

	# plot MAD
	cat(sprintf("Plotting MAD..\n"))
	plot_file=sprintf("output/mean_abs_differences.pdf")
	check_output_file(plot_file,TRUE,TRUE," ")
	pdf(file=plot_file, width=DEFAULT_PLOT_WIDTH, height=DEFAULT_PLOT_HEIGHT);
	par(las=1);
	plot(x=100*acf_results$distances, y=acf_results$mean_abs_differences, type="l", col="black", pch=1, cex=0.6, las=1, lwd=1.5, xlab="phylogenetic distance (% substitutions per site)", ylab="mean abs difference", xlim=c(0,80), ylim=c(-0.1,max(acf_results$mean_abs_differences[acf_results$distances<=1],na.rm=TRUE)));
	invisible(dev.off());

	# plot MRD
	cat(sprintf("Plotting MRD..\n"))
	plot_file=sprintf("output/mean_rel_differences.pdf")
	check_output_file(plot_file,TRUE,TRUE," ")
	pdf(file=plot_file, width=DEFAULT_PLOT_WIDTH, height=DEFAULT_PLOT_HEIGHT);
	par(las=1);
	plot(x=100*acf_results$distances, y=acf_results$mean_rel_differences, type="l", col="black", pch=1, cex=0.6, las=1, lwd=1.5, xlab="phylogenetic distance (% substitutions per site)", ylab="mean rel difference", xlim=c(0,80), ylim=c(-0.1,max(acf_results$mean_rel_differences[acf_results$distances<=1],na.rm=TRUE)));
	invisible(dev.off());
}
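
# Note: the NSTD of a tip is its phylogenetic (branch-length) distance to the nearest
# tip with a known trait value, i.e. in the same units as the tree's edge lengths
# (expected substitutions per 16S site for the SILVA tree used here).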

###################################
# NEAREST SEQUENCED TAXON INDEX

if(INCLUDE_NSTD){
	cat(sprintf("Calculating nearest sequenced taxon indices (NSTD) w.r.t. OTUs with known %s..\n",trait_name))
	target_tips = match(known_tip_names, tree$tip.label);
	results = castor::find_nearest_tips(tree, only_descending_tips=FALSE, target_tips=target_tips, as_edge_counts=FALSE, check_input=TRUE);
	distances = results$nearest_distance_per_tip
	mean_distance  = mean(distances);
	std_distance   = sd(distances);
	fraction_gt_03 = mean(distances>0.3)
	fraction_gt_04 = mean(distances>0.4)

	# save distances to file
	cat(sprintf("Saving NSTDs to TSV file..\n"))
	output_path = sprintf("output/NSTD.tsv")
	check_output_file(output_path,TRUE,TRUE," ");
	cat(sprintf("# Nearest Sequenced Taxon Indices for OTUs in the SILVA tree, w.r.t. tips with known %s\n# Fraction of OTUs with NSTD>0.3: %g\n# Fraction of OTUs with NSTD>0.4: %g\n# OTU\tnearest_OTU_with_known_%s\tnearest_distance\n",trait_name,fraction_gt_03,fraction_gt_04,trait_name), file=output_path, append=FALSE);
	write.table(x=data.frame(tree$tip.label, tree$tip.label[results$nearest_tip_per_tip], distances), file=output_path, append=TRUE, sep="\t", row.names=FALSE, col.names=FALSE, quote = FALSE);

	# histogram of distances
	cat(sprintf("Plotting histogram of NSTDs..\n"))
	plot_file=sprintf("output/NSTD_histogram.pdf")
	check_output_file(plot_file,TRUE,TRUE," ")
	pdf(file=plot_file, width=DEFAULT_PLOT_WIDTH, height=DEFAULT_PLOT_HEIGHT);
	hist(100*distances[distances<0.8 & distances>0], breaks=50, main=sprintf("NSTDs of OTUs with unknown %s",trait_name), xlab="NSTD (% subst. per site)", ylab="", border="black", col="grey", las=1, prob = FALSE);
	box();
	invisible(dev.off());
}


##########################################################
# EMPIRICAL STATE PROBABILITIES AT COARSE TREE RESOLUTIONS

if(INCLUDE_COARSE_ESP){
	# NOTE: this section additionally requires the 'phytools' package, which is not listed in REQUIRED_PACKAGES above
	mapping = castor::map_to_state_space(known_tip_states, fill_gaps=FALSE, sort_order="natural", include_state_values=TRUE);
	cat(sprintf("Calculating ancestral empirical state probabilities of trait '%s'..\n",trait_name))
	ancestral_likelihoods = castor::asr_empirical_probabilities(known_tree, mapping$mapped_states, Nstates=mapping$Nstates, probabilities=TRUE, check_input=TRUE)$ancestral_likelihoods

	# create likelihoods table for all clades (tips & nodes) in known_tree
	likelihoods = matrix(0, ncol=mapping$Nstates, nrow=(Nknown_tips+Nknown_nodes));
	likelihoods[cbind(1:Nknown_tips,mapping$mapped_states)] = 1
	likelihoods[(Nknown_tips+1):(Nknown_tips+Nknown_nodes),] = ancestral_likelihoods;

	# save ancestral ESPs to file
	cat(sprintf("Saving ancestral empirical state probabilities to TSV file..\n"))
	output_path = sprintf("output/ancestral_empirical_probabilities.tsv")
	check_output_file(output_path,TRUE,TRUE," ");
	cat(sprintf("# Ancestral empirical state probabilities of '%s' across nodes\n# node\t%s\n",trait_name,paste(mapping$state_names,collapse="\t")), file=output_path, append=FALSE);
	write.table(x=data.frame(known_tree$node.label,ancestral_likelihoods), file=output_path, append=TRUE, sep="\t", row.names=FALSE, col.names=FALSE, quote = FALSE);

	# create coarse version of tree
	resolution = 2;
	cat(sprintf("Coarsening known tree at resolution %g..\n",resolution))
	coarsening = castor::collapse_tree_at_resolution(known_tree, resolution=resolution, by_edge_count=FALSE);
	coarse_known_tree = coarsening$collapsed_tree;
	NCtips  = length(coarse_known_tree$tip.label)
	NCnodes = coarse_known_tree$Nnode;
	coarse_likelihoods = likelihoods[coarsening$new2old_clade,]
	cat(sprintf(" --> Coarsened tree has %d nodes and %d tips\n",NCnodes, NCtips))

	# plot coarse tree
	cat(sprintf("Plotting coarsened known_tree with empirical %s probabilities..\n",trait_name));
	output_path = sprintf("output/coarse_tree_empirical_probabilities.pdf")
	check_output_file(output_path,TRUE,TRUE," ");
	pdf(file=output_path, width=TREE_PLOTS_INCH_PER_CHAR*max(sapply(coarse_known_tree$tip.label,nchar))+TREE_PLOTS_INCH_PER_LEVEL*max(ape::node.depth(phy=coarse_known_tree, method=2)), height=0.22*NCtips);
	phytools::plotTree(coarse_known_tree, type = "phylogram", cex=0.4, fsize=0.8, lwd=0.5, edge.color="grey", offset=0.6)
	title(main=sprintf("Coarsened tree (%d tips) with empirical state probabilities for '%s'",NCtips,trait_name), line=-3)
	ape::add.scale.bar(x=0,y=-2)
	trait_colors = get_trait_colors(mapping$Nstates)
	ape::nodelabels(pie=coarse_likelihoods[(NCtips+1):(NCtips+NCnodes),], piecol=trait_colors, cex=0.2)
	ape::tiplabels(pie=coarse_likelihoods[1:NCtips,], piecol=trait_colors, cex=0.12, frame="none")
	legend("topright", legend=mapping$state_names, col=trait_colors, pch=16, box.col="#404040", pt.cex=1.5, y.intersp=0.8, title="Trait color codes", title.adj=c(0), xjust=0)
	invisible(dev.off());
}
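
# Example of downstream use (hypothetical, not executed by this script): the per-tip
# HSP tables written above can be re-loaded for further analysis, e.g. for the
# subtree-averaging (SA) method:
#	gcn_estimates = read.table("output/HSP_SA_for_all_tips.tsv", sep="\t", comment.char="#", row.names=1, stringsAsFactors=FALSE)
#	head(gcn_estimates)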