#!/usr/local/bin/Rscript
#
# This R script analyzes the phylogenetic distribution of 16S GCNs and performs hidden state prediction of 16S GCNs on the SILVA phylogenetic tree.
# Note that the script will automatically attempt to install any required packages.
# Tested on Mac OS X 10.12, using R 3.4.0
# Required external files: See the global variables "INPUT_TREE" and "INPUT_TRAIT_TABLE" below.
# All output will be automatically saved to the directory "output". Existing files will be overwritten without warning.
# Please also consult the copyright notice and disclaimer below.
#
# This is supplementary code for the following paper:
#   Stilianos Louca, Michael Doebeli and Laura Parfrey. Correcting for 16S rRNA gene copy numbers in microbiome surveys remains an unsolved problem (in review as of September 2017)
#
# Execute this script in your terminal as:
#   Rscript reconstruct_16S_gene_counts.R
#
#
###################################
# Copyright (c) 2017, Stilianos Louca
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in
#       the documentation and/or other materials provided with the distribution.
#     * Neither the name of the original author (Stilianos Louca), nor the names
#       of any other contributors may be used to endorse or promote products derived
#       from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
###################################
#
# Stilianos Louca
# September 20, 2017

###################################
# OPTIONS

INPUT_TREE="input/SILVA_NR99_FastTree_16S_free_living.tre"	# SILVA phylogenetic tree in Newick format, with tips labelled according to SILVA OTU IDs
INPUT_TRAIT_TABLE="input/16S_counts_per_SILVA_core.tsv"		# table listing 16S GCNs for a subset of SILVA OTUs
TRAIT_TABLE_TIP_NAMES_IN_COLUMN="OTU"	# column name listing tip labels in trait table
TRAIT_TABLE_FOCAL_TRAIT_COLUMN=2		# column index (1-based) of focal trait in INPUT_TRAIT_TABLE (also counting the row-names column)
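
# For orientation, a minimal sketch of the layout expected for INPUT_TRAIT_TABLE
# (tab-separated; the OTU IDs, the trait column name and the counts below are purely
# hypothetical examples, not actual data):
#
#   OTU	16S_gene_count
#   SOME_SILVA_OTU_A	1
#   SOME_SILVA_OTU_B	4
#
# Rows whose OTU is not a tip label in INPUT_TREE are discarded further below, and the
# focal trait column is selected via TRAIT_TABLE_FOCAL_TRAIT_COLUMN.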
TREE_EDGE_LENGTH_EPSILON=NULL	# "tiny" edge length to use as a substitute for zero-length edges. Set to NULL to adjust automatically
REQUIRED_PACKAGES=c("castor", "ape", "rncl")
NUMBER_OF_PARALLEL_THREADS=22

HSP_INCLUDE_CROSS_VALIDATION=TRUE
HSP_CROSS_VALIDATION_COUNT=50
HSP_CROSS_VALIDATION_TEST_FRACTION=0.02	# fraction of known_tips to be used as a test set
HSP_CROSS_VALIDATION_CLUSTER_TEST_SET=TRUE	# if TRUE, then the training set is guaranteed to have certain distances from the test set. If FALSE, then the test set is chosen randomly, the training set is the full complement, and test OTUs are then binned into NSTD intervals

# NSTD bins for HSP
HSP_NSTD_BIN_COUNT = 8
HSP_NSTD_MIN = 0;
HSP_NSTD_MAX = (HSP_NSTD_BIN_COUNT-1)*0.1;

TREE_PLOTS_DPI=72
TREE_PLOTS_INCH_PER_CHAR=0.05 * 72/TREE_PLOTS_DPI
TREE_PLOTS_INCH_PER_LEVEL=0.3 * 72/TREE_PLOTS_DPI
DEFAULT_PLOT_WIDTH=4
DEFAULT_PLOT_HEIGHT=4

INCLUDE_HSP=TRUE
INCLUDE_ACF=TRUE
INCLUDE_NSTD=TRUE
INCLUDE_COARSE_ESP=FALSE


###############################
# AUXILIARY FUNCTIONS & MACROS

options(expressions=100000)	# increase recursion depth

# print warnings as they occur
options(warn=1)


check_output_file = function(file_path, force_replace, verbose, verbose_prefix){
	if(file.exists(file_path)){
		if(force_replace){
			cat(sprintf("%sNote: Replacing output file '%s'.\n",verbose_prefix,file_path))
			file.remove(file_path);
		}else{
			stop(sprintf("Output file '%s' already exists. Cowardly refusing to continue.",file_path), call.=FALSE)
		}
	}
	dir.create(dirname(file_path), showWarnings = FALSE, recursive=TRUE);
}


get_trait_colors = function(N){
	if(N<=3){
		trait_colors = c("red","green","blue")[1:N]
	}else if(N<=6){
		trait_colors = c("red","orange","yellow","green","blue","purple")[1:N]
	}else{
		trait_colors = colorRampPalette(c("red","orange","yellow","green","blue","purple"), bias=1, space="rgb", interpolate="linear")(N)
	}
	return(trait_colors);
}


population_variance = function(X, na.rm=FALSE){
	if(na.rm) X = X[!is.na(X)];
	return(sum((X-mean(X))**2)/length(X));
}


get_complement_of_integer_set = function(N, set){
	keep = rep(TRUE,times=N)
	keep[set] = FALSE
	return(which(keep))
}


save_and_plot_stats_curve = function(	X,
										Y,
										N,
										xlabel,
										ylabel,
										title,
										output_basename,
										ylim,
										as_barplot,		# if false, stats are plotted as a curve
										Nmin,
										reference){		# either NA, or a numeric value for drawing a horizontal reference line
	valids = which((!is.na(X)) & (!is.nan(X)) & (!is.na(Y)) & (!is.nan(Y)) & (!is.na(N)) & (!is.nan(N)) & (N>=Nmin))
	X = X[valids]
	Y = Y[valids]
	N = N[valids]
	if(is.null(ylim)) ylim = c(1.1*min(0,min(Y)),max(Y)*1.2)

	cat(sprintf("Saving %s..\n",title))
	output_path = sprintf("output/%s.tsv",output_basename)
	check_output_file(output_path,TRUE,TRUE," ");
	cat(sprintf("# %s\n# %s\t%s\tN\n",title,xlabel,ylabel), file=output_path, append=FALSE);
	write.table(x=data.frame(X,Y,N), file=output_path, append=TRUE, sep="\t", row.names=FALSE, col.names=FALSE, quote = FALSE);

	cat(sprintf("Plotting %s..\n",title))
	plot_file=sprintf("output/%s.pdf",output_basename)
	check_output_file(plot_file,TRUE,TRUE," ")
	pdf(file=plot_file, width=DEFAULT_PLOT_WIDTH, height=DEFAULT_PLOT_HEIGHT);
	if(as_barplot){
		barplot(Y, names.arg=X, main=title, xlab=xlabel, ylab=ylabel, ylim=ylim, border="black", col="#D6D6D6", las=1)
		box()
	}else{
		plot(x=X, y=Y, type="l", col="black", pch=1, cex=0.6, las=1, lwd=1.5, xlab=xlabel, ylab=ylabel, ylim=ylim, main=title);
	}
	if((!is.na(reference)) && (reference>ylim[1]) && (reference<ylim[2])) abline(h=reference, col="red", lty=2)
	invisible(dev.off());
}


###################################
# MAIN PROGRAM
# (the package-loading and tree-loading lines below are a minimal reconstruction; the corresponding lines were missing from this copy of the script)

# load required packages, installing them first if needed
for(required_package in REQUIRED_PACKAGES){
	if(!requireNamespace(required_package, quietly=TRUE)) install.packages(required_package)
	library(required_package, character.only=TRUE)
}

# load the SILVA tree
cat(sprintf("Loading tree..\n"))
tree   = rncl::read_newick_phylo(file=INPUT_TREE)
Ntips  = length(tree$tip.label)
Nnodes = tree$Nnode
cat(sprintf(" --> Tree has %d nodes, %d tips and %d edges\n",Nnodes,Ntips,nrow(tree$edge)));

# create internal node labels if needed
if(is.null(tree$node.label)){
	cat(sprintf("Adding node labels to full tree..\n"))
	tree$node.label = paste("node.", 1:Nnodes, sep = "")	# don't use underscores, because some tree readers (e.g. rncl) interpret them as spaces
}

# avoid zero-length edges
if(any(tree$edge.length==0)){
	if(is.null(TREE_EDGE_LENGTH_EPSILON)) TREE_EDGE_LENGTH_EPSILON = 0.1*min(tree$edge.length[tree$edge.length>0])
	cat(sprintf("Note: Some edges have length zero, which may break some of the HSP routines. Replacing zero-lengths with a tiny positive length (%g)..\n",TREE_EDGE_LENGTH_EPSILON))
	tree$edge.length[tree$edge.length==0] = TREE_EDGE_LENGTH_EPSILON
}
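
# Optional sanity check (not part of the original pipeline): after the zero-length
# substitution, all edge lengths should be strictly positive. To verify, one could
# uncomment the following line:
#	stopifnot(all(tree$edge.length>0))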

##############################
# LOAD TRAIT TABLE

cat(sprintf("Reading trait table..\n"))
known_trait_table = read.table(	file             = INPUT_TRAIT_TABLE,
								header           = TRUE,
								sep              = "\t",
								quote            = "\"",
								dec              = ".",
								row.names        = TRAIT_TABLE_TIP_NAMES_IN_COLUMN,
								strip.white      = TRUE,
								blank.lines.skip = TRUE,
								na.strings       = "NA",
								check.names      = FALSE,
								comment.char     = "#",
								colClasses       = "character",
								stringsAsFactors = TRUE);

# filter out rows not corresponding to tips in the full tree
row2tip = match(rownames(known_trait_table), tree$tip.label)
known_trait_table = known_trait_table[!is.na(row2tip),,drop=FALSE]
raw_known_tip_states = known_trait_table[,TRAIT_TABLE_FOCAL_TRAIT_COLUMN-1,drop=TRUE];
trait_name = colnames(known_trait_table)[TRAIT_TABLE_FOCAL_TRAIT_COLUMN-1]
cat(sprintf(" --> Loaded %d tip states (%d unique states) for trait '%s'\n",nrow(known_trait_table),length(unique(raw_known_tip_states)),trait_name));

# get subtree with known tip states
cat(sprintf("Extracting subtree comprising tips with known state..\n"))
known_tree = castor::get_subtree_with_tips(tree, omit_tips=which(is.na(match(tree$tip.label,rownames(known_trait_table)))), collapse_monofurcations=TRUE, force_keep_root=TRUE)$subtree;
cat(sprintf(" --> known_tree has %d nodes, %d tips and %d edges\n",known_tree$Nnode,length(known_tree$tip.label),nrow(known_tree$edge)))

# synchronize tip_states & known_tree & tree
cat(sprintf("Synchronizing tree & known sub-tree & states..\n"))
Nknown_tips     = length(known_tree$tip.label)
Nknown_nodes    = known_tree$Nnode;
known_tip_names = known_tree$tip.label
known_tip2tip   = match(known_tip_names, tree$tip.label)
known_tip2row_in_known_trait_table = match(known_tip_names, rownames(known_trait_table))
known_tip_states = as.numeric(raw_known_tip_states[known_tip2row_in_known_trait_table])
remove(known_trait_table)
remove(raw_known_tip_states)
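
# Optional sanity check (not part of the original pipeline): report how sparsely the
# known 16S GCNs cover the full tree, e.g. by uncommenting:
#	cat(sprintf("Known %s available for %d out of %d tree tips (%.3g%%)\n",trait_name,Nknown_tips,Ntips,100*Nknown_tips/Ntips))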

###################################
# HIDDEN STATE PREDICTION FOR TRAIT

if(INCLUDE_HSP){
	mapping = castor::map_to_state_space(known_tip_states, fill_gaps=FALSE, sort_order="natural", include_state_values=TRUE);
	tip_states = setNames(rep(NA, times=Ntips), tree$tip.label)
	tip_states[known_tip2tip] = mapping$mapped_states;
	if(Nknown_tips<=3) HSP_INCLUDE_CROSS_VALIDATION = FALSE

	# the following arrays must be synchronized
	METHODS           = c("SA", "EP", "PIC", "WSCP", "MPR", "MPR", "MPR", "Mk", "Mk")
	RATE_MODELS       = c("", "", "", "", "exponential", "proportional", "all_equal", "ER", "SUEDE")
	METHOD_TAGS_SHORT = c("SA", "EP", "PIC", "WSCP", "MPRex", "MPRpr", "MPRae", "Mk_ER", "Mk_SUEDE")
	METHOD_INCLUDE    = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE)

	# filter methods
	METHODS           = METHODS[METHOD_INCLUDE]
	RATE_MODELS       = RATE_MODELS[METHOD_INCLUDE]
	METHOD_TAGS_SHORT = METHOD_TAGS_SHORT[METHOD_INCLUDE]

	# define the workhorse function for HSP
	estimate_all_state_values = function(method, rate_model, focal_tree, focal_tip_states, verbose){
		transition_matrix = NULL
		if(verbose) tip_state_comment = sprintf("%d out of %d tips have known state",sum(!is.na(focal_tip_states)),length(focal_tip_states))
		if(method=="Mk"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s (%s) (%s)..\n", method, rate_model, tip_state_comment))
			hsp = castor::hsp_mk_model(	focal_tree,
										focal_tip_states,
										Nstates            = mapping$Nstates,
										tip_priors         = NULL,
										rate_model         = rate_model,
										transition_matrix  = NULL,
										root_prior         = "empirical",
										Ntrials            = max(10,NUMBER_OF_PARALLEL_THREADS),
										optim_algorithm    = "nlminb",
										store_exponentials = TRUE,
										check_input        = TRUE,
										Nthreads           = NUMBER_OF_PARALLEL_THREADS);
			method_summary = sprintf("HSP via Mk rerooting, with rate model '%s'. Log-likelihood = %.10g",rate_model,hsp$loglikelihood)
			method_tag     = sprintf("%s_%s",method,rate_model)
		}else if(method=="MPR"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s %s (%s)..\n",method, rate_model, tip_state_comment))
			hsp = castor::hsp_max_parsimony(focal_tree,
											focal_tip_states,
											Nstates             = mapping$Nstates,
											transition_costs    = rate_model,
											edge_exponent       = 0.0,
											weight_by_scenarios = TRUE,
											check_input         = TRUE);
			method_summary = sprintf("HSP via maximum parsimony, transition costs = '%s'",rate_model)
			method_tag     = sprintf("%s_%s",method,rate_model)
		}else if(method=="WSCP"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s (%s)..\n",method,tip_state_comment))
			hsp = castor::hsp_squared_change_parsimony(focal_tree, tip_states = mapping$state_values[focal_tip_states], weighted = TRUE, check_input = TRUE);
			method_summary = sprintf("HSP via weighted squared-change parsimony. total_sum_of_squared_changes = %.10g",hsp$total_sum_of_squared_changes)
			method_tag     = method
		}else if(method=="SA"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s (%s)..\n",method,tip_state_comment))
			hsp = castor::hsp_subtree_averaging(focal_tree, tip_states = mapping$state_values[focal_tip_states], check_input = TRUE);
			method_summary = sprintf("HSP via subtree averaging")
			method_tag     = method
		}else if(method=="EP"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s (%s)..\n",method,tip_state_comment))
			hsp = castor::hsp_empirical_probabilities(focal_tree, tip_states = focal_tip_states, Nstates = mapping$Nstates, check_input = TRUE);
			method_summary = sprintf("HSP via empirical probabilities")
			method_tag     = method
		}else if(method=="PIC"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s (%s)..\n",method,tip_state_comment))
			hsp = castor::hsp_independent_contrasts(focal_tree, tip_states = mapping$state_values[focal_tip_states], weighted = TRUE, check_input = TRUE);
			method_summary = sprintf("HSP via phylogenetic independent contrasts. total_sum_of_squared_changes = %.10g",hsp$total_sum_of_squared_changes)
			method_tag     = method
		}else if(method=="ESP"){
			if(verbose) cat(sprintf("Performing hidden state prediction on full tree via %s (%s)..\n",method,tip_state_comment))
			hsp = castor::hsp_empirical_probabilities(focal_tree, focal_tip_states, Nstates = mapping$Nstates, check_input = TRUE);
			method_summary = sprintf("HSP via empirical state probabilities")
			method_tag     = method
		}else{
			stop(sprintf("ERROR: Unknown HSP method '%s'",method))
		}
		if((!is.null(hsp$success)) && (!hsp$success)) stop(sprintf("ERROR: HSP failed for method '%s': %s",method_tag,hsp$error))

		# convert estimated states (indices) to state_values (numeric values)
		if(method %in% c("WSCP","PIC","SA")){
			estimated_state_values = round(hsp$states);	# round estimated numeric values to nearest integer (since fractional trait values don't make sense for 16S GCNs)
		}else{
			# find maximum likelihood states based on posterior probabilities
			estimated_state_values = mapping$state_values[max.col(hsp$likelihoods, ties.method="first")]	# map max-likelihood states back to numeric value
		}
		return(list(method_summary         = method_summary,
					estimated_state_values = estimated_state_values,
					transition_matrix      = hsp$transition_matrix,
					loglikelihood          = hsp$loglikelihood,	# NULL for non-Mk methods; needed below when saving the fitted Mk model
					method_tag             = method_tag));
	}

	# calculate expected FCPcv if using the most-frequent trait values as an estimate for all tips
	most_frequent_state = as.integer(names(sort(table(mapping$mapped_states),decreasing=TRUE))[1])
	FCP_baseline = mean(mapping$mapped_states == most_frequent_state)
	cat(sprintf("Baseline FCP (if using most-frequent trait value as an estimator) = %.5g\n",FCP_baseline))

	# prepare data structures for cross-validation
	if(HSP_INCLUDE_CROSS_VALIDATION){
		R2cv_per_method  = rep(NA, length(METHODS))
		FCPcv_per_method = rep(NA, length(METHODS))
		NSTD_step  = (HSP_NSTD_MAX-HSP_NSTD_MIN)/(HSP_NSTD_BIN_COUNT-1);
		NSTD_lbins = HSP_NSTD_MIN + NSTD_step * (0:(HSP_NSTD_BIN_COUNT-1));
	}

	# perform HSP using various methods
	method_tags = rep(NA, length(METHODS))
	for(m in 1:length(METHODS)){
		method     = METHODS[m];
		rate_model = RATE_MODELS[m];

		# perform a single HSP using the full information available
		hsp = estimate_all_state_values(method, rate_model, tree, tip_states, verbose=TRUE)
		method_tags[m] = hsp$method_tag

		# save fitted transition matrix to file
		if(method=="Mk"){
			cat(sprintf(" Saving ML-fitted transition matrix..\n"))
			output_table=sprintf("output/HSP_%s_transition_matrix.tsv",hsp$method_tag)
			check_output_file(output_table,TRUE,TRUE," ")
			rownames(hsp$transition_matrix) = mapping$state_names;
			colnames(hsp$transition_matrix) = mapping$state_names;
			cat(sprintf("# Estimation of Markov transition rate matrix (as part of hidden state prediction) of trait '%s' in phylogenetic tree: %s\n# %d out of %d tips had known state to begin with\n# Rate model: %s\n# Log-likelihood: %.10g\n# Entry in row i and column j is transition rate i-->j\n# state\t%s\n",trait_name,INPUT_TREE,length(known_tip_names),Ntips,rate_model,hsp$loglikelihood,paste(mapping$state_names,collapse="\t")), file=output_table, append=FALSE)
			write.table(x=hsp$transition_matrix, file=output_table, append=TRUE, sep="\t", row.names=TRUE, col.names=FALSE, quote=FALSE);
		}

		# save estimated tip states to file
		cat(sprintf(" Saving estimated states for all tips to table..\n"))
		output_table=sprintf("output/HSP_%s_for_all_tips.tsv",hsp$method_tag)
		check_output_file(output_table,TRUE,TRUE," ")
		estimated_tip_state_values = matrix(hsp$estimated_state_values[1:Ntips], ncol=1);
		rownames(estimated_tip_state_values) = tree$tip.label;
		cat(sprintf("# Hidden state prediction of trait '%s' on tips in phylogenetic tree: %s\n# %d out of %d tips had known state to begin with\n# %s\n# OTU\t%s\n",trait_name,INPUT_TREE,length(known_tip_names),Ntips,hsp$method_summary,trait_name), file=output_table, append=FALSE)
		write.table(x=estimated_tip_state_values, file=output_table, append=TRUE, sep="\t", row.names=TRUE, col.names=FALSE, quote=FALSE);

		# also save estimations together with NSTDs
		cat(sprintf(" Saving estimated states + NSTDs for all tips to table..\n"))
		NSTDs = castor::find_nearest_tips(tree, only_descending_tips=FALSE, target_tips=known_tip2tip)$nearest_distance_per_tip
		output_table=sprintf("output/HSP_%s_and_NSTDs_for_all_tips.tsv",hsp$method_tag)
		check_output_file(output_table,TRUE,TRUE," ")
		estimated_tip_state_values = matrix(c(hsp$estimated_state_values[1:Ntips],NSTDs), ncol=2);
		rownames(estimated_tip_state_values) = tree$tip.label;
		cat(sprintf("# Hidden state prediction of trait '%s' on tips in phylogenetic tree: %s\n# %d out of %d tips had known state to begin with\n# %s\n# Mean NSTD = %g\n# OTU\t%s\tNSTD\n",trait_name,INPUT_TREE,length(known_tip_names),Ntips,hsp$method_summary,mean(NSTDs),trait_name), file=output_table, append=FALSE)
		write.table(x=estimated_tip_state_values, file=output_table, append=TRUE, sep="\t", row.names=TRUE, col.names=FALSE, quote=FALSE);

		# perform cross-validation if needed
		if(HSP_INCLUDE_CROSS_VALIDATION){
			R2cv  = 0.0;
			FCPcv = 0.0;
			R2cv_per_NSTD_bin  = rep(0, times=HSP_NSTD_BIN_COUNT);
			FCPcv_per_NSTD_bin = rep(0, times=HSP_NSTD_BIN_COUNT);
			N_per_NSTD_bin     = rep(0, times=HSP_NSTD_BIN_COUNT);
			trait_variance = population_variance(mapping$state_values[mapping$mapped_states]);
			for(r in 1:HSP_CROSS_VALIDATION_COUNT){
				cat(sprintf(" Performing cross-validation # %d..\n",r))
				# pick random tip test set, and hide known states
				if(HSP_CROSS_VALIDATION_CLUSTER_TEST_SET){
					# pick test OTUs non-independently (i.e. group at various radii)
					for(b in 1:HSP_NSTD_BIN_COUNT){
						# evaluate HSP accuracy for this clustering bin: pick training set to be beyond a certain distance from the test set
						cluster_radius = HSP_NSTD_MIN+NSTD_step*(b-1)
						test_set = sample.int(n=Nknown_tips, size=min(Nknown_tips-2,max(1,HSP_CROSS_VALIDATION_TEST_FRACTION * Nknown_tips)), replace=FALSE)
						dist_to_test_set = castor::find_nearest_tips(known_tree, only_descending_tips=FALSE, target_tips=test_set)$nearest_distance_per_tip
						training_set = which(dist_to_test_set>cluster_radius)	# only use tips further than the cluster_radius from the test_set, as training set
						#test_set = known_tip2tip[known_test_set]
						#dist_to_test_set = castor::find_nearest_tips(tree, only_descending_tips=FALSE, target_tips=test_set)$nearest_distance_per_tip
						#training_set = intersect(known_tip2tip, which(dist_to_test_set>cluster_radius))	# only use tips further than the cluster_radius from the test_set, as training set

						# perform HSP using training set
						sub_tip_states = rep(NA,Nknown_tips);
						sub_tip_states[training_set] = mapping$mapped_states[training_set];
						hsp = estimate_all_state_values(method, rate_model, known_tree, sub_tip_states, verbose=FALSE)
						# evaluate HSP accuracy w.r.t. test set
						estimated_test_values = hsp$estimated_state_values[test_set];
						true_test_values      = mapping$state_values[mapping$mapped_states[test_set]];
						R2cv_per_NSTD_bin[b]  = R2cv_per_NSTD_bin[b] + 1.0 - mean((estimated_test_values - true_test_values)**2)/trait_variance;
						FCPcv_per_NSTD_bin[b] = FCPcv_per_NSTD_bin[b] + mean(estimated_test_values == true_test_values);
						N_per_NSTD_bin[b]     = N_per_NSTD_bin[b] + length(test_set);
					}
				}else{
					# pick test OTUs randomly & independently
					test_set     = sample.int(n=Nknown_tips, size=min(Nknown_tips-2,max(1,HSP_CROSS_VALIDATION_TEST_FRACTION * Nknown_tips)), replace=FALSE)
					training_set = get_complement_of_integer_set(Nknown_tips,test_set)

					# perform HSP using training set
					sub_tip_states = rep(NA,Nknown_tips);
					sub_tip_states[training_set] = mapping$mapped_states[training_set];
					hsp = estimate_all_state_values(method, rate_model, known_tree, sub_tip_states, verbose=FALSE)

					# evaluate HSP accuracy w.r.t. test set
					estimated_test_values = hsp$estimated_state_values[test_set];
					true_test_values      = mapping$state_values[mapping$mapped_states[test_set]];
					FCP = mean(estimated_test_values == true_test_values);
					R2  = 1.0 - mean((estimated_test_values - true_test_values)**2)/trait_variance;
					cat(sprintf(" --> FCP = %.4g, R2 = %.4g\n",FCP,R2))
					FCPcv = FCPcv + FCP;
					R2cv  = R2cv + R2;

					# split test set into NSTD bins and calculate conditional accuracies (i.e. R2cv & FCPcv for each NSTD-bin)
					test_NSTDs = castor::find_nearest_tips(known_tree, only_descending_tips=FALSE, target_tips=training_set, as_edge_counts=FALSE, check_input=TRUE)$nearest_distance_per_tip[test_set]
					for(b in 1:HSP_NSTD_BIN_COUNT){
						bin_subset = which((test_NSTDs<HSP_NSTD_MIN+b*NSTD_step) & (test_NSTDs>=HSP_NSTD_MIN+(b-1)*NSTD_step))
						R2cv_per_NSTD_bin[b]  = R2cv_per_NSTD_bin[b] + 1.0 - mean((estimated_test_values[bin_subset] - true_test_values[bin_subset])**2)/trait_variance;
						FCPcv_per_NSTD_bin[b] = FCPcv_per_NSTD_bin[b] + mean(estimated_test_values[bin_subset] == true_test_values[bin_subset]);
						N_per_NSTD_bin[b]     = N_per_NSTD_bin[b] + length(bin_subset);
					}
				}
			}
			if(!HSP_CROSS_VALIDATION_CLUSTER_TEST_SET){
				R2cv_per_method[m]  = R2cv/HSP_CROSS_VALIDATION_COUNT;
				FCPcv_per_method[m] = FCPcv/HSP_CROSS_VALIDATION_COUNT;
			}
			R2cv_per_NSTD_bin  = R2cv_per_NSTD_bin/HSP_CROSS_VALIDATION_COUNT;
			FCPcv_per_NSTD_bin = FCPcv_per_NSTD_bin/HSP_CROSS_VALIDATION_COUNT;
			N_per_NSTD_bin     = N_per_NSTD_bin/HSP_CROSS_VALIDATION_COUNT;

			# save conditional accuracies over NSTD
			save_and_plot_stats_curve(	X               = NSTD_lbins,
										Y               = R2cv_per_NSTD_bin,
										N               = N_per_NSTD_bin,
										xlabel          = "NSTD",
										ylabel          = "R2cv",
										title           = sprintf("HSP R2cv over NSTD (%s)",method_tags[m]),
										output_basename = sprintf("HSP_%s_R2cv_over_NSTD",method_tags[m]),
										ylim            = NULL,
										as_barplot      = TRUE,
										Nmin            = 10,
										reference       = 0);
			save_and_plot_stats_curve(	X               = NSTD_lbins,
										Y               = FCPcv_per_NSTD_bin,
										N               = N_per_NSTD_bin,
										xlabel          = "NSTD",
										ylabel          = "FCPcv",
										title           = sprintf("HSP FCPcv over NSTD (%s)",method_tags[m]),
										output_basename = sprintf("HSP_%s_FCPcv_over_NSTD",method_tags[m]),
										ylim            = c(0,1),
										as_barplot      = TRUE,
										Nmin            = 10,
										reference       = FCP_baseline);
		}
	}
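
	# Summary of the cross-validation metrics used above: for each method,
	# R2cv = 1 - MSE(predicted vs. true test-tip values)/Var(known trait values) and
	# FCPcv = fraction of test tips whose value was predicted exactly, each averaged
	# over HSP_CROSS_VALIDATION_COUNT repeats; FCP_baseline is the FCP achieved by
	# always guessing the most frequent known trait value.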
	# save & plot cross-validation summaries (one R2cv per method, one FCPcv per method)
	if(HSP_INCLUDE_CROSS_VALIDATION && (!HSP_CROSS_VALIDATION_CLUSTER_TEST_SET)){
		cat(sprintf("Saving HSP cross-validation summaries for all methods..\n"))
		output_table=sprintf("output/HSP_cross_validation.tsv")
		check_output_file(output_table,TRUE,TRUE," ")
		cat(sprintf("# Cross-validation of hidden state prediction for trait '%s'\n# Number of cross-validation repeats: %d\n# CV test fraction: %g\n# Baseline FCP (if using most-frequent trait value as an estimator) = %.5g\n# method\tR2cv\tFCPcv\n",trait_name,HSP_CROSS_VALIDATION_COUNT,HSP_CROSS_VALIDATION_TEST_FRACTION,FCP_baseline), file=output_table, append=FALSE)
		write.table(x=data.frame(method_tags,R2cv_per_method,FCPcv_per_method), file=output_table, append=TRUE, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE);

		cat(sprintf("Plotting HSP cross-validation summaries..\n"))
		plot_file=sprintf("output/HSP_cross_validation.pdf")
		check_output_file(plot_file,TRUE,TRUE," ")
		pdf(file=plot_file, width=DEFAULT_PLOT_WIDTH, height=DEFAULT_PLOT_HEIGHT);
		metric_colors = c("#D6D6D6", "#919191")
		metric_names  = c("R2cv", "FCPcv")
		barplot(t(matrix(c(R2cv_per_method,FCPcv_per_method),ncol=2)), main="HSP cross-validation", xlab="method", ylab=paste(metric_names,collapse=" & "), ylim=c(0,1), names.arg=METHOD_TAGS_SHORT, border="black", col=metric_colors, beside=TRUE, las=1, cex.names = 0.75)
		legend('topleft', legend=metric_names, col=metric_colors, box.col="#404040", title.adj=c(0), xjust=0, pch=15, pt.cex=1.5)
		box()
		invisible(dev.off());
	}
}


###################################
# AUTOCORRELATION FUNCTION OF TRAIT

if(INCLUDE_ACF){
	cat(sprintf("Calculating ACF and MRD of trait '%s'..\n",trait_name))
	acf_results = castor::get_trait_acf(known_tree, known_tip_states, Npairs=1e8, Nbins=100)

	# save stats to file
	cat(sprintf("Saving autocorrelation function to TSV file..\n"))
	output_path = sprintf("output/ACF.tsv")
	check_output_file(output_path,TRUE,TRUE," ");
	cat(sprintf("# Phylogenetic autocorrelation function (ACF) and mean absolute difference (MAD), of '%s' across tips\n# ACF(x) = correlation between the states of two random tips at distance x from each other\n# distance\tautocorrelation\tmean_absolute_deviation\tmean_relative_deviation\tNpairs\n",trait_name), file=output_path, append=FALSE);
	write.table(x=data.frame(acf_results$distances, acf_results$autocorrelations, acf_results$mean_abs_differences, acf_results$mean_rel_differences, acf_results$Npairs_per_distance), file=output_path, append=TRUE, sep="\t", row.names=FALSE, col.names=FALSE, quote = FALSE);

	# plot ACF
	cat(sprintf("Plotting ACF..\n"))
	plot_file=sprintf("output/ACF.pdf")
	check_output_file(plot_file,TRUE,TRUE," ")
	pdf(file=plot_file, width=DEFAULT_PLOT_WIDTH, height=DEFAULT_PLOT_HEIGHT);
	par(las=1);
	plot(x=100*acf_results$distances, y=acf_results$autocorrelations, type="l", col="black", pch=1, cex=0.6, las=1, lwd=1.5, xlab="phylogenetic distance (% substitutions per site)", ylab="autocorrelation", xlim=c(0,80), ylim=c(-0.1,max(acf_results$autocorrelations[acf_results$distances<=1],na.rm=TRUE)));
	invisible(dev.off());

	# plot MAD
	cat(sprintf("Plotting MAD..\n"))
	plot_file=sprintf("output/mean_abs_differences.pdf")
	check_output_file(plot_file,TRUE,TRUE," ")
	pdf(file=plot_file, width=DEFAULT_PLOT_WIDTH, height=DEFAULT_PLOT_HEIGHT);
	par(las=1);
	plot(x=100*acf_results$distances, y=acf_results$mean_abs_differences, type="l", col="black", pch=1, cex=0.6, las=1, lwd=1.5, xlab="phylogenetic distance (% substitutions per site)", ylab="mean abs difference", xlim=c(0,80), ylim=c(-0.1,max(acf_results$mean_abs_differences[acf_results$distances<=1],na.rm=TRUE)));
	invisible(dev.off());

	# plot MRD
	cat(sprintf("Plotting MRD..\n"))
	plot_file=sprintf("output/mean_rel_differences.pdf")
	check_output_file(plot_file,TRUE,TRUE," ")
	pdf(file=plot_file, width=DEFAULT_PLOT_WIDTH, height=DEFAULT_PLOT_HEIGHT);
	par(las=1);
	plot(x=100*acf_results$distances, y=acf_results$mean_rel_differences, type="l", col="black", pch=1, cex=0.6, las=1, lwd=1.5, xlab="phylogenetic distance (% substitutions per site)", ylab="mean rel difference", xlim=c(0,80), ylim=c(-0.1,max(acf_results$mean_rel_differences[acf_results$distances<=1],na.rm=TRUE)));
	invisible(dev.off());
}
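
# Note: the NSTD of a tip is its phylogenetic (branch-length) distance to the nearest
# tip with a known trait value, i.e. in the same units as the tree's edge lengths
# (expected substitutions per 16S site for the SILVA tree used here).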

###################################
# NEAREST SEQUENCED TAXON INDEX

if(INCLUDE_NSTD){
	cat(sprintf("Calculating nearest sequenced taxon indices (NSTD) w.r.t. OTUs with known %s..\n",trait_name))
	target_tips = match(known_tip_names, tree$tip.label);
	results = castor::find_nearest_tips(tree, only_descending_tips=FALSE, target_tips=target_tips, as_edge_counts=FALSE, check_input=TRUE);
	distances = results$nearest_distance_per_tip
	mean_distance  = mean(distances);
	std_distance   = sd(distances);
	fraction_gt_03 = mean(distances>0.3)
	fraction_gt_04 = mean(distances>0.4)

	# save distances to file
	cat(sprintf("Saving NSTDs to TSV file..\n"))
	output_path = sprintf("output/NSTD.tsv")
	check_output_file(output_path,TRUE,TRUE," ");
	cat(sprintf("# Nearest Sequenced Taxon Indices for OTUs in the SILVA tree, w.r.t. tips with known %s\n# Fraction of OTUs with NSTD>0.3: %g\n# Fraction of OTUs with NSTD>0.4: %g\n# OTU\tnearest_OTU_with_known_%s\tnearest_distance\n",trait_name,fraction_gt_03,fraction_gt_04,trait_name), file=output_path, append=FALSE);
	write.table(x=data.frame(tree$tip.label, tree$tip.label[results$nearest_tip_per_tip], distances), file=output_path, append=TRUE, sep="\t", row.names=FALSE, col.names=FALSE, quote = FALSE);

	# histogram of distances
	cat(sprintf("Plotting histogram of NSTDs..\n"))
	plot_file=sprintf("output/NSTD_histogram.pdf")
	check_output_file(plot_file,TRUE,TRUE," ")
	pdf(file=plot_file, width=DEFAULT_PLOT_WIDTH, height=DEFAULT_PLOT_HEIGHT);
	hist(100*distances[distances<0.8 & distances>0], breaks=50, main=sprintf("NSTDs of OTUs with unknown %s",trait_name), xlab="NSTD (% subst. per site)", ylab="", border="black", col="grey", las=1, prob = FALSE);
	box();
	invisible(dev.off());
}


##########################################################
# EMPIRICAL STATE PROBABILITIES AT COARSE TREE RESOLUTIONS

if(INCLUDE_COARSE_ESP){
	# NOTE: this section additionally requires the 'phytools' package, which is not listed in REQUIRED_PACKAGES above
	mapping = castor::map_to_state_space(known_tip_states, fill_gaps=FALSE, sort_order="natural", include_state_values=TRUE);
	cat(sprintf("Calculating ancestral empirical state probabilities of trait '%s'..\n",trait_name))
	ancestral_likelihoods = castor::asr_empirical_probabilities(known_tree, mapping$mapped_states, Nstates=mapping$Nstates, probabilities=TRUE, check_input=TRUE)$ancestral_likelihoods

	# create likelihoods table for all clades (tips & nodes) in known_tree
	likelihoods = matrix(0, ncol=mapping$Nstates, nrow=(Nknown_tips+Nknown_nodes));
	likelihoods[cbind(1:Nknown_tips,mapping$mapped_states)] = 1
	likelihoods[(Nknown_tips+1):(Nknown_tips+Nknown_nodes),] = ancestral_likelihoods;

	# save ancestral ESPs to file
	cat(sprintf("Saving ancestral empirical state probabilities to TSV file..\n"))
	output_path = sprintf("output/ancestral_empirical_probabilities.tsv")
	check_output_file(output_path,TRUE,TRUE," ");
	cat(sprintf("# Ancestral empirical state probabilities of '%s' across nodes\n# node\t%s\n",trait_name,paste(mapping$state_names,collapse="\t")), file=output_path, append=FALSE);
	write.table(x=data.frame(known_tree$node.label,ancestral_likelihoods), file=output_path, append=TRUE, sep="\t", row.names=FALSE, col.names=FALSE, quote = FALSE);

	# create coarse version of tree
	resolution = 2;
	cat(sprintf("Coarsening known tree at resolution %g..\n",resolution))
	coarsening = castor::collapse_tree_at_resolution(known_tree, resolution=resolution, by_edge_count=FALSE);
	coarse_known_tree = coarsening$collapsed_tree;
	NCtips  = length(coarse_known_tree$tip.label)
	NCnodes = coarse_known_tree$Nnode;
	coarse_likelihoods = likelihoods[coarsening$new2old_clade,]
	cat(sprintf(" --> Coarsened tree has %d nodes and %d tips\n",NCnodes, NCtips))

	# plot coarse tree
	cat(sprintf("Plotting coarsened known_tree with empirical %s probabilities..\n",trait_name));
	output_path = sprintf("output/coarse_tree_empirical_probabilities.pdf")
	check_output_file(output_path,TRUE,TRUE," ");
	pdf(file=output_path, width=TREE_PLOTS_INCH_PER_CHAR*max(sapply(coarse_known_tree$tip.label,nchar))+TREE_PLOTS_INCH_PER_LEVEL*max(ape::node.depth(phy=coarse_known_tree, method=2)), height=0.22*NCtips);
	phytools::plotTree(coarse_known_tree, type = "phylogram", cex=0.4, fsize=0.8, lwd=0.5, edge.color="grey", offset=0.6)
	title(main=sprintf("Coarsened tree (%d tips) with empirical state probabilities for '%s'",NCtips,trait_name), line=-3)
	ape::add.scale.bar(x=0,y=-2)
	trait_colors = get_trait_colors(mapping$Nstates)
	ape::nodelabels(pie=coarse_likelihoods[(NCtips+1):(NCtips+NCnodes),], piecol=trait_colors, cex=0.2)
	ape::tiplabels(pie=coarse_likelihoods[1:NCtips,], piecol=trait_colors, cex=0.12, frame="none")
	legend("topright", legend=mapping$state_names, col=trait_colors, pch=16, box.col="#404040", pt.cex=1.5, y.intersp=0.8, title="Trait color codes", title.adj=c(0), xjust=0)
	invisible(dev.off());
}
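
# Example of downstream use (hypothetical, not executed by this script): the per-tip
# HSP tables written above can be re-loaded for further analysis, e.g. for the
# subtree-averaging (SA) method:
#	gcn_estimates = read.table("output/HSP_SA_for_all_tips.tsv", sep="\t", comment.char="#", row.names=1, stringsAsFactors=FALSE)
#	head(gcn_estimates)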