# Supplemental file: documentation of bioinformatic analysis

# create working directory
$ mkdir working_directory
$ cd working_directory
	
# Download and unzip fastq files (substitute the actual download URL(s) of the sequencing run for *.fastq.gz in the wget command)
$ wget *.fastq.gz
$ gunzip *.fastq.gz

# Download ea-utils
$ wget -O fastq-join https://protect-us.mimecast.com/s/9X7eBDUOwVoDcR?domain=dropbox.com	
$ wget -O fastq-barcode.pl https://protect-us.mimecast.com/s/lN5JBRU2q1Qxu2?domain=dropbox.com
$ chmod a+x fastq-join fastq-barcode.pl

# Join forward and reverse reads
$ ./fastq-join Undetermined_S0_L001_R1_001.fastq Undetermined_S0_L001_R2_001.fastq -o joined.fastq

# Produce barcode subset corresponding to joined reads
$ ./fastq-barcode.pl Undetermined_S0_L001_I1_001.fastq joined.fastqjoin > barcodes.fastq

# start QIIME:
$ source /macqiime/configs/bash_profile.txt

# Demultiplex
$ check_id_map.py -m map.txt -o map_output
$ split_libraries_fastq.py -i joined.fastqjoin -o join_out_regmap/ -b barcodes.fastq -m map.txt --rev_comp_barcode --barcode_type 12

# filter crypto samples from original sequence files
$ filter_fasta.py -f E_seqs.fna -o E_Crypto_seqs.fna --sample_id_fp E_sample_ID_file.txt
$ filter_fasta.py -f L_seqs.fna -o L_Crypto_seqs.fna --sample_id_fp L_sample_ID_file.txt

# concatenate filtered sequences from both data sets.
$ cat E_Crypto_seqs.fna L_Crypto_seqs.fna > EL_Crypto_seqs.fna

# pick de novo OTUs
$ pick_de_novo_otus.py -i EL_Crypto_seqs.fna -o UClust_denovo_OTUs/

# check that all samples were retained
$ biom summarize-table -i UClust_denovo_OTUs/otu_table.biom -o UClust_denovo_OTUs/otu_table_summary.txt
	## filter to remove PcFe4, PcBe5Cp01, PcAr8Cp00, and PcAr8Cp08 in downstream analysis
	## (they had 1, 2, 21, and 10 sequences per sample, respectively)
$ filter_samples_from_otu_table.py -i UClust_denovo_OTUs/otu_table.biom -o UClust_denovo_OTUs/filtered_otu_table.biom --sample_id_fp EL_map_filtered.txt
	## double-check number of samples
$ biom summarize-table -i UClust_denovo_OTUs/filtered_otu_table.biom -o UClust_denovo_OTUs/filtered_otu_table_summary.txt
	## use filtered_otu_table.biom as full data set for all downstream analyses

# filter biom table to produce subsets for comparison
	## Age vs. Illness
$ filter_samples_from_otu_table.py -i filtered_otu_table.biom -o AvI_otu_table.biom --sample_id_fp AvI_map.txt
	## Infant Infection per Days Elapsed
$ filter_samples_from_otu_table.py -i filtered_otu_table.biom -o IIDE_otu_table.biom --sample_id_fp IIDE_map.txt
	## Recovery vs. Baseline
$ filter_samples_from_otu_table.py -i filtered_otu_table.biom -o RvB_otu_table.biom --sample_id_fp RvB_map.txt
	## Recovery vs. Baseline, including all Healthy Adults
$ filter_samples_from_otu_table.py -i filtered_otu_table.biom -o RvBHA_otu_table.biom --sample_id_fp RvBHA_map.txt

# perform all of the following downstream analyses on each of the 5 data sets above (full set and 4 subsets)
	## identify taxa
$ summarize_taxa_through_plots.py -i filtered_otu_table.biom -o taxa_summary -m EL_map_filtered.txt
	## alpha diversity
$ alpha_diversity.py -i filtered_otu_table.biom -o alpha_diversity.txt -m chao1,goods_coverage,simpson,shannon,PD_whole_tree -t rep_set.tre
	## beta diversity
$ beta_diversity.py -i filtered_otu_table.biom -m weighted_unifrac,unweighted_unifrac -o beta_div -t rep_set.tre
	## statistically compare weighted UniFrac distance between Status groups
$ make_distance_boxplots.py -m RvBHA_map.txt -d beta_div/weighted_unifrac_RvBHA_otu_table.txt -f Status -o beta_div/boxplot_RvBHA
	## PCoA
$ principal_coordinates.py -i beta_div/weighted_unifrac.txt -o weighted_unifrac_PC.txt
$ make_2d_plots.py -i weighted_unifrac_PC.txt -m map.txt -o weighted_unifrac_plots
$ principal_coordinates.py -i beta_div/unweighted_unifrac.txt -o unweighted_unifrac_PC.txt
$ make_2d_plots.py -i unweighted_unifrac_PC.txt -m map.txt -o unweighted_unifrac_plots