# Supplemental file: documentation of bioinformatic analysis

# create working directory
$ mkdir working_directory
$ cd working_directory

# Download and unzip fastq files
# (substitute the actual download URL(s) of the fastq.gz files for *.fastq.gz)
$ wget *.fastq.gz
$ gunzip *.fastq.gz

# Download ea-utils
$ wget -O fastq-join https://protect-us.mimecast.com/s/9X7eBDUOwVoDcR?domain=dropbox.com
$ wget -O fastq-barcode.pl https://protect-us.mimecast.com/s/lN5JBRU2q1Qxu2?domain=dropbox.com
$ chmod a+x fastq-join fastq-barcode.pl

# Join forward and reverse reads
# (fastq-join appends "join" to the -o prefix, so the joined reads are written to joined.fastqjoin)
$ ./fastq-join Undetermined_S0_L001_R1_001.fastq Undetermined_S0_L001_R2_001.fastq -o joined.fastq

# Produce barcode subset corresponding to joined reads
$ ./fastq-barcode.pl Undetermined_S0_L001_I1_001.fastq joined.fastqjoin > barcodes.fastq

# start QIIME:
$ source /macqiime/configs/bash_profile.txt

# Demultiplex
$ check_id_map.py -m map.txt -o map_output
$ split_libraries_fastq.py -i joined.fastqjoin -o join_out_regmap/ -b barcodes.fastq -m map.txt --rev_comp_barcode --barcode_type 12

# filter crypto samples from original sequence files
$ filter_fasta.py -f E_seqs.fna -o E_Crypto_seqs.fna --sample_id_fp E_sample_ID_file.txt
$ filter_fasta.py -f L_seqs.fna -o L_Crypto_seqs.fna --sample_id_fp L_sample_ID_file.txt

# concatenate filtered sequences from both data sets.
$ cat E_Crypto_seqs.fna L_Crypto_seqs.fna > EL_Crypto_seqs.fna

# pick de novo OTUs
$ pick_de_novo_otus.py -i EL_Crypto_seqs.fna -o UClust_denovo_OTUs/

# check that all samples were retained
$ biom summarize-table -i UClust_denovo_OTUs/otu_table.biom -o UClust_denovo_OTUs/otu_table_summary.txt

## filter to remove PcFe4, PcBe5Cp01, PcAr8Cp00, and PcAr8Cp08 from downstream analysis
## (they had 1, 2, 21, and 10 sequences per sample, respectively)
$ filter_samples_from_otu_table.py -i UClust_denovo_OTUs/otu_table.biom -o UClust_denovo_OTUs/filtered_otu_table.biom --sample_id_fp EL_map_filtered.txt

## double-check number of samples
$ biom summarize-table -i UClust_denovo_OTUs/filtered_otu_table.biom -o UClust_denovo_OTUs/filtered_otu_table_summary.txt

## use filtered_otu_table.biom as full data set for all downstream analyses

# filter biom table to produce subsets for comparison
## Age vs. Illness
$ filter_samples_from_otu_table.py -i filtered_otu_table.biom -o AvI_otu_table.biom --sample_id_fp AvI_map.txt

## Infant Infection per Days Elapsed
$ filter_samples_from_otu_table.py -i filtered_otu_table.biom -o IIDE_otu_table.biom --sample_id_fp IIDE_map.txt

## Recovery vs. Baseline
$ filter_samples_from_otu_table.py -i filtered_otu_table.biom -o RvB_otu_table.biom --sample_id_fp RvB_map.txt

## Recovery vs.
## Baseline, including all Healthy Adults
$ filter_samples_from_otu_table.py -i filtered_otu_table.biom -o RvBHA_otu_table.biom --sample_id_fp RvBHA_map.txt

# perform all following downstream analyses on the 5 data sets above (full set and 4 subsets)
## identify taxa
$ summarize_taxa_through_plots.py -i filtered_otu_table.biom -o taxa_summary -m EL_map_filtered.txt

## alpha diversity
$ alpha_diversity.py -i filtered_otu_table.biom -o alpha_diversity.txt -m chao1,goods_coverage,simpson,shannon,PD_whole_tree -t rep_set.tre

## beta diversity
$ beta_diversity.py -i filtered_otu_table.biom -m weighted_unifrac,unweighted_unifrac -o beta_div -t rep_set.tre

## statistically compare weighted UniFrac distance between Status groups
$ make_distance_boxplots.py -m RvBHA_map.txt -d beta_div/weighted_unifrac_RvBHA_otu_table.txt -f Status -o beta_div/boxplot_RvBHA

## PCoA
## (beta_diversity.py names each distance matrix <metric>_<input table name>.txt)
$ principal_coordinates.py -i beta_div/weighted_unifrac_filtered_otu_table.txt -o weighted_unifrac_PC.txt
$ make_2d_plots.py -i weighted_unifrac_PC.txt -m map.txt -o weighted_unifrac_plots
$ principal_coordinates.py -i beta_div/unweighted_unifrac_filtered_otu_table.txt -o unweighted_unifrac_PC.txt
$ make_2d_plots.py -i unweighted_unifrac_PC.txt -m map.txt -o unweighted_unifrac_plots