"#QIIME 1.3.0 was used to split sff files by barcode, trim barcode and primer sequences, and quality-filter reads" #batch process the 3 input sffs process_sff.py --input_dir project_sff/ -o project_fasta_qual "#Split per-region fasta files from above into quality-filtered, per-sample .fna files; repeat for each of the 52 files" split_libraries.py --fasta filename.fasta --qual filename.qual --map mappingfile.txt -o split_files --min-seq-length 200 --max-seq-length 1000 --min-qual-score 20 --reverse_primers truncate_only --max-barcode-errors 0 --max-ambig 0 --max-primer-mismatch 0 --max-homopolymer 6 --barcode-type variable_length #combine the 52 individual .fna files into one for OTU picking cat split_files/*.fna >UCLA_brain_16Scombined.fasta #Cluster into OTUs pick_otus.py -i UCLA_brain_16Scombined.fasta --otu_picking_method cdhit --output_dir clustered_seqs/ --similarity 0.97 #generate a representative sequences file pick_rep_set.py -i clustered_seqs/UCLA_brain_16Scombined_otu.txt -f UCLA_brain_16Scombined.fasta -o UCLA_brain_16S_rep_seqs.fasta -m first #align representative sequences align_seqs.py -i UCLA_brain_16S_rep_seqs.fasta -m pynast -t core_set_aligned.imputed -o aligned_rep_set/ --min_length 150 #identify potential chimeras identify_chimeric_seqs.py -i aligned_rep_set/UCLA_brain_16S_rep_seqs_aligned.fasta -a core_set_aligned.imputed -m ChimeraSlayer -o chimericseqs.txt #remove chimeras from the representative sequences files filter_fasta.py -i aligned_rep_set/UCLA_brain_16S_rep_seqs_aligned.fasta -s chimericseqs.txt -n -o chimera_filtered_rep_set_aligned.fasta filter_fasta.py -i UCLA_brain_16S_rep_seqs.fasta -o chimera_filtered_UCLA_brain_16S_rep_seqs.fasta #filter excess spaces from the alignment filter_alignment.py -i chimera_filtered_rep_set_aligned.fasta -o filtered_chimera_checked_alignment/ #assign taxonomy to remaining sequences assign_taxonomy.py -i chimera_filtered_UCLA_brain_16S_rep_seqs.fasta -m rdp -c 0.80 -o RDP_taxonomy/ #make phylogenetic tree 
make_phylogeny.py -i filtered_chimera_checked_alignment/chimera_filtered_rep_set_aligned.filtered.fasta -t fasttree -o repset.tree #make OTU table make_otu_table.py -i clustered_seqs/otu97.txt -t RDP_taxonomy/chimera_filtered_UCLA_brain_16s_rep_seqs_taxonomy.txt -o chimera_checked_otu_table.txt #randomly subsample to 1355 sequences per sample single_rarefaction.py -i chimera_checked_otu_table.txt -d 1355 -o subsampled_otu_table_1355seqs.txt #calculate alpha diversity metrics "alpha_diversity.py -i subsampled_otu_table_1355seqs.txt -m observed_otus,shannon,simpson,PD_whole_tree,chao1 -o alpha_div.txt" #compare alpha diversity compare_alpha_diversity.py -i arare2/alpha_div_collated/PD_whole_tree.txt -m LocalAnalyis_16June2014/LITTLEmapping.txt -c Clust -o Jen/arare2/Clust_PD #generate beta diversity metrics and plots beta_diversity_through_plots.py -i subsampled_otu_table_1355seqs.txt -m UCLA_brain_mapping.txt -t repset.tree -o beta_diversity_output #generate taxa summary files summarize_taxa_through_plots.py -i biom/otu_table_updated_taxonomy1335.biom -o taxa_summary/Clust -m LocalAnalyis_16June2014/Mapping.txt -c Clust #Adonis to compare distance matrices by clusters compare_categories.py --method adonis -i bdiv_even1335/dm/unweighted_unifrac_dm.txt -m LocalAnalyis_16June2014/Mapping.txt -c IBS_IBSNL -o Adonis/unWTD/CON_IBSvIBSNL -n #Kruskal-Wallis test for taxa differences group_significance.py -i Jen/taxa_summary/otu_table_updated_taxonomy1335_L2.biom -m LocalAnalyis_16June2014/BIGmapping.txt -c Clust -o Jen/ClustTest_TaxLevels/L2 #Compute the log of the abundance of p__Firmicutes divided by the abundance of p__Bacteroidetes compute_taxonomy_ratios.py -i biom/otu_table_updated_taxonomy1335.biom --increased p__Firmicutes --decreased p__Bacteroidetes -o F2Bratio.txt ****************************************** "#RandomForest Analysis (RStudio, running R version 3.3.0)" #PACKAGES USED: #randomForest version 4.6-12 #caret version 6.0-73 #pROC version 1.8 #e1071 
version 1.6-7 #The chimera-filtered OTU table was manually filtered to remove OTUs which occurred in fewer than 10% of samples and modified to fit the following format: Subject_ID Grouping OTU_533 OTU_1822 OTU_296 OTU_2709 OTU_2148 OTU_1538 … A4954 HC 0 0 0 0 0 0 A5626 HC 0 0 0 0 0 0 A5972 HC 0 0 0 0 0 0 A6038 HC 0 0 0 0 0 0 … A5085 IBS 3 2 3 1 3 0 A5975 IBS 0 0 0 1 0 1 A6022 IBS 1 1 2 0 3 1 A6023 IBS 0 0 0 0 0 0 "UCLA_IBS_HC=read.table(""C:/Users/holli_000/Documents/R/UCLA_Brain_subsampled_otu_table_1355seqs_updated_taxonomy_HC_IBS_10pct.txt"", head=TRUE, row.names=1, sep=""\t"")" library(caret) library(randomForest) library(e1071) library(pROC) "cvCtrl=trainControl(method=""repeatedcv"", number=10, repeats=5, savePred=T, classProbs=TRUE, summaryFunction=twoClassSummary)" "ucla_rf=train(Grouping~., UCLA_IBS_HC, method=""rf"", metric=""ROC"", trControl=cvCtrl, ntree=1000, importance=TRUE)" ucla_rf #best mtry=132; extract the held-out predictions saved by caret and generate ROC with mtry=132 predMat=ucla_rf$pred "predMat132=predMat[predMat$mtry==132, ]" pred132=predMat132$IBS "resp132=factor(predMat132$obs, ordered=T, levels=c(""IBS"", ""HC""))" "testROC132=roc(response=resp132, predictor = pred132)" testROC132 plot(testROC132) "legend(""bottomright"", legend=c(""IBS versus Control AUROC=0.966""), bty=""n"", cex=1, pt.cex=0.66)" **************************************************** Predicted Metagenome Analysis ## The following was performed in QIIME 1.9.1 using PICRUSt 1.0 (http://picrust.github.io/picrust) and the fasta file created earlier pick_closed_reference_otus.py -i UCLA_brain_16Scombined.fasta -r 97_otus.fasta -o closed_reference_otus -t 97_otu_taxonomy.txt # 97_otu files from Greengenes 2013 downloaded from QIIME #afterwards, rename otu_table.biom (output) as otu_table_closed_reference_otus.biom normalize_by_copy_number.py -i otu_table_closed_reference_otus.biom -o otu_table_closed_reference_otus_PICRUSt_normalized.biom predict_metagenomes.py -i otu_table_closed_reference_otus_PICRUSt_normalized.biom -o 
PICRUSt_imputed_metagenome.biom -a PICRUSt_nsti_per_sample.tab filter_otus_from_otu_table.py -i PICRUSt_imputed_metagenome.biom -o PICRUSt_imputed_metagenome_s5.biom -s 5 filter_otus_from_otu_table.py -i PICRUSt_imputed_metagenome_s5.biom -o PICRUSt_imputed_metagenome_s5_000001.biom --min_count_fraction 0.000001 ## The following was performed in R 3.1.2 using Phyloseq (http://www.bioconductor.org/packages/release/bioc/html/phyloseq.html) library("phyloseq") library("DESeq2") map=import_qiime_sample_data(choose.files()) #import mapping file (.txt) biom=import_biom(choose.files(),parseFunction=parse_taxonomy_greengenes,parallel=TRUE) #import PICRUSt_imputed_metagenome_s5_000001.biom data=merge_phyloseq(biom,map) # To assess metagenes associated with IBS1 vs. HC (or other two way comparisons by microbial cluster) diagdds=phyloseq_to_deseq2(data, ~ Clust) diagdds=DESeq(diagdds) CDvsNorm = results(diagdds, contrast=c("Clust","IBS1","HC")) CDvsNorm = CDvsNorm[order(CDvsNorm$padj, na.last = NA), ] CDvsNormMatrix = cbind(as(CDvsNorm, "data.frame")) write.csv(CDvsNormMatrix, "PICRUSt_metagenes_IBS1_vs_HC.csv") # To assess metagenes associated with brain morphology parameters diagdds=phyloseq_to_deseq2(data, ~ R_InfCirIns_SA) diagdds=DESeq(diagdds) CDvsNorm = results(diagdds, name="R_InfCirIns_SA") CDvsNorm = CDvsNorm[order(CDvsNorm$padj, na.last = NA), ] CDvsNormMatrix = cbind(as(CDvsNorm, "data.frame")) write.csv(CDvsNormMatrix, "PICRUSt_metagenes_R_InfCirIns_SA.csv") ## The following was performed in QIIME 1.9.1, R 3.1.2, and Python 2.7.3 using FishTaco (http://borenstein-lab.github.io/fishtaco) and these files: Sample_list_FISHTACO.txt (column 1:sample name; column 2: "1" to indicate IBS1 or "0" to indicate HC), Metagenes.txt (tab-delimited file with first column containing list of metagenes to be used in FishTaco analysis by KEGG ID, e.g. 
K00043), PICRUSt precalculated file for genomic content (ko_13_5_precalculated.tab) # QIIME filter_otus_from_otu_table.py -i otu_table_closed_reference_otus.biom -o otu_table_closed_reference_otus_0.0005.biom --min_count_fraction 0.0005 biom convert -i otu_table_closed_reference_otus_0.0005.biom -o Taxa_abundance_mean_0.0005_FISHTACO.txt -b biom convert -i PICRUSt_imputed_metagenome_s5_000001.biom -o Metagene_abundance_FISHTACO.txt -b # R Metagenes<-read.table(choose.files(),header=F) #import Metagenes.txt GC<-read.table(choose.files(),header=T,row.names=1) # import ko_13_5_precalculated.tab # note that this takes a long time GCsubset<-GC[grep(paste(Metagenes$V1,collapse="|"),colnames(GC))] dim(GCsubset) #to verify correct number of columns write.table(GCsubset, "Genomic_content_FISHTACO.txt") # Python run_fishtaco.py -ta Taxa_abundance_mean_0.0005_FISHTACO.txt -fu Metagene_abundance_FISHTACO.txt -l Sample_list_FISHTACO.txt -gc Genomic_content_FISHTACO.txt -functional_profile_already_corrected_with_musicc -mult_hyp none -map_function_level none -multi_function_filter_list K00043,K00260,K01035,K02688