"#QIIME 1.3.0 was used to split sff files by barcode, trim barcode and primer sequences, and quality filter reads"								
#batch process the 3 input sffs
#every sff found under project_sff/ is processed in one call; results are written to project_fasta_qual
process_sff.py --input_dir project_sff/ -o project_fasta_qual
								
"#Split per-region fasta files from above into quality-filtered, per-sample .fna files; repeat per 52 files"								
#filters applied, as set by the flags below: read length 200-1000, minimum quality score 20,
#zero barcode errors, zero ambiguous bases, zero primer mismatches, homopolymer limit 6,
#variable-length barcodes, reverse primers handled in truncate_only mode
split_libraries.py --fasta filename.fasta --qual filename.qual --map mappingfile.txt -o split_files --min-seq-length 200 --max-seq-length 1000 --min-qual-score 20 --reverse_primers truncate_only --max-barcode-errors 0 --max-ambig 0 --max-primer-mismatch 0 --max-homopolymer 6 --barcode-type variable_length
								
#combine the 52 individual .fna files into one for OTU picking
#NOTE(review): the split_libraries.py command writes to the same -o split_files directory on every run;
#confirm the 52 outputs were given distinct names so that the glob below picks up all of them
cat split_files/*.fna >UCLA_brain_16Scombined.fasta
								
#Cluster into OTUs (cd-hit, 97% similarity)
#option names corrected: -i is lowercase, the method option is --otu_picking_method, and --similarity was misspelled
pick_otus.py -i UCLA_brain_16Scombined.fasta --otu_picking_method cdhit --output_dir clustered_seqs/ --similarity 0.97
								
#generate a representative sequences file (-m first: the first sequence of each OTU)
#pick_otus.py names its OTU map <input basename>_otus.txt, so that is the file to read here
pick_rep_set.py -i clustered_seqs/UCLA_brain_16Scombined_otus.txt -f UCLA_brain_16Scombined.fasta -o UCLA_brain_16S_rep_seqs.fasta -m first
								
#align representative sequences (PyNAST, against the same core set template used for chimera checking)
#fixed: missing space after the script name and the misspelled template file name
align_seqs.py -i UCLA_brain_16S_rep_seqs.fasta -m pynast -t core_set_aligned.imputed -o aligned_rep_set/ --min_length 150
								
#identify potential chimeras (ChimeraSlayer on the aligned representative set); chimeric IDs go to chimericseqs.txt
#fixed: missing space between the method name and -o
identify_chimeric_seqs.py -i aligned_rep_set/UCLA_brain_16S_rep_seqs_aligned.fasta -a core_set_aligned.imputed -m ChimeraSlayer -o chimericseqs.txt
								
#remove chimeras from the representative sequences files
#-s gives the list of chimeric sequence IDs and -n negates the match, so the listed sequences are discarded
#both the aligned and the unaligned representative sets must be filtered with the same list
filter_fasta.py -i aligned_rep_set/UCLA_brain_16S_rep_seqs_aligned.fasta -s chimericseqs.txt -n -o chimera_filtered_rep_set_aligned.fasta
filter_fasta.py -i UCLA_brain_16S_rep_seqs.fasta -s chimericseqs.txt -n -o chimera_filtered_UCLA_brain_16S_rep_seqs.fasta
								
#filter excess spaces from the alignment
#the filtered alignment is written into filtered_chimera_checked_alignment/, which the tree-building step reads from
filter_alignment.py -i chimera_filtered_rep_set_aligned.fasta -o filtered_chimera_checked_alignment/
								
#assign taxonomy to remaining sequences
#method rdp (-m rdp); -c 0.80 is presumably the minimum assignment confidence - confirm against the QIIME docs
#results are written to RDP_taxonomy/
assign_taxonomy.py -i chimera_filtered_UCLA_brain_16S_rep_seqs.fasta -m rdp -c 0.80 -o RDP_taxonomy/
								
#make phylogenetic tree (FastTree) from the gap-filtered, chimera-free alignment
#filter_alignment.py names its output <input basename>_pfiltered.fasta
make_phylogeny.py -i filtered_chimera_checked_alignment/chimera_filtered_rep_set_aligned_pfiltered.fasta -t fasttree -o repset.tree
								
#make OTU table from the OTU map written by pick_otus.py and the RDP taxonomy assignments
#file names now match what the earlier steps write: <basename>_otus.txt from pick_otus.py and
#<basename>_tax_assignments.txt from assign_taxonomy.py ("16S" with a capital S - paths are case-sensitive)
#NOTE(review): if the OTU map was renamed to otu97.txt by hand, substitute that name back
make_otu_table.py -i clustered_seqs/UCLA_brain_16Scombined_otus.txt -t RDP_taxonomy/chimera_filtered_UCLA_brain_16S_rep_seqs_tax_assignments.txt -o chimera_checked_otu_table.txt
								
#randomly subsample to 1355
#-d 1355 is presumably the number of sequences kept per sample - confirm against the QIIME docs
#NOTE(review): several later steps read files tagged "1335" rather than "1355"; confirm which depth is correct
single_rarefaction.py -i chimera_checked_otu_table.txt -d 1355 -o subsampled_otu_table_1355seqs.txt
								
#calculate alpha diversity metrics on the rarefied table
#PD_whole_tree is a phylogenetic metric, so the tree built above must be passed with -t
#(the spreadsheet-export quotes that wrapped this command have been removed so it can be run as written)
alpha_diversity.py -i subsampled_otu_table_1355seqs.txt -m observed_otus,shannon,simpson,PD_whole_tree,chao1 -t repset.tree -o alpha_div.txt
								
#compare alpha Diversity
#compares the PD_whole_tree values between the groups of mapping-file column Clust (-c Clust)
#NOTE(review): the input arare2/alpha_div_collated/PD_whole_tree.txt is not produced by any step shown in this file;
#confirm which alpha rarefaction run generated it
compare_alpha_diversity.py -i arare2/alpha_div_collated/PD_whole_tree.txt -m LocalAnalyis_16June2014/LITTLEmapping.txt -c Clust -o Jen/arare2/Clust_PD
								
#generate beta diversity metrics and plots from the rarefied table, the mapping file and the tree
#fixed: the input option is lowercase -i
beta_diversity_through_plots.py -i subsampled_otu_table_1355seqs.txt -m UCLA_brain_mapping.txt -t repset.tree -o beta_diversity_output
								
#generate taxa summary files
#summaries are grouped by mapping-file column Clust (-c Clust) and written to taxa_summary/Clust
#NOTE(review): the input biom/otu_table_updated_taxonomy1335.biom is not produced by any step shown in this file,
#and its "1335" tag differs from the 1355 subsampling depth used above - confirm
summarize_taxa_through_plots.py -i biom/otu_table_updated_taxonomy1335.biom -o taxa_summary/Clust -m LocalAnalyis_16June2014/Mapping.txt -c Clust
								
#Adonis to compare distance matrices by clusters (unweighted UniFrac, grouping column IBS_IBSNL)
#-n (number of permutations) requires a value; the original line ended in a bare -n.
#999 is the script default - NOTE(review): replace it if a different permutation count was actually used
compare_categories.py --method adonis -i bdiv_even1335/dm/unweighted_unifrac_dm.txt -m LocalAnalyis_16June2014/Mapping.txt -c IBS_IBSNL -o Adonis/unWTD/CON_IBSvIBSNL -n 999
								
#Kruskal-Wallis test for taxa differences between the groups of mapping-file column Clust
#shown for the L2 taxa summary table; output goes to Jen/ClustTest_TaxLevels/L2
group_significance.py -i Jen/taxa_summary/otu_table_updated_taxonomy1335_L2.biom -m LocalAnalyis_16June2014/BIGmapping.txt -c Clust -o Jen/ClustTest_TaxLevels/L2
								
#Compute the log of the abundance of p__Firmicutes divided by the abundance of p__Bacteroidetes
#(only p__Firmicutes is passed to --increased; the result is written to F2Bratio.txt)
compute_taxonomy_ratios.py -i biom/otu_table_updated_taxonomy1335.biom --increased p__Firmicutes --decreased p__Bacteroidetes -o F2Bratio.txt
								
								
								
								
								
								
******************************************								
								
"#RandomForest Analysis (RStudio, running R version 3.3.0)"								
#PACKAGES USED:								
#randomForest version 4.6-12								
#caret version 6.0-73								
#pROC version 1.8								
#e1071 version 1.6-7
								
#The chimera-filtered OTU table was manually filtered to remove OTUs which occurred in fewer than 10% of samples and modified to fit the following format:								
								
Subject_ID	Grouping	OTU_533	OTU_1822	OTU_296	OTU_2709	OTU_2148	OTU_1538	…
A4954	HC	0	0	0	0	0	0	
A5626	HC	0	0	0	0	0	0	
A5972	HC	0	0	0	0	0	0	
A6038	HC	0	0	0	0	0	0	
…								
A5085	IBS	3	2	3	1	3	0	
A5975	IBS	0	0	0	1	0	1	
A6022	IBS	1	1	2	0	3	1	
A6023	IBS	0	0	0	0	0	0	
								
								
"UCLA_IBS_HC=read.table(""C:/Users/holli_000/Documents/R/UCLA_Brain_subsampled_otu_table_1355seqs_updated_taxonomy_HC_IBS_10pct.txt"", head=TRUE, row.names=1, sep=""\t"")"								
								
library(caret)
library(randomForest)
library(e1071)
library(pROC)

# caret needs a factor outcome for classification; read.table() only creates
# one automatically in R < 4.0, so convert explicitly (no-op if already a factor).
UCLA_IBS_HC$Grouping <- factor(UCLA_IBS_HC$Grouping)

# 10-fold cross-validation repeated 5 times. Held-out predictions and class
# probabilities are saved so a ROC curve can be built from them afterwards.
cvCtrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5, savePredictions = TRUE, classProbs = TRUE, summaryFunction = twoClassSummary)

# Random forest (1000 trees) predicting Grouping from all OTU columns; the
# mtry value is selected by cross-validated ROC.
ucla_rf <- train(Grouping ~ ., UCLA_IBS_HC, method = "rf", metric = "ROC", trControl = cvCtrl, ntree = 1000, importance = TRUE)
ucla_rf

# Held-out predictions for every resample and every mtry tried. This object was
# used below without ever being defined; it is the $pred component of the fit.
predMat <- ucla_rf$pred

#best mtry =132; Generate ROC with mtry=132
predMat132 <- predMat[predMat$mtry == 132, ]
pred132 <- predMat132$IBS   # held-out predicted probability of the IBS class
resp132 <- factor(predMat132$obs, ordered = TRUE, levels = c("IBS", "HC"))
testROC132 <- roc(response = resp132, predictor = pred132)
testROC132
plot(testROC132)
# The AUROC in the legend is the value reported by testROC132 for the original run.
legend("bottomright", legend = c("IBS versus Control AUROC=0.966"), bty = "n", cex = 1, pt.cex = 0.66)




****************************************************
Predicted Metagenome Analysis

## The following was performed in QIIME 1.9.1 using PICRUSt 1.0 (http://picrust.github.io/picrust) and the fasta file created earlier

# Closed-reference OTU picking of the combined reads against the 97% reference sequences and taxonomy
pick_closed_reference_otus.py -i UCLA_brain_16Scombined.fasta -r 97_otus.fasta -o closed_reference_otus -t 97_otu_taxonomy.txt # 97_otu files from Greengenes 2013 downloaded from QIIME  #afterwards, rename otu_table.biom (output) as otu_table_closed_reference_otus.biom

# PICRUSt: copy-number normalization of the (renamed) closed-reference OTU table
normalize_by_copy_number.py -i otu_table_closed_reference_otus.biom -o otu_table_closed_reference_otus_PICRUSt_normalized.biom



# PICRUSt: predict the metagenome from the normalized table;
# -a names an extra per-sample output (PICRUSt_nsti_per_sample.tab), presumably the NSTI accuracy metric - confirm
predict_metagenomes.py -i otu_table_closed_reference_otus_PICRUSt_normalized.biom -o PICRUSt_imputed_metagenome.biom -a PICRUSt_nsti_per_sample.tab



# Two-stage filter of the predicted metagenome (filter_otus_from_otu_table.py is applied to the gene table here):
# first -s 5 (presumably: keep entries observed in at least 5 samples - confirm), then a minimum count fraction of 0.000001
filter_otus_from_otu_table.py -i PICRUSt_imputed_metagenome.biom -o PICRUSt_imputed_metagenome_s5.biom -s 5

filter_otus_from_otu_table.py -i PICRUSt_imputed_metagenome_s5.biom -o PICRUSt_imputed_metagenome_s5_000001.biom --min_count_fraction 0.000001


## The following was performed in R 3.1.2 using Phyloseq (http://www.bioconductor.org/packages/release/bioc/html/phyloseq.html)

library("phyloseq")

library("DESeq2")

# Load the two inputs interactively: first the QIIME mapping file (.txt), then
# PICRUSt_imputed_metagenome_s5_000001.biom, and merge them into one phyloseq object.
sample_map <- import_qiime_sample_data(choose.files())
metagenome <- import_biom(choose.files(), parseFunction = parse_taxonomy_greengenes, parallel = TRUE)
picrust_ps <- merge_phyloseq(metagenome, sample_map)

# To assess metagenes associated with IBS1 vs. HC (or other two way comparisons by microbial cluster)
# Fit DESeq2 with Clust as the design variable, extract the IBS1-vs-HC contrast,
# sort by adjusted p-value (rows with NA padj are dropped by na.last = NA), and export.
dds_clust <- DESeq(phyloseq_to_deseq2(picrust_ps, ~ Clust))
res_clust <- results(dds_clust, contrast = c("Clust", "IBS1", "HC"))
res_clust <- res_clust[order(res_clust$padj, na.last = NA), ]
res_clust_df <- as(res_clust, "data.frame")
write.csv(res_clust_df, "PICRUSt_metagenes_IBS1_vs_HC.csv")

# To assess metagenes associated with brain morphology parameters
# Same procedure with R_InfCirIns_SA as the design variable; the coefficient is
# selected by name instead of by contrast.
dds_morph <- DESeq(phyloseq_to_deseq2(picrust_ps, ~ R_InfCirIns_SA))
res_morph <- results(dds_morph, name = "R_InfCirIns_SA")
res_morph <- res_morph[order(res_morph$padj, na.last = NA), ]
res_morph_df <- as(res_morph, "data.frame")
write.csv(res_morph_df, "PICRUSt_metagenes_R_InfCirIns_SA.csv")

## The following was performed in QIIME 1.9.1, R 3.1.2, and Python 2.7.3 using FishTaco (http://borenstein-lab.github.io/fishtaco) and these files: Sample_list_FISHTACO.txt (column 1:sample name; column 2: "1" to indicate IBS1 or "0" to indicate HC), Metagenes.txt (tab-delimited file with first column containing list of metagenes to be used in FishTaco analysis by KEGG ID, e.g. K00043), PICRUSt precalculated file for genomic content (ko_13_5_precalculated.tab)

# QIIME
# Prepare the two abundance tables FishTaco reads: taxa (after a 0.0005 minimum count fraction filter)
# and predicted metagenes, each converted from BIOM to a text table.
# NOTE(review): "-b" is the biom-format 1.x spelling of the BIOM-to-text conversion; confirm it is accepted by the
# biom-format release installed alongside QIIME 1.9.1 (2.x releases use --to-tsv).

filter_otus_from_otu_table.py -i otu_table_closed_reference_otus.biom -o otu_table_closed_reference_otus_0.0005.biom --min_count_fraction 0.0005  

biom convert -i otu_table_closed_reference_otus_0.0005.biom -o Taxa_abundance_mean_0.0005_FISHTACO.txt -b

biom convert -i PICRUSt_imputed_metagenome_s5_000001.biom -o Metagene_abundance_FISHTACO.txt -b

# R

# Build the genomic-content table for FishTaco: keep only the columns of the
# PICRUSt precalculated KO table that correspond to the metagenes of interest.

Metagenes <- read.table(choose.files(), header = FALSE)    # import Metagenes.txt (KEGG IDs in column V1)

GC <- read.table(choose.files(), header = TRUE, row.names = 1)   # import ko_13_5_precalculated.tab  # note that this takes a long time

# Select the columns whose names match any of the listed KEGG IDs.
GCsubset <- GC[grep(paste(Metagenes$V1, collapse = "|"), colnames(GC))]

dim(GCsubset) #to verify correct number of columns

# Write the file under the name run_fishtaco.py is given (-gc Genomic_content_FISHTACO.txt)
# and as a tab-delimited, unquoted table; col.names = NA adds the empty corner
# cell so the header lines up with the row-name column.
write.table(GCsubset, "Genomic_content_FISHTACO.txt", sep = "\t", quote = FALSE, col.names = NA)

# Python
# Inputs: -ta taxa abundance table and -fu metagene abundance table (both from the QIIME step),
# -l the sample list (1 = IBS1, 0 = HC), -gc the genomic content table.
# -multi_function_filter_list restricts the analysis to the four listed KEGG IDs.

run_fishtaco.py -ta Taxa_abundance_mean_0.0005_FISHTACO.txt -fu Metagene_abundance_FISHTACO.txt -l Sample_list_FISHTACO.txt -gc Genomic_content_FISHTACO.txt -functional_profile_already_corrected_with_musicc -mult_hyp none -map_function_level none -multi_function_filter_list K00043,K00260,K01035,K02688