###Mothur pipeline for processing and analysis of V5V6 data.##
##This file is designed to be run in scratch and will copy completed files to your permanent directory.##
##Find and replace "study" with your study name and "folder" with the directory in your permanent folder where you want data moved.##
##Per-sample lines use the placeholder "sample"; each repeated line is meant to be edited for one sample.##
##Every command below is commented out; remove the leading # from the lines you want mothur to execute.##
##mothur reads one command per line, so keep each command on a single line that starts at column 0.##
##Updated 30-Jan-2018 CS##
##Unzip fastq.gz files from data_release directory.##
#system(gzip -dc < /path/to/data_release/sample_S00_R1_001.fastq.gz > ./sample.r1.fastq)
#system(gzip -dc < /path/to/data_release/sample_S00_R2_001.fastq.gz > ./sample.r2.fastq)
##Cut reads to remove low quality regions.##
#system(cut -c1-150 sample.r1.fastq > sample.r1.cut.fastq)
#system(cut -c1-150 sample.r2.fastq > sample.r2.cut.fastq)
#system(cut -c1-150 sample.r1.fastq > sample.r1.cut.fastq)
#system(cut -c1-150 sample.r2.fastq > sample.r2.cut.fastq)
#system(cut -c1-150 sample.r1.fastq > sample.r1.cut.fastq)
#system(cut -c1-150 sample.r2.fastq > sample.r2.cut.fastq)
#system(cut -c1-150 sample.r1.fastq > sample.r1.cut.fastq)
#system(cut -c1-150 sample.r2.fastq > sample.r2.cut.fastq)
#system(cut -c1-150 sample.r1.fastq > sample.r1.cut.fastq)
#system(cut -c1-150 sample.r2.fastq > sample.r2.cut.fastq)
#system(cut -c1-150 sample.r1.fastq > sample.r1.cut.fastq)
#system(cut -c1-150 sample.r2.fastq > sample.r2.cut.fastq)
##Paired-end join reads using fastq-join script.##
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
#system(/panfs/roc/groups/8/cmstaley/cmstaley/fastq-join sample.r1.cut.fastq sample.r2.cut.fastq -o sample.fastq)
##Rename files for use with mothur.##
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
#system(mv sample.fastqjoin sample.fastq)
##Split fasta and qual files from fastq files.##
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
#fastq.info(fastq=sample.fastq)
##Initial quality trimming for low quality regions and primer mismatches. Note UMGC uses a proofreading Taq polymerase, so primer sequences may change slightly (hence pdiffs=2).##
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
#trim.seqs(fasta=sample.fasta,qfile=sample.qual,maxambig=0,maxhomop=8,qwindowsize=50,qwindowaverage=35,oligos=/panfs/roc/groups/8/cmstaley/cmstaley/V5V6.oligos,pdiffs=2,processors=8)
##Combine high-quality reads to make .fasta, .groups, and .names files. List every sample's .trim.fasta (dash-separated) in the same order as its group name.##
#merge.files(input=sample.trim.fasta-sample.trim.fasta,output=study.fasta)
#make.group(fasta=sample.trim.fasta-sample.trim.fasta,groups=sample1-sample2,output=study.groups)
#unique.seqs(fasta=study.fasta)
#system(cp study.fasta /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
#system(cp study.groups /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
#count.groups(group=study.groups)
##Alignment database is based on SILVA ver. 132.##
#align.seqs(fasta=study.unique.fasta,reference=/panfs/roc/groups/8/cmstaley/cmstaley/silva.132.v5v6.fasta,flip=T,processors=8)
#summary.seqs(fasta=study.unique.align,name=study.names,processors=8)
##Remove sequences that do not span the target alignment region (screen.seqs start/end), then filter the alignment columns (filter.seqs).##
#screen.seqs(fasta=study.unique.align,name=study.names,group=study.groups,start=25287,end=33183,processors=8)
#filter.seqs(fasta=study.unique.good.align,vertical=T,trump=.,processors=8)
#count.groups(group=study.good.groups)
#unique.seqs(fasta=study.unique.good.filter.fasta,name=study.good.names)
##Merge near-identical sequences as likely sequence errors. Note: diffs=2 is a count of mismatches, not a percentage.##
#pre.cluster(fasta=study.unique.good.filter.unique.fasta,name=study.unique.good.filter.names,group=study.good.groups,diffs=2,processors=6)
#system(mv study.unique.good.filter.unique.precluster.fasta study.work.fasta)
#system(mv study.unique.good.filter.unique.precluster.names study.work.names)
#system(mv study.good.groups study.work.groups)
##Identify and remove chimeras. NOTE(review): the .accnos file name written by chimera.uchime may differ between mothur versions - confirm it matches the accnos= argument below.##
#chimera.uchime(fasta=study.work.fasta,name=study.work.names,group=study.work.groups,dereplicate=T,processors=8)
#remove.seqs(accnos=study.work.uchime.accnos,fasta=study.work.fasta,name=study.work.names,group=study.work.groups)
#count.groups(group=study.work.pick.groups)
#system(cp study.work.pick.fasta /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
#system(cp study.work.pick.groups /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
#system(cp study.work.pick.names /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
##Subsampling to size=11000 reads per sample (persample=T); adjust size to suit the count.groups output above.##
#sub.sample(fasta=study.work.pick.fasta,name=study.work.pick.names,group=study.work.pick.groups,size=11000,persample=T)
#count.groups(group=study.work.pick.subsample.groups)
#system(mv study.work.pick.subsample.fasta study.final.fasta)
#system(mv study.work.pick.subsample.names study.final.names)
#system(mv study.work.pick.subsample.groups study.final.groups)
#system(cp study.final.fasta /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
#system(cp study.final.groups /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
#system(cp study.final.names /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
##OTU clustering (furthest-neighbor, 0.03 cutoff).##
#dist.seqs(fasta=study.final.fasta,cutoff=0.03,processors=8)
#cluster(column=study.final.dist,name=study.final.names,cutoff=0.03,method=furthest)
#system(cp study.final.fn.list /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
#make.shared(list=study.final.fn.list,group=study.final.groups,label=0.03)
#system(cp study.final.fn.shared /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
##Taxonomic classification. RDP database is ver. 16.##
#classify.seqs(fasta=study.final.fasta,name=study.final.names,template=/panfs/roc/groups/8/cmstaley/cmstaley/trainset16_022016.rdp.fasta,taxonomy=/panfs/roc/groups/8/cmstaley/cmstaley/trainset16_022016.rdp.tax,processors=8)
#classify.otu(list=study.final.fn.list,name=study.final.names,taxonomy=study.final.rdp.wang.taxonomy,reftaxonomy=/panfs/roc/groups/8/cmstaley/cmstaley/trainset16_022016.rdp.tax,cutoff=60,label=0.03)
#system(cp study.final.rdp.wang.taxonomy /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
#system(mv study.final.fn.0.03.cons.taxonomy study.final.taxonomy)
#system(cp study.final.taxonomy /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
##Calculate phylip-formatted Bray-Curtis distance matrix, sample dendrogram (tree.shared clusters samples; it is not a phylogenetic tree of sequences), and PCoA.##
#dist.shared(shared=study.final.fn.shared,calc=braycurtis)
#tree.shared(shared=study.final.fn.shared,calc=braycurtis)
#system(mv study.final.fn.braycurtis.0.03.lt.dist study.final.lt.dist)
#system(mv study.final.fn.braycurtis.0.03.tre study.final.tre)
#pcoa(phylip=study.final.lt.dist)
#system(cp study.final.lt.pcoa.axes /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
#system(cp study.final.lt.pcoa.loadings /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
#system(cp study.final.lt.dist /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
#system(cp study.final.tre /panfs/roc/groups/8/cmstaley/cmstaley/folder/)
##Alpha diversity summary (coverage, observed OTUs, Shannon, Chao).##
#summary.single(shared=study.final.fn.shared,calc=coverage-sobs-shannon-chao)
#system(cp study.final.fn.groups.summary /panfs/roc/groups/8/cmstaley/cmstaley/folder/)