# Author: Wang Zhenyu
# Date:   2021-5-27
# Project: xylan alleviates the disorder induced by a fiber-free diet (pig)
#
# Metagenomics workflow: decompress reads -> remove host reads (kneaddata)
# -> taxonomic profiling (MetaPhlAn3) -> assembly (MEGAHIT) -> gene
# prediction (prodigal) -> de-redundancy (cd-hit) -> gene abundance (salmon)
# -> CAZyme annotation (DIAMOND against dbCAN).
#
# NOTE(review): this file looks like several SLURM job scripts pasted one
# after another; each "#! /bin/bash" + "#SBATCH" header starts a new one.
# sbatch only honours #SBATCH lines that precede the first command, so each
# section presumably has to be submitted on its own - confirm with the author.

# ---------------------------------------------------------------------------
# 1.3.1 Decompress every .gz sequencing file in place (in its own folder)
# ---------------------------------------------------------------------------
#! /bin/bash
#SBATCH --job-name=gzip
#SBATCH --partition=cast
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=16gb
#SBATCH --output=gzip_%j.log
#SBATCH --qos normal

cleandata=~/WORKSPACE/rawdata/ff_xylan_pig/cleandata

# Sample names are column 1 of the metadata table; the header row is skipped.
mapfile -t filename < <(awk 'NR > 1 && NF {print $1}' "${cleandata}/metadata.txt")

cd "${cleandata}" || exit 1
for i in "${filename[@]}"; do
  gzip -d -- "${cleandata}/${i}"/*.gz
done

# ---------------------------------------------------------------------------
# Remove host (pig) reads from the sequencing data with kneaddata
# ---------------------------------------------------------------------------
#! /bin/bash
#SBATCH --job-name=kneaddata_filter
#SBATCH --partition=cast
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=72
#SBATCH --mem=144gb
#SBATCH --output=kneaddata_filter_%j.log
#SBATCH --qos normal

module load jdk
module load bowtie2

cleandata=~/WORKSPACE/rawdata/ff_xylan_pig/cleandata
pig_genome=~/WORKSPACE/database/pig_genome/bowtie_pig_reference_genome
ff_xylan_pig=~/WORKSPACE/result/ff_xylan_pig
mapfile -t filename < <(awk 'NR > 1 && NF {print $1}' "${cleandata}/metadata.txt")

# NOTE(review): output goes to result/ff_xylan_pig/kneaddata_filter/, while
# every later step reads from rawdata/ff_xylan_pig/filterdata - presumably the
# paired fastq files are moved there by hand; confirm.
for i in "${filename[@]}"; do
  kneaddata \
    --input "${cleandata}/${i}/${i}_1.clean.fq" \
    --input "${cleandata}/${i}/${i}_2.clean.fq" \
    -db "${pig_genome}/pig_genome_11_1" \
    -p 72 --output "${ff_xylan_pig}/kneaddata_filter/" \
    --bypass-trim --bypass-trf
done

# ---------------------------------------------------------------------------
# MetaPhlAn3 taxonomic annotation
# ---------------------------------------------------------------------------
#! /bin/bash
#SBATCH --job-name=metaphlan3
#SBATCH --partition=cast
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=48
#SBATCH --mem=48gb
#SBATCH --output=metaphlan3_%j.log
#SBATCH --qos normal

module load MetaPhlAn3
source "$CONDA_SH"
conda activate MetaPhlAn

metaphlan3=~/WORKSPACE/result/ff_xylan_pig/metaphlan3
metaphlan3_database=~/WORKSPACE/database/metaphlan3_database
filterdata=~/WORKSPACE/rawdata/ff_xylan_pig/filterdata
mapfile -t filename < <(awk 'NR > 1 && NF {print $1}' \
  ~/WORKSPACE/rawdata/ff_xylan_pig/cleandata/metadata.txt)

# The per-sample tables are written into absolutecount/, so make sure it exists.
mkdir -p "${metaphlan3}/absolutecount"
# --bowtie2out is a relative path, so the working directory matters here.
cd "${metaphlan3}" || exit 1

for i in "${filename[@]}"; do
  # Fixed: the inputs used the undefined ${rawdata}; the host-filtered reads
  # live in ${filterdata}.
  metaphlan \
    "${filterdata}/${i}_1.clean_kneaddata_paired_1.fastq,${filterdata}/${i}_1.clean_kneaddata_paired_2.fastq" \
    --bowtie2out "${i}.bowtie2.bz2" --nproc 48 --input_type fastq \
    -t rel_ab_w_read_stats \
    -o "${metaphlan3}/absolutecount/${i}_ab_counts.txt" \
    --bowtie2db "${metaphlan3_database}"
done

# ---------------------------------------------------------------------------
# MEGAHIT co-assembly
# ---------------------------------------------------------------------------
#! /bin/bash
#SBATCH --job-name=megahit
#SBATCH --partition=cast
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=72
#SBATCH --mem=216gb
#SBATCH --output=megahit_%j.log
#SBATCH --qos normal

samplename=~/WORKSPACE/rawdata/ff_xylan_pig/filterdata
megahit=~/WORKSPACE/result/ff_xylan_pig/megahit

# Print a comma-separated list of host-filtered read files.
#   $1 - sample list file, one sample name per line
#   $2 - extended regex; only sample names matching it are used
#   $3 - mate number (1 or 2)
# Paths are built from the absolute ${samplename} directory, so the result no
# longer depends on the job being started from $HOME.
read_list() {
  awk -v dir="${samplename}" -v pat="$2" -v mate="$3" '
    $0 ~ pat {
      printf "%s%s/%s_1.clean_kneaddata_paired_%s.fastq", sep, dir, $0, mate
      sep = ","
    }' "$1"
}

time megahit -t 72 \
  -1 "$(read_list "${samplename}/samplename_XY_modify.txt" '4' 1)" \
  -2 "$(read_list "${samplename}/samplename_XY_modify.txt" '4' 2)" \
  -o "${megahit}" --min-contig-len 300 --min-count 2

time megahit -t 72 \
  -1 "$(read_list "${samplename}/samplename_FF_modify.txt" '1' 1)" \
  -2 "$(read_list "${samplename}/samplename_FF_modify.txt" '1' 2)" \
  -o "${megahit}/FF" --min-contig-len 300 --min-count 2

## Merge the assembly results
# NOTE(review): *.fa only matches files directly inside ${megahit}; the second
# assembly writes into ${megahit}/FF, so its contigs are presumably copied
# here by hand before this step - confirm. On a re-run the glob also matches
# merge_contig.fa / unique_contig.fa from the previous run.
cd "${megahit}" || exit 1
cat *.fa > merge_contig.fa
# seqkit rename makes duplicated sequence IDs unique.
seqkit rename merge_contig.fa > unique_contig.fa

# ---------------------------------------------------------------------------
# 1.5 Gene prediction
# ---------------------------------------------------------------------------
#! /bin/bash
#SBATCH --job-name=prodigal
#SBATCH --partition=cast
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=72
#SBATCH --mem=72gb
#SBATCH --output=prodigal_%j.log
#SBATCH --qos normal

megahit=~/WORKSPACE/result/ff_xylan_pig/megahit
prodigal=~/WORKSPACE/result/ff_xylan_pig/prodigal
mkdir -p "${prodigal}"

# Fixed: the input was merge_contig_unique.fa, but the merge step above
# writes unique_contig.fa.
time prodigal -i "${megahit}/unique_contig.fa" \
  -d "${prodigal}/gene.fa" \
  -o "${prodigal}/gene.gff" \
  -a "${prodigal}/protein.faa" -p meta -f gff

# Count predicted genes (recorded result: 6709811)
grep -c '>' "${prodigal}/gene.fa"
# Count complete genes (recorded result: 2886465)
grep -c 'partial=00' "${prodigal}/gene.fa"
# Extract the IDs of the complete genes
grep 'partial=00' "${prodigal}/gene.fa" | cut -f1 -d ' ' | sed 's/>//' \
  > "${prodigal}/full_length.id"
## Filter the gene set down to the complete-gene IDs
seqkit grep -f "${prodigal}/full_length.id" "${prodigal}/gene.fa" \
  > "${prodigal}/full_length.fa"
## Basic statistics of the complete-gene set
seqkit stat "${prodigal}/full_length.fa"
# Translate to protein. Fixed: paths were relative to the current directory
# (./full_length.fa); the next step reads ${prodigal}/full_length_protein.fa.
transeq -sequence "${prodigal}/full_length.fa" \
  -outseq "${prodigal}/full_length_protein.fa" -trim Y -clean

# ---------------------------------------------------------------------------
# Gene de-redundancy
# aS: coverage, c: identity, G: local alignment, g: best cluster,
# T: threads, M: memory (0 = unlimited)
# 20 thousand genes take ~2 min, 20 million would need ~2000 h;
# multithreading speeds this up.
# ---------------------------------------------------------------------------
#! /bin/bash
#SBATCH --job-name=cd-hit-est
#SBATCH --partition=cast
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=72
#SBATCH --mem=72gb
#SBATCH --output=cd-hit-est_%j.log
#SBATCH --qos normal

prodigal=~/WORKSPACE/result/ff_xylan_pig/prodigal
cd_hit=~/WORKSPACE/result/ff_xylan_pig/cd_hit
mkdir -p "${cd_hit}"

time cd-hit -i "${prodigal}/full_length_protein.fa" \
  -o "${cd_hit}/cluster_protein_90.fa" \
  -aS 0.9 -c 0.90 -G 0 -g 0 -T 36 -M 0

## Strip the "_1" suffix from the sequence names
sed -i 's/_1 / /' "${cd_hit}/cluster_protein_90.fa"
## Extract the IDs of the non-redundant gene set (protein sequences)
grep "^>" "${cd_hit}/cluster_protein_90.fa" | cut -d '#' -f1 | sed 's/>//' > id.txt
## Pull the matching nucleotide sequences by ID
seqtk subseq "${prodigal}/full_length.fa" id.txt \
  > "${cd_hit}/cluster_nucleotide_90.fa"

# ---------------------------------------------------------------------------
# 1.7 Gene abundance
# Build the index: -t sequences, -i index (~10 s)
# ---------------------------------------------------------------------------
#! /bin/bash
#SBATCH --job-name=salmon
#SBATCH --partition=cast
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=36
#SBATCH --mem=72gb
#SBATCH --output=salmon_%j.log
#SBATCH --qos normal

cd_hit=~/WORKSPACE/result/ff_xylan_pig/cd_hit
salmon=~/WORKSPACE/result/ff_xylan_pig/salmon
mkdir -p "${salmon}"

time salmon index -t "${cd_hit}/cluster_nucleotide_90.fa" -p 36 \
  -i "${salmon}/index"

## Gene quantification
#! /bin/bash
#SBATCH --job-name=salmon
#SBATCH --partition=cast
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=36
#SBATCH --mem=72gb
#SBATCH --output=salmon_%j.log
#SBATCH --qos normal

# Fixed: the index path pointed at result/ff_xylan_result/..., but the index
# is built under result/ff_xylan_pig/salmon/index.
index=~/WORKSPACE/result/ff_xylan_pig/salmon/index
filterdata=~/WORKSPACE/rawdata/ff_xylan_pig/filterdata
mapfile -t filename < <(awk 'NR > 1 && NF {print $1}' \
  ~/WORKSPACE/rawdata/ff_xylan_pig/cleandata/metadata.txt)

for i in "${filename[@]}"; do
  # -p lowered from 48 to 36 to match --cpus-per-task of this section.
  salmon quant \
    -i "${index}" -l A -p 36 --meta \
    -1 "${filterdata}/${i}_1.clean_kneaddata_paired_1.fastq" \
    -2 "${filterdata}/${i}_1.clean_kneaddata_paired_2.fastq" \
    -o ~/WORKSPACE/result/ff_xylan_pig/salmon/"${i}".quant
done

## Merge the per-sample quantification results
mkdir -p ~/WORKSPACE/result/ff_xylan_pig/salmon/result
## Relative abundance table (written to gene.TPM)
salmon quantmerge --quants ~/WORKSPACE/result/ff_xylan_pig/salmon/*.quant \
  -o ~/WORKSPACE/result/ff_xylan_pig/salmon/result/gene.TPM
## Gene read-count table
salmon quantmerge --quants ~/WORKSPACE/result/ff_xylan_pig/salmon/*.quant \
  --column NumReads \
  -o ~/WORKSPACE/result/ff_xylan_pig/salmon/result/gene.count

# ---------------------------------------------------------------------------
# dbCAN2 carbohydrate-active enzyme annotation
# ---------------------------------------------------------------------------
## Build the DIAMOND database
# NOTE(review): --db is relative, so the database lands in the current
# directory; the annotation step below looks for it in the dbCAN database
# directory - presumably this is run from there; confirm.
time diamond makedb \
  --in /home/wangjunjun/WORKSPACE/database/dbCAN_database/CAZyDB.07312020.fa \
  --db CAZyDB.07312020

## Functional annotation
#! /bin/bash
#SBATCH --job-name=dbCAN_diamond
#SBATCH --partition=cast
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=36
#SBATCH --mem=72gb
#SBATCH --output=dbCAN_diamond_%j.log
#SBATCH --qos normal

dbCAN=~/WORKSPACE/database/dbCAN_database
cd_hit=~/WORKSPACE/result/ff_xylan_pig/cd_hit
dbcan2=~/WORKSPACE/result/ff_xylan_pig/dbcan2
salmon_result=~/WORKSPACE/result/ff_xylan_pig/salmon/result
mkdir -p "${dbcan2}"

# Fixed: the query used CAZyDB.07312018, but the database built above is
# CAZyDB.07312020.
time diamond blastp --db "${dbCAN}/CAZyDB.07312020" \
  --query "${cd_hit}/cluster_protein_90.fa" \
  --outfmt 6 --threads 36 --max-target-seqs 1 --quiet -e 1e-5 --sensitive \
  --out "${dbcan2}/dbCAN_diamond"

## Extract the gene list: gene name plus one field of the '|'-separated hit
## name, with a "Name<TAB>CAZ" header row prepended
cut -f 1,2 "${dbcan2}/dbCAN_diamond" \
  | uniq \
  | sed 's/|/\t/g' \
  | cut -f 1,3 \
  | cut -f 1,2 -d ' ' \
  | sed '1 i Name\tCAZ' \
  > "${dbcan2}/gene_list"

## Join the gene list with the gene abundance (TPM) table; rows whose gene
## has no entry in the gene list end in an empty field and are dropped
awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$2} NR>FNR{print $0,a[$1]}' \
  "${dbcan2}/gene_list" "${salmon_result}/gene.TPM" \
  | sed '/\t$/d' > "${dbcan2}/gene_list_TPM"

## Join the gene list with the gene counts table (same filtering)
awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$2} NR>FNR{print $0,a[$1]}' \
  "${dbcan2}/gene_list" "${salmon_result}/gene.count" \
  | sed '/\t$/d' > "${dbcan2}/gene_list_count"