#1_fastqc conda activate wes cd ~/wes_cancer/project cat config | while read id do fastqc --outdir ./3.qc/raw_qc/ --threads 10 ./1.raw_fq/${id}*.fastq.gz >> ./3.qc/raw_qc/${id}_fastqc.log 2>&1 done multiqc ./3.qc/raw_qc/*zip -o ./3.qc/raw_qc/multiqc ## trim_galore.sh cat config | while read id do fq1=./1.raw_fq/${id}_1.fastq.gz fq2=./1.raw_fq/${id}_2.fastq.gz trim_galore --paired -q 28 --phred33 --length 30 --stringency 3 --gzip --cores 8 -o ./2.clean_fq $fq1 $fq2 >> ./2.clean_fq/${id}_trim.log 2>&1 done nohup bash trim_galore.sh & cat config | while read id do fastqc --outdir ./3.qc/clean_qc/ --threads 16 ./2.clean_fq/${id}*.fq.gz >> ./3.qc/clean_qc/${id}_fastqc.log 2>&1 done multiqc ./3.qc/clean_qc/*zip -o ./3.qc/clean_qc/multiqc #2_bwa conda activate wes cd ~/wes_cancer/data gunzip Homo_sapiens_assembly19.fasta.gz time bwa index -a bwtsw -p gatk_hg19 ~/wes_cancer/data/Homo_sapiens_assembly19.fasta cd ~/wes_cancer/project ## bwa.sh INDEX=~/wes_cancer/data/gatk_hg19 cat config | while read id do echo "start bwa for ${id}" `date` fq1=./2.clean_fq/${id}_1_val_1.fq.gz fq2=./2.clean_fq/${id}_2_val_2.fq.gz bwa mem -M -t 16 -R "@RG\tID:${id}\tSM:${id}\tLB:WXS\tPL:Illumina" ${INDEX} ${fq1} ${fq2} | samtools sort -@ 10 -m 1G -o ./4.align/${id}.bam - echo "end bwa for ${id}" `date` done samtools index case1_biorep_A_techrep.bam samtools view -h case1_biorep_A_techrep.bam chr17 | samtools view -Sb - > small.bam samtools index small.bam ## stats.sh cat config | while read id do bam=./4.align/${id}.bam samtools stats -@ 16 --reference ~/wes_cancer/data/Homo_sapiens_assembly19.fasta ${bam} > ./4.align/stats/${id}.stat plot-bamstats -p ./4.align/stats/${id} ./4.align/stats/${id}.stat done cat config | while read id do qualimap bamqc --java-mem-size=10G -gff ~/wes_cancer/data/hg19.exon.bed -nr 100000 -nw 500 -nt 16 -bam ./4.align/${id}.bam -outdir ./4.align/qualimap/${id} done #3_facets #! bin/bash InputFileDir=./4.align/ Sample=` ls ./4.align/ | awk -F '.' '{print $1}' | uniq | sed -s 's/.$//g' | uniq ` OutputDir=./5.pileup_output for i in $Sample;do InputFile1=$InputFileDir/$i\N.sort.markdup.bam InputFile2=$InputFileDir/$i\O.sort.markdup.bam InputFile3=$InputFileDir/$i\T.sort.markdup.bam echo $InputFile1 echo $InputFile2 echo $InputFile3 perl ~/biosoft/PileUp/snp-pileup.pl $InputFile1 $InputFile2 $OutputDir/$i\_NO_output.facets perl ~/biosoft/PileUp/snp-pileup.pl $InputFile1 $InputFile3 $OutputDir/$i\_NT_output.facets done ####---------------------R library(facets) set.seed(114) AllFilePath <- list.files('~/wes_cancer/project/5.pileup_output') AllFileDir <- list.dirs('~/wes_cancer/project/5.pileup_output') AllFilePath <- paste(AllFileDir, AllFilePath, sep='/') AllFilePath final_df <- matrix(nrow=length(AllFilePath), ncol=3) final_df <- as.data.frame(final_df) for (i in 1:length(AllFilePath)) { xx=preProcSample(AllFilePath[i]) oo=procSample(xx,cval=200) fit=emcncf(oo) df <- c(fit$dipLogR,fit$ploidy, fit$purity) final_df[i,] <- df } colnames(final_df) <- c('dipLogR','ploidy','purity') AllFilePath <- list.files('~/wes_cancer/project/5.pileup_output') AllFilePath <- gsub('(G.._..)_.*','\\1',AllFilePath) rownames(final_df) <- AllFilePath write.csv(final_df,file = 'facet_result.csv', row.names = T, col.names = T, quote = F) #4_GATK GATK=~/wes_cancer/biosoft/gatk-4.1.4.1/gatk cat config | while read id do BAM=./4.align/${id}.bam if [ ! -f ./5.gatk/ok.${id}_marked.status ] then echo "start MarkDuplicates for ${id}" `date` $GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" MarkDuplicates \ -I ${BAM} \ --REMOVE_DUPLICATES=true \ -O ./5.gatk/${id}_marked.bam \ -M ./5.gatk/${id}.metrics \ 1>./5.gatk/${id}_log.mark 2>&1 if [ $? -eq 0 ] then touch ./5.gatk/ok.${id}_marked.status fi echo "end MarkDuplicates for ${id}" `date` samtools index -@ 16 -m 4G -b ./5.gatk/${id}_marked.bam ./5.gatk/${id}_marked.bai fi done GATK=~/wes_cancer/biosoft/gatk-4.1.4.1/gatk snp=~/wes_cancer/data/dbsnp_146.hg19.vcf.gz indel=~/wes_cancer/data/Mills_and_1000G_gold_standard.indels.hg19.vcf.gz ref=~/wes_cancer/data/Homo_sapiens_assembly19.fasta cat config | while read id do if [ ! -f ./5.gatk/${id}_bqsr.bam ] then echo "start BQSR for ${id}" `date` $GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" BaseRecalibrator \ -R $ref \ -I ./5.gatk/${id}_marked.bam \ --known-sites ${snp} \ --known-sites ${indel} \ -O ./5.gatk/${id}_recal.table \ 1>./5.gatk/${id}_log.recal 2>&1 $GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" ApplyBQSR \ -R $ref \ -I ./5.gatk/${id}_marked.bam \ -bqsr ./5.gatk/${id}_recal.table \ -O ./5.gatk/${id}_bqsr.bam \ 1>./5.gatk/${id}_log.ApplyBQSR 2>&1 echo "end BQSR for ${id}" `date` fi done GATK=~/wes_cancer/biosoft/gatk-4.1.4.1/gatk snp=~/wes_cancer/data/dbsnp_146.hg19.vcf.gz indel=~/wes_cancer/data/Mills_and_1000G_gold_standard.indels.hg19.vcf.gz ref=~/wes_cancer/data/Homo_sapiens_assembly19.fasta bed=~/wes_cancer/data/hg19.exon.bed cat config | while read id do echo "start HC for ${id}" `date` $GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" HaplotypeCaller -ERC GVCF \ -R ${ref} \ -I ./5.gatk/${id}_bqsr.bam \ --dbsnp ${snp} \ -L ${bed} \ -O ./5.gatk/${id}_raw.vcf \ 1>./5.gatk/${id}_log.HC 2>&1 echo "end HC for ${id}" `date` done cd ./5.gatk/gvcf for chr in chr{1..22} chrX chrY chrM do time $GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" GenomicsDBImport \ -R ${ref} \ $(ls ./*raw.vcf | awk '{print "-V "$0" "}') \ -L ${chr} \ --genomicsdb-workspace-path gvcfs_${chr}.db time $GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" GenotypeGVCFs \ -R ${ref} \ -V gendb://gvcfs_${chr}.db \ -O gvcfs_${chr}.vcf done $GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" GatherVcfs \ $(for i in {1..22} X Y M;do echo "-I gvcfs_chr${i}.vcf" ;done) \ -O merge.vcf ## mutect.sh GATK=~/wes_cancer/biosoft/gatk-4.1.4.1/gatk ref=~/wes_cancer/data/Homo_sapiens_assembly19.fasta bed=~/wes_cancer/data/hg19.exon.bed cat config2 | while read id do arr=(${id}) sample=${arr[1]} T=./5.gatk/${arr[1]}_bqsr.bam N=./5.gatk/${arr[0]}_bqsr.bam echo "start Mutect2 for ${id}" `date` $GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" Mutect2 -R ${ref} \ -I ${T} -tumor $(basename "$T" _bqsr.bam) \ -I ${N} -normal $(basename "$N" _bqsr.bam) \ -L ${bed} \ -O ./6.mutect/${sample}_mutect2.vcf $GATK FilterMutectCalls \ -R ${ref} \ -V ./6.mutect/${sample}_mutect2.vcf \ -O ./6.mutect/${sample}_somatic.vcf echo "end Mutect2 for ${id}" `date` cat ./6.mutect/${sample}_somatic.vcf | perl -alne '{if(/^#/){print}else{next unless $F[6] eq "PASS";next if $F[0] =~/_/;print } }' > ./6.mutect/${sample}_filter.vcf done #5_ANNOVAR cd ~/wes_cancer/biosoft cd annovar nohup ./annotate_variation.pl -downdb -webfrom annovar gnomad_genome --buildver hg19 humandb/ >down.log 2>&1 & $ ./annotate_variation.pl -downdb -buildver hg19 -webfrom annovar avdblist hg19_list/ $ cat hg19_list/hg19_avdblist.txt cat config | while read id do echo "start ANNOVAR for ${id} " `date` ~/biosoft/annovar/table_annovar.pl ./6.mutect/${id}_filter.vcf ~/biosoft/annovar/humandb/ \ -buildver hg19 \ -out ./7.annotation/annovar/${id} \ -remove \ -protocol refGene,knownGene,clinvar_20170905 \ -operation g,g,f \ -nastring . \ -vcfinput echo "end ANNOVAR for ${id} " `date` done #6_maf cat config | while read id do grep -v '^Chr' ./7.annotation/annovar/${id}.hg19_multianno.txt | cut -f 1-20 | awk -v T=${id} -v N=${id:0:5}_germline '{print $0"\t"T"\t"N}' >./7.annotation/annovar/${id}.annovar.vcf done head -1 ./7.annotation/annovar/case1_biorep_A_techrep.hg19_multianno.txt| sed 's/Otherinfo/Tumor_Sample_Barcode\tMatched_Norm_Sample_Barcode/' >./7.annotation/annovar/header cat ./7.annotation/annovar/header ./7.annotation/annovar/*annovar.vcf >./7.annotation/annovar/annovar_merge.vcf ###R rm(list = ls()) require(maftools) options(stringsAsFactors = F) ## annovar annovar.laml <- annovarToMaf(annovar = "./7.annotation/annovar/annovar_merge.vcf", refBuild = 'hg19', tsbCol = 'Tumor_Sample_Barcode', table = 'refGene', MAFobj = T) #7_maftools----------R rm(list = ls()) library(maftools) library(data.table) library(ggplot2) mergemafData <- read.maf(maf = 'snv.maf') #dir.create("maf") plotmafSummary(maf = mergemafData, rmOutlier = TRUE, addStat = 'median', dashboard = TRUE, titvRaw = FALSE) oncoplot(maf = mergemafData, top = 50, fontSize = 0.4, showTumorSampleBarcodes = T, sampleOrder = mergemafData@clinical.data$Tumor_Sample_Barcode, removeNonMutated = FALSE)