### Comparison of sequencing data processing pipelines and application to underrepresented human populations: Additional File 3.
### Gwenna Breton
### Scripts by pipeline
###
# NOTE: these are command templates. File names such as sampleID_Llane or
# input_sorted.bam are per-sample placeholders to be substituted before running.

###
### Steps common to all pipelines
###

## Step 0.A: Mapping

# Step 0.A.1: Mapping: Auton et al 2015 and Mallick et al 2016 samples
#Program version: picard/1.126, bwakit/0.7.12
# The paired FASTQ files are merged into one stream by seqtk mergepe, mapped by
# bwa mem (-p, reading stdin via "-"), post-processed by bwa-postalt.js and
# coordinate-sorted by Picard SortSam. bwa mem's stderr goes to "logfile".
/sw/apps/bioinfo/bwakit/0.7.12/bwa.kit/seqtk mergepe input_1.fastq.gz input_2.fastq.gz \
  | /sw/apps/bioinfo/bwakit/0.7.12/bwa.kit/bwa mem -p -t4 -R'@RG\tID:sampleID_Llane\tSM:sampleID\tPL:ILLUMINA\tLB:lib1\tPU:Lane' "${reference_hg38}" - 2> logfile \
  | /sw/apps/bioinfo/bwakit/0.7.12/bwa.kit/k8 /sw/apps/bioinfo/bwakit/0.7.12/bwa.kit/bwa-postalt.js -p sampleID_Llane.hla /proj/b2012165/nobackup/private/Seq_project_cont/reference_hg38/GRCh38_full_analysis_set_plus_decoy_hla.fa.alt \
  | java -Xmx7g -jar /pica/sw/apps/bioinfo/picard/1.126/milou/picard.jar SortSam INPUT=/dev/stdin OUTPUT=sampleID_Llane_sorted.bam SORT_ORDER=coordinate
java -Xmx7g -jar /pica/sw/apps/bioinfo/picard/1.126/milou/picard.jar BuildBamIndex INPUT=sampleID_Llane_sorted.bam

# Step 0.A.2: Meyer 2012 samples
#Revert mapped BAM to unmapped BAM
#Program version: picard/1.126
java -Xmx28g -jar /pica/sw/apps/bioinfo/picard/1.126/milou/picard.jar RevertSam \
  VALIDATION_STRINGENCY=LENIENT \
  I="${infolder}/${infile}.bam" \
  O="${infile}.revertsam.bam" \
  SANITIZE=true \
  MAX_DISCARD_FRACTION=0.005 \
  ATTRIBUTE_TO_CLEAR=BC \
  ATTRIBUTE_TO_CLEAR=XD \
  ATTRIBUTE_TO_CLEAR=AM \
  ATTRIBUTE_TO_CLEAR=SM \
  ATTRIBUTE_TO_CLEAR=H0 \
  ATTRIBUTE_TO_CLEAR=H1 \
  ATTRIBUTE_TO_CLEAR=H2 \
  ATTRIBUTE_TO_CLEAR=XC \
  SORT_ORDER=queryname \
  RESTORE_ORIGINAL_QUALITIES=true \
  REMOVE_DUPLICATE_INFORMATION=true \
  REMOVE_ALIGNMENT_INFORMATION=true

#Shuffle BAM, revert to FASTQ, and map
#Program version: samtools/1.1, picard/1.126, bwakit/0.7.12
# NOTE(review): ${IN} is presumably the RevertSam output above
# (${infile}.revertsam.bam) - confirm.
samtools bamshuf -Ou "${IN}" "${SNIC_TMP}/shuf.bam" \
  | samtools bam2fq /dev/stdin \
  | /sw/apps/bioinfo/bwakit/0.7.12/bwa.kit/bwa mem -t4 -p -R'@RG\tID:HGDPID\tSM:HGDP_INFILE\tPL:ILLUMINA\tLB:lib1\tPU:Lane' /proj/b2012165/nobackup/private/Seq_project_cont/reference_hg38/GRCh38_full_analysis_set_plus_decoy_hla.fa - 2> HGDPid.log.bwamem \
  | /sw/apps/bioinfo/bwakit/0.7.12/bwa.kit/k8 /sw/apps/bioinfo/bwakit/0.7.12/bwa.kit/bwa-postalt.js -p HGDPID.hla /proj/b2012165/nobackup/private/Seq_project_cont/reference_hg38/GRCh38_full_analysis_set_plus_decoy_hla.fa.alt \
  | java -Xmx28g -jar /pica/sw/apps/bioinfo/picard/1.126/milou/picard.jar SortSam INPUT=/dev/stdin OUTPUT=HGDPID_sorted.bam SORT_ORDER=coordinate
java -Xmx28g -jar /pica/sw/apps/bioinfo/picard/1.126/milou/picard.jar BuildBamIndex INPUT=HGDPID_sorted.bam

## Step 0.B: Create a BAM file with mapped reads and a BAM file with unmapped reads.
#Program version: samtools/1.1, picard/1.126
# -f 4 keeps only reads with the "unmapped" flag set; -F 4 excludes them.
samtools view -b -f 4 input_sorted.bam > output_sorted_unmapped.bam
java -Xmx7g -jar /pica/sw/apps/bioinfo/picard/1.126/milou/picard.jar BuildBamIndex INPUT=output_sorted_unmapped.bam
samtools view -b -F 4 input_sorted.bam > output_sorted_mapped.bam
java -Xmx7g -jar /pica/sw/apps/bioinfo/picard/1.126/milou/picard.jar BuildBamIndex INPUT=output_sorted_mapped.bam

## Step 0.C: Mark duplicate reads
#Program version: picard/1.126
# Input is the mapped-reads BAM written in Step 0.B.
java -Xmx7g -jar /pica/sw/apps/bioinfo/picard/1.126/milou/picard.jar MarkDuplicates \
  INPUT=output_sorted_mapped.bam \
  OUTPUT=marked_duplicates_bam/dedup_perlane/sampleID_Llane.dedup.bam \
  METRICS_FILE=marked_duplicates_bam/dedup_perlane/sampleID_Llane_metrics.txt \
  CREATE_INDEX=true

###
### Pipeline 1
###

## Step 1.A: Base Quality Score Recalibration (BQSR) step with recommended reference dataset
# Performed on the main chromosomal contigs (e.g. chr1), on the "random" contigs (e.g. chr1_KI270706v1_random) and on the contigs not attributed to a given chromosome (e.g. chrUn_KI270302v1). "alt" contigs are excluded. Y chromosome and mitochondrial contigs are excluded.
#Program version: GATK/3.5.0
# NOTE: dotted names such as ${output.bqsred} or ${ind1.bqsred} are file-root
# placeholders, not valid shell variable names, and a "[...]" line stands for
# the omitted per-individual arguments. Substitute both before running.

# Step 1.A.1: recalibration table
java -Xmx28g -jar "${GATK_HOME}/GenomeAnalysisTK.jar" -T BaseRecalibrator \
  -R "${reference_hg38}" \
  -I "${input}.bam" \
  -knownSites "${dbsnp144}.vcf.gz" \
  -o "${output.bqsred}.table" \
  -L "${interval}" \
  -nct 4

# Step 1.A.2: apply the recalibration
java -Xmx28g -jar "${GATK_HOME}/GenomeAnalysisTK.jar" -T PrintReads \
  -R "${reference_hg38}" \
  -I "${input}.bam" \
  -BQSR "${output.bqsred}.table" \
  -o "${output.bqsred}.bam" \
  -L "${interval}" \
  -nct 4 \
  --disable_indel_quals

## Step 1.B: generate GVCF file for each individual (HaplotypeCaller)
# Performed on autosomes main contigs only (e.g. for chromosome 1 on contig "chr1")
# For this and following steps we had to change to GATK/3.7 because of a bug in joint genotyping all sites GVCF with GATK/3.5.
#Program version: GATK/3.7
# One GVCF is written per autosome (chr1..chr22) for the recalibrated BAM.
for CHR in {1..22}; do
  java -Xmx32g -jar /sw/apps/bioinfo/GATK/3.7/GenomeAnalysisTK.jar -T HaplotypeCaller \
    -nct 4 \
    -R "${reference_hg38}" \
    -I "${output.bqsred}.bam" \
    --genotyping_mode DISCOVERY \
    -stand_call_conf 30 \
    -ploidy 2 \
    --emitRefConfidence BP_RESOLUTION \
    --dbsnp "${dbsnp150}" \
    -L "chr${CHR}" \
    -G Standard -G AS_Standard -G StandardHC \
    -o "${output.bqsred}_${CHR}.g.vcf.gz"
done

## Step 1.C: joint genotyping of the 28 individuals (GenotypeGVCFs)
#Program version: GATK/3.7
# "[...]" below stands for the -V lines of individuals 3 to 27.
for CHR in {1..22}; do
  java -Xmx14g -jar "${GATK_HOME}/GenomeAnalysisTK.jar" -T GenotypeGVCFs \
    -R "${reference_hg38}" \
    --dbsnp "${dbsnp150}" \
    -L "chr${CHR}" \
    -allSites \
    -V "${ind1.bqsred}_${CHR}.g.vcf.gz" \
    -V "${ind2.bqsred}_${CHR}.g.vcf.gz" \
    [...]
    -V "${ind28.bqsred}_${CHR}.g.vcf.gz" \
    -G Standard -G AS_Standard \
    -o "28ind.HCBPresolution.GenotypeGVCFsallsites.${CHR}.vcf.gz"
done

## Step 1.D: Callset refinement with Variant Quality Score Recalibration (VQSR) of SNP. We used the recommended tranche threshold for human data.
# Program version: GATK/3.7
# NOTE: ${out} is the output root shared by both steps below. The dotted name
# ${inroot.vqsredSNP99.9} is a file-root placeholder, not a valid shell variable
# name, and "[...]" stands for omitted -input lines. Substitute before running.

# Step 1.D.1: Variant Recalibrator
hapmap=/hg38bundle/hapmap_3.3.hg38.vcf.gz
omni=/hg38bundle/1000G_omni2.5.hg38.vcf.gz
tusenG=/hg38bundle/1000G_phase1.snps.high_confidence.hg38.vcf.gz
inroot=28ind.HCBPresolution.GenotypeGVCFsallsites
# The model is built once from all 22 per-autosome VCFs ("[...]" = chr3..chr21).
# The reference is ${reference_hg38}, as in every other command of this file.
# NOTE(review): dbSNP build 151 is used here while Steps 1.B/1.C use build 150;
# confirm this is intended.
java -Xmx24g -jar "${GATK_HOME}/GenomeAnalysisTK.jar" \
  -T VariantRecalibrator \
  -R "${reference_hg38}" \
  -input "${inroot}.1.vcf.gz" \
  -input "${inroot}.2.vcf.gz" \
  [...]
  -input "${inroot}.22.vcf.gz" \
  -resource:hapmap,known=false,training=true,truth=true,prior=15.0 "${hapmap}" \
  -resource:omni,known=false,training=true,truth=true,prior=12.0 "${omni}" \
  -resource:1000G,known=false,training=true,truth=false,prior=10.0 "${tusenG}" \
  -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 "${dbsnp151}" \
  -an DP \
  -an QD \
  -an FS \
  -an SOR \
  -an MQRankSum \
  -an ReadPosRankSum \
  -an MQ \
  -mode SNP \
  -tranche 100.0 -tranche 99.95 -tranche 99.94 -tranche 99.93 -tranche 99.92 -tranche 99.91 -tranche 99.9 -tranche 99.0 -tranche 95.0 -tranche 90.0 \
  -recalFile "${out}.recal" \
  -tranchesFile "${out}.tranches" \
  -nt 4

# Step 1.D.2: Apply recalibration
# -recalFile and -tranchesFile must be the two files written by Step 1.D.1
# (${out}.recal and ${out}.tranches respectively).
for CHR in {1..22}; do
  java -Xmx18g -jar "${GATK_HOME}/GenomeAnalysisTK.jar" \
    -T ApplyRecalibration \
    -R "${reference_hg38}" \
    -input "${inroot}.${CHR}.vcf.gz" \
    -mode SNP \
    --ts_filter_level 99.9 \
    -recalFile "${out}.recal" \
    -tranchesFile "${out}.tranches" \
    -o "${inroot.vqsredSNP99.9}.${CHR}.vcf.gz" \
    -nt 3 \
    -L "chr${CHR}"
done

###
### Pipeline 2
###

## Step 2.A: Realignment around indels
# Performed on the main chromosomal contigs (e.g. chr1), on the "random" contigs (e.g. chr1_KI270706v1_random) and on the contigs not attributed to a given chromosome (e.g. chrUn_KI270302v1). "alt" contigs are excluded.
#Program version: GATK/3.5.0
# NOTE: dotted names such as ${output.realigned} are file-root placeholders, not
# valid shell variable names. Substitute them before running.

# Step 2.A.1: identify regions which need to be realigned
java -Xmx7g -jar "${GATK_HOME}/GenomeAnalysisTK.jar" -T RealignerTargetCreator \
  -R "${reference_hg38}" \
  -I "${input}.bam" \
  -o "${input}_realignment_targets.list" \
  -L "${interval}"

# Step 2.A.2: realign
# Uses the target list written by Step 2.A.1.
java -Xmx7g -jar "${GATK_HOME}/GenomeAnalysisTK.jar" -T IndelRealigner \
  -R "${reference_hg38}" \
  -I "${input}.bam" \
  -targetIntervals "${input}_realignment_targets.list" \
  -o "${output.realigned}.bam" \
  -L "${interval}"

## Step 2.B: same as Step 1.A but with input ${output.realigned}.bam (output of Step 2.A) and output ${output.realigned.bqsred}.bam
## Step 2.C (generating GVCFs): same as Step 1.B but with input ${output.realigned.bqsred}.bam
## Step 2.D (joint genotyping): same as Step 1.C
## Step 2.F (callset refinement): same as Step 1.D
# NOTE(review): no Step 2.E is listed; confirm whether "2.F" should read "2.E".

###
### Pipeline 3
###

## Step 3.A: same as Step 2.A (Realignment around indels)

## Step 3.B: Variant calling on the output of Step 3.A
# Performed on the main chromosomal contigs (e.g. chr1), on the "random" contigs (e.g. chr1_KI270706v1_random) and on the contigs not attributed to a given chromosome (e.g. chrUn_KI270302v1). "alt" contigs are excluded. The autosomes, chromosome X, chromosome Y and the mitochondria are called separately. The ploidy parameter is set to 2 for the autosomes and 1 for the mitochondria. It is not set for the sex chromosomes. Command for the autosomes:
# Program version: GATK/3.5, vcftools/0.1.13, tabix/0.2.6
java -Xmx35g -jar "${GATK_HOME}/GenomeAnalysisTK.jar" -T HaplotypeCaller \
  -nct 5 \
  -R "${reference_hg38}" \
  -I "${output.realigned}.bam" \
  --genotyping_mode DISCOVERY \
  -stand_emit_conf 30 \
  -stand_call_conf 30 \
  -ploidy 2 \
  -L "${interval}" \
  -o sampleID_directcall_rawcalls_1-22.vcf

# The resulting VCFs are compressed with bgzip, the VCF for 1-22 and for chromosome X are concatenated and the resulting VCF is indexed.
bgzip sampleID_directcall_rawcalls_1-22.vcf
vcf-concat sampleID_directcall_rawcalls_1-22.vcf.gz sampleID_directcall_rawcalls_chrX.vcf.gz \
  | bgzip > sampleID_directcall_rawcalls_1-22X.vcf.gz
tabix sampleID_directcall_rawcalls_1-22X.vcf.gz

## Step 3.C: Variant calling on the output of BQSR with dbsnp as reference dataset
# Program version: GATK/3.5, vcftools/0.1.13, tabix/0.2.6
# Step 3.C.1: same as Step 1.A but with input: ${output.realigned}.bam (output of Step 2.A) and output: ${output.realigned.bqsred}.bam
# Step 3.C.2: Variant calling: same as Step 3.B but with input: ${output.realigned.bqsred}.bam (output of Step 3.C.1) and output: sampleID_callafterBQSRdb_rawcalls_1-22X.vcf.gz

## Step 3.D: Triple mask BQSR
# Performed on the main chromosomal contigs (e.g. chr1), on the "random" contigs (e.g. chr1_KI270706v1_random) and on the contigs not attributed to a given chromosome (e.g. chrUn_KI270302v1). "alt" contigs are excluded. Y chromosome and mitochondrial contigs are excluded.
#Program version: GATK/3.5.0
# NOTE: dotted names such as ${output.realigned.3maskbqsred} or
# ${output_Step_4.F} are file-root placeholders, not valid shell variable names,
# and a "[...]" line stands for omitted arguments. Substitute before running.

# Step 3.D.1: recalibration table
# Three known-sites masks: dbSNP (given as ${dbsnp144}.vcf.gz, as in Step 1.A.1),
# the direct calls of Step 3.B and the post-BQSR calls of Step 3.C.
java -Xmx42g -jar "${GATK_HOME}/GenomeAnalysisTK.jar" -T BaseRecalibrator \
  -nct 5 \
  -R "${reference_hg38}" \
  -I "${output.realigned}.bam" \
  -L "${interval}" \
  -knownSites "${dbsnp144}.vcf.gz" \
  -knownSites sampleID_directcall_rawcalls_1-22X.vcf.gz \
  -knownSites sampleID_callafterBQSRdb_rawcalls_1-22X.vcf.gz \
  -o "${output.realigned.3maskbqsred}.table"

# Step 3.D.2: apply the recalibration table
java -Xmx42g -jar "${GATK_HOME}/GenomeAnalysisTK.jar" -T PrintReads \
  -nct 5 \
  -R "${reference_hg38}" \
  -I "${output.realigned}.bam" \
  -L "${interval}" \
  -BQSR "${output.realigned.3maskbqsred}.table" \
  -o "${output.realigned.3maskbqsred}.bam" \
  --disable_indel_quals

## Step 3.E (generating GVCFs): same as Step 1.B but with input: ${output.realigned.3maskbqsred}.bam
## Step 3.F (joint genotyping): same as Step 1.C
## Step 3.G (callset refinement): same as Step 1.D

###
### Pipeline 4
###

## Steps 4.A to 4.E identical to Steps 3.A to 3.E (except for individuals for which data is processed by lanes then by individual).
## Step 4.F: same as Step 3.F but more individuals are included.

## Step 4.G: Extract the individuals present in the Pipelines 1, 2 and 3 to enable fair comparisons.
# Program version: GATK/3.7
# "[...]" below stands for the -sn lines of individuals 3 to 27.
for CHR in {1..22}; do
  java -Xmx6g -jar "${GATK_HOME}/GenomeAnalysisTK.jar" -T SelectVariants \
    --variant "${output_Step_4.F}.${CHR}.vcf.gz" \
    -R "${reference_hg38}" \
    -sn "${ind1_ID}" \
    -sn "${ind2_ID}" \
    [...]
    -sn "${ind28_ID}" \
    -trimAlternates \
    --out "${Pipeline4_subset28ind}.${CHR}.vcf.gz"
done

## Step 4.H (callset refinement): same as Step 1.D