#!/usr/bin/env bash
# Online Resource 4: Hepatitis E virus alignment mapping bioinformatics pipeline
#
# From high accuracy basecalled MinION reads to one aligned set of draft
# consensus sequences per barcode (assemble/contigs/<barcode>_aligned.fasta).
#
# Prerequisites:
#   * High accuracy basecalling has been performed using the MK1C MinION.
#   * An alignment FASTA file containing the reference sequences has been
#     downloaded from NCBI and saved as HEV_reference.fasta.
#   * conda environments named cutadapt, minimap2, canu, seqtk and mafft exist
#     (only the nanoplot environment is created by this script).
#
# Run from the directory that contains the basecalling/ folder.

# Stop at the first failing step so later steps never run on incomplete output.
# NOTE(review): -u is deliberately not set; conda activation scripts may
# reference unset variables -- confirm before tightening.
set -eo pipefail

# Load conda's shell functions once so that `conda activate` works in a script.
eval "$(conda shell.bash hook)"

# Minimum number of mapped reads a reference needs before a consensus is built.
# Possible to change this "baseline" parameter depending on what is deemed
# acceptable.
min_reads=1000

# Print a warning on stderr without stopping the pipeline.
warn() {
  printf 'warning: %s\n' "$*" >&2
}

#######################################
# Print the names of the references in a coverage report that have more than
# the given number of reads in column 4, skipping any name containing '#'
# (the header line).
# Arguments:
#   $1 - tab-separated coverage report written from `samtools coverage`
#   $2 - read-count threshold
# Outputs:
#   one reference name per line on stdout
#######################################
references_above() {
  awk -F'\t' -v min="$2" '$1 !~ /#/ && $4 + 0 > min + 0 { print $1 }' "$1"
}

# --- Summary plots ----------------------------------------------------------
# Go to the folder containing the high accuracy basecalling data and create a
# set of summary graphs about the data.
cd basecalling || exit 1

# Only create the environment when it does not exist yet, so re-runs work.
if ! conda env list | awk '$1 == "nanoplot" { found = 1 } END { exit !found }'; then
  conda create --name=nanoplot nanoplot=1.32.1
fi
conda activate nanoplot
NanoPlot -t 20 --summary sequencing_summary.txt --loglength \
  -o summary-plots-log-transformed
conda deactivate

# --- Barcode list -----------------------------------------------------------
# Create the list of barcodes to analyse, one barcode folder per line
# (presumably entries such as barcode01 or ./barcode01). This opens an
# interactive editor.
"${EDITOR:-nano}" HEV2mapping.lst

# Read the list once. A leading "./" and a trailing "/" are stripped literally;
# the previous sed 's:./::g' treated "." as a wildcard and could eat the last
# character of the barcode name.
barcodes=()
while read -r entry || [[ -n "$entry" ]]; do
  entry=${entry#./}
  entry=${entry%/}
  if [[ -n "$entry" ]]; then
    barcodes+=("$entry")
  fi
done < HEV2mapping.lst
if (( ${#barcodes[@]} == 0 )); then
  printf 'error: HEV2mapping.lst lists no barcodes\n' >&2
  exit 1
fi

# --- Merge reads ------------------------------------------------------------
# Concatenate (combine) the high accuracy basecalled fastq files of every
# barcode folder into one merged file. This gives a merged fastq file for each
# barcode.
for folder in ./barcode*/; do
  [[ -d "$folder" ]] || continue   # glob matched nothing
  folder=${folder%/}
  barcode=${folder#./}
  echo "$folder"
  echo "$barcode"
  cat "$folder"/*fastq.gz > "${barcode}_merged.fastq.gz"
done

# --- Primer trimming --------------------------------------------------------
# Create the trim_seqs directory and use cutadapt to cut the primers off of
# the sequences. Trimmed and untrimmed reads go to separate files; cutadapt's
# report goes to <barcode>_trim.log.
mkdir -p trim_seqs

conda activate cutadapt
for barcode in "${barcodes[@]}"; do
  echo "$barcode"
  cutadapt -j20 \
    -e 0.20 --revcomp --minimum-length 200 \
    -g TGTTGCGCAGGTYTGTGT...TCGTGYTTYTGCCTATGYTGC \
    -g TGTTGCGCAGGTYTGTGT...TTCYTCGTGYTTYTGCCTATG \
    -o "trim_seqs/${barcode}_trimmed.fastq.gz" \
    --untrimmed-output "trim_seqs/${barcode}_untrimmed.fastq.gz" \
    "${barcode}_merged.fastq.gz" \
    > "trim_seqs/${barcode}_trim.log" 2>&1
done

# --- Mapping against all reference sequences --------------------------------
# NOTE(review): part of this step was lost from the source text (everything
# between the end of the cutadapt loop and the flagstat output redirection).
# The mkdir, minimap2 | samtools sort and samtools index commands below are
# reconstructed from the later re-alignment steps of this protocol. The
# location of HEV_reference.fasta and the use of the trimmed reads as input
# are assumptions -- confirm against the original protocol.
reference_fasta=HEV_reference.fasta
mkdir -p Minimap_Vs_All_Ref_Seqs

conda activate minimap2
for barcode in "${barcodes[@]}"; do
  echo "$barcode"
  bam="Minimap_Vs_All_Ref_Seqs/alignment_${barcode}.bam"
  minimap2 --secondary=no -t18 -ax map-ont \
    "$reference_fasta" "trim_seqs/${barcode}_trimmed.fastq.gz" \
    | samtools sort -@18 -o "$bam"
  samtools index -@ 18 "$bam"
  samtools flagstat "$bam" \
    > "Minimap_Vs_All_Ref_Seqs/alignment_${barcode}_flagstat.report"
  # Keep rows with a non-zero 4th column, highest 4th column first.
  samtools coverage "$bam" \
    | awk '$4 >0' \
    | sort -rnk4 \
    > "Minimap_Vs_All_Ref_Seqs/alignment_${barcode}_coverage.report"
done
# This gives us a bam, bam.bai, coverage report, and flagstat report for each
# barcode.

# --- Read correction per reference ------------------------------------------
# Create two more directories for creating our initial consensus sequences.
mkdir -p assemble/intermediate_files

# Create the consensus sequences for sequences with more than $min_reads reads:
# the reads mapped to each such reference are extracted and corrected by canu.
for barcode in "${barcodes[@]}"; do
  echo "$barcode"
  report="Minimap_Vs_All_Ref_Seqs/alignment_${barcode}_coverage.report"
  references=()
  while IFS= read -r name; do
    references+=("$name")
  done < <(references_above "$report" "$min_reads")

  for reference in "${references[@]}"; do
    reads="assemble/intermediate_files/${barcode}_${reference}.fastq"
    assembly_dir="assemble/intermediate_files/${barcode}_${reference}_assembly"

    # Rebuild FASTQ records from SAM columns 1 (name), 10 (sequence) and
    # 11 (qualities) of the alignments to this reference.
    conda activate minimap2
    samtools view "Minimap_Vs_All_Ref_Seqs/alignment_${barcode}.bam" "$reference" \
      | awk '{ print "@" $1 "\n" $10 "\n+\n" $11 }' \
      > "$reads"

    # A failure for one reference is reported but does not stop the others;
    # the contig step below skips references without corrected reads.
    conda activate canu
    if ! canu -p "${barcode}_${reference}_assembly" -d "$assembly_dir" -correct \
      -nanopore "$reads" \
      genomeSize=1k \
      corOutCoverage=1000000 \
      useGrid=false \
      maxThreads=20 maxMemory=100 \
      minReadLength=200 minOverlapLength=100 corMinCoverage=30 \
      corMhapOptions="--threshold 0.8 --ordered-sketch-size 1000 --ordered-kmer-size 14" \
      correctedErrorRate=0.105 \
      > "${assembly_dir}.log" 2>&1; then
      warn "canu failed for ${barcode} / ${reference}; see ${assembly_dir}.log"
    fi
  done
done

# --- Draft contigs and alignment --------------------------------------------
# NOTE(review): the beginning of this step was lost from the source text. The
# relative paths that survive (../Minimap_Vs_All_Ref_Seqs, ../HEV2mapping.lst,
# intermediate_files/, contigs/) show that it runs from inside assemble/ and
# writes to contigs/, so the `cd` and `mkdir` below are reconstructed --
# confirm against the original protocol.
cd assemble || exit 1
mkdir -p contigs

for barcode in "${barcodes[@]}"; do
  echo "$barcode"
  report="../Minimap_Vs_All_Ref_Seqs/alignment_${barcode}_coverage.report"
  references=()
  while IFS= read -r name; do
    references+=("$name")
  done < <(references_above "$report" "$min_reads")

  # Start from a clean file; -f because it does not exist on the first run.
  rm -f -- "contigs/${barcode}_seqs.fasta"

  for reference in "${references[@]}"; do
    assembly="${barcode}_${reference}_assembly"
    corrected="intermediate_files/${assembly}/${assembly}.correctedReads.fasta.gz"
    contig="contigs/${barcode}_${reference}_contigs.fasta"
    if [[ ! -f "$corrected" ]]; then
      warn "no corrected reads for ${barcode} / ${reference}; skipping"
      continue
    fi

    # Take the first corrected read of at least 200 bases (first two output
    # lines) and name it <barcode>_<reference>. awk consumes the whole stream,
    # so seqtk is never killed by a closed pipe, and the header is replaced
    # directly rather than through a sed pattern built from the read name.
    conda activate seqtk
    seqtk seq -L 200 "$corrected" \
      | awk -v name="${barcode}_${reference}" \
          'NR == 1 { print ">" name } NR == 2 { print }' \
      > "$contig"
    cat "$contig" >> "contigs/${barcode}_seqs.fasta"
  done

  if [[ ! -s "contigs/${barcode}_seqs.fasta" ]]; then
    warn "no contigs for ${barcode}; skipping alignment"
    continue
  fi

  conda activate mafft
  mafft --adjustdirection --reorder "contigs/${barcode}_seqs.fasta" \
    > "contigs/${barcode}_aligned.fasta"
done
# ---------------------------------------------------------------------------
# Manual curation of the draft consensus sequences (done in Ugene, not here)
# ---------------------------------------------------------------------------
#  1. Download the aligned.fasta files from the contigs folder to a local PC.
#  2. Open Ugene (Windows version 40).
#  3. Click open file and select the alignment files.
#  4. Where necessary, when opening the alignment, select the 'Join sequences
#     into alignment and open in multiple alignment viewer' option.
#  5. Trim the beginning and ends of the sequences if there are lots of gaps
#     or the sequences are too long.
#  6. Go to the Actions tab > Statistics > Generate similarity matrix. It will
#     compare the sequences together in a Hamming dissimilarity matrix.
#  7. If sequence counts are 10 or below, then the sequences are very similar
#     and you should only keep one of them.
#  8. If sequences have been removed due to similarities, remove the relevant
#     sequences from the seqs.fasta files in the contigs directory.
#  9. Rename the sequences in the seqs.fasta files to seq1, seq2, seq3 etc.
#
# ---------------------------------------------------------------------------
# Re-alignment, medaka polishing and variant calling
# ---------------------------------------------------------------------------
# Everything below runs from inside the assemble folder: the curated contigs
# are read from ./contigs and the barcode list from ../HEV2mapping.lst.

# Stop at the first failing step so later steps never run on incomplete output.
# NOTE(review): -u is deliberately not set; conda activation scripts may
# reference unset variables -- confirm before tightening.
set -eo pipefail

# Load conda's shell functions once so that `conda activate` works in a script.
eval "$(conda shell.bash hook)"

if [[ ! -d contigs || ! -f ../HEV2mapping.lst ]]; then
  printf 'error: run this part from inside the assemble folder\n' >&2
  exit 1
fi

# Read the barcode list once, skipping empty lines.
list_entries=()
while read -r entry || [[ -n "$entry" ]]; do
  if [[ -n "$entry" ]]; then
    list_entries+=("$entry")
  fi
done < ../HEV2mapping.lst

#######################################
# Turn a list entry into a barcode name by stripping a leading "./" and a
# trailing "/" literally (the previous sed 's:./::g' treated "." as a
# wildcard and could eat the last character of the name).
# Arguments:
#   $1 - entry from HEV2mapping.lst
# Outputs:
#   barcode name on stdout
#######################################
barcode_of() {
  local name=${1#./}
  printf '%s\n' "${name%/}"
}

#######################################
# Map reads against a FASTA file with minimap2 and write a sorted, indexed
# BAM plus flagstat and coverage reports.
# Arguments:
#   $1 - FASTA file to map against
#   $2 - FASTQ file with the reads
#   $3 - output directory (must exist)
#   $4 - barcode name used in the output file names
# Outputs:
#   $3/alignment_$4.bam (and its index),
#   $3/alignment_$4_flagstat.report,
#   $3/alignment_$4_coverage.report
#######################################
map_reads_and_report() {
  local target=$1
  local reads=$2
  local out_dir=$3
  local barcode=$4
  local bam="${out_dir}/alignment_${barcode}.bam"

  minimap2 --secondary=no -t18 -ax map-ont "$target" "$reads" \
    | samtools sort -@18 -o "$bam"
  samtools index -@ 18 "$bam"
  samtools flagstat "$bam" > "${out_dir}/alignment_${barcode}_flagstat.report"
  # Keep rows with a non-zero 4th column, highest 4th column first.
  samtools coverage "$bam" \
    | awk '$4 >0' \
    | sort -rnk4 \
    > "${out_dir}/alignment_${barcode}_coverage.report"
}

# --- Re-align the merged reads against the chosen sequences ------------------
# Create a new directory in the assemble folder.
mkdir -p align_against_consensus_seqs

conda activate minimap2
for folder in "${list_entries[@]}"; do
  barcode=$(barcode_of "$folder")
  echo "$folder"
  echo "$barcode"
  map_reads_and_report "./contigs/${barcode}_seqs.fasta" \
    "../${barcode}_merged.fastq.gz" align_against_consensus_seqs "$barcode"
done

# --- Create consensus sequences using medaka ---------------------------------
# The loops below write to medaka_consensus/ and medaka_contigs/ relative to
# the assemble folder, so the directories are created here and not under a
# nested assemble/ path.
mkdir -p medaka_consensus medaka_contigs

conda activate medaka_cpu
for folder in "${list_entries[@]}"; do
  barcode=$(barcode_of "$folder")
  echo "$folder"
  echo "$barcode"
  medaka_consensus -i "../trim_seqs/${barcode}_trimmed.fastq.gz" \
    -d "contigs/${barcode}_seqs.fasta" \
    -o "medaka_consensus/${barcode}_consensus" \
    -t 20 -m r941_min_high_g360
  cp -- "medaka_consensus/${barcode}_consensus/consensus.fasta" \
    "medaka_contigs/${barcode}_polished_seqs.fasta"
done

# --- Use medaka again to do variant calling for SNPs -------------------------
# Create a new folder in the assemble directory.
mkdir -p medaka_variant_calling

for folder in "${list_entries[@]}"; do
  barcode=$(barcode_of "$folder")
  echo "$folder"
  echo "$barcode"

  # Map the trimmed reads back onto the polished sequences.
  conda activate minimap2
  map_reads_and_report "./medaka_contigs/${barcode}_polished_seqs.fasta" \
    "../trim_seqs/${barcode}_trimmed.fastq.gz" medaka_variant_calling "$barcode"

  conda activate medaka_cpu
  medaka_variant -i "medaka_variant_calling/alignment_${barcode}.bam" \
    -m r941_min_high_g360 -s r941_min_high_g360 \
    -f "medaka_contigs/${barcode}_polished_seqs.fasta" \
    -o "medaka_variant_calling/${barcode}_variants"
done

# We would then view the vcf files and see which ones have passed the quality
# score, to see which SNPs are reliable and which are likely to be artifacts.