# Online Resource 4: Hepatitis E virus alignment mapping bioinformatics pipeline

#Having performed high accuracy basecalling using the MK1C MinION
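#Create an alignment fasta file containing reference sequences - download from NCBI - and save it as
#HEV_reference.fasta (the minimap2 alignment step below reads it by that name from inside the basecalling folder)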

#Go to the folder containing high accuracy basecalling data and create a set of summary graphs about the data
#Every relative path used up to the later "cd assemble" step is resolved from this folder
cd basecalling
#Load the conda shell hook so "conda activate" also works in a non-interactive shell
#(the same hook the later steps of this pipeline load)
eval "$(conda shell.bash hook)"
#One-off: create a dedicated environment with a pinned NanoPlot version
conda create --name=nanoplot nanoplot=1.32.1
conda activate nanoplot
#Plot the run statistics from the basecaller's sequencing summary using 20 threads;
#--loglength adds log-transformed read-length plots, -o names the output directory
NanoPlot -t 20 --summary sequencing_summary.txt --loglength -o summary-plots-log-transformed
conda deactivate

#Create the list of barcodes to analyse
#Type one barcode folder name per line (e.g. barcode04), then save and exit the editor.
#The "while read" loops below read this file line by line and use each line to build
#file names such as <barcode>_merged.fastq.gz.
nano HEV2mapping.lst

#Then concatenate (combine) the high accuracy basecalled fastq files into one merged file
#Every directory named barcode* in the current folder is treated as one barcode.
#gzip files can be joined with plain "cat": the result is still a valid (multi-member) gzip file.
for folder in ./barcode*/
do
        #If nothing matched, the pattern is left unexpanded and is not a directory
        [[ -d "$folder" ]] || continue

        #./barcode01/ -> ./barcode01 -> barcode01
        folder=${folder%/}
        barcode=${folder#./}
        echo "$folder"
        echo "$barcode"

        #Collect the inputs first so that a barcode folder without fastq.gz files
        #is skipped instead of leaving an empty merged file behind
        parts=("$folder"/*fastq.gz)
        if [[ ! -e "${parts[0]}" ]]
        then
                echo "No fastq.gz files found in $folder - skipping" >&2
                continue
        fi

        cat -- "${parts[@]}" > "${barcode}_merged.fastq.gz"
done

#This gives a merged fastq file for each barcode
# create the trim_seqs directory (-p: no error if it is already there from an earlier run)
mkdir -p trim_seqs
#Use cutadapt to cut the primers off of the sequences
#Load the conda shell hook and the cutadapt environment once, not on every iteration
eval "$(conda shell.bash hook)"
conda activate cutadapt
#The barcode list is read on file descriptor 3 so that no program started inside the loop
#can consume the remaining lines from standard input.
#"|| [[ -n ... ]]" keeps the last barcode even when the list has no trailing newline.
while read -r -u 3 folderName || [[ -n "$folderName" ]]
do
	#Accept "barcode04", "./barcode04" or "barcode04/" in the list
	barcode=${folderName#./}
	barcode=${barcode%/}
	#Skip blank lines
	[[ -n "$barcode" ]] || continue
	echo "$barcode"

	if [[ ! -s "${barcode}_merged.fastq.gz" ]]
	then
		echo "${barcode}_merged.fastq.gz is missing or empty - skipping" >&2
		continue
	fi

	#-j20                  use 20 CPU cores
	#-e 0.20               allow an error rate of up to 20% when matching a primer
	#--revcomp             also test the reverse complement of every read
	#--minimum-length 200  discard reads shorter than 200 bases after trimming
	#-g SEQ1...SEQ2        linked adapter (5' sequence ... 3' sequence); two 3' variants are given
	#Reads without a primer match are written to the untrimmed file; cutadapt's report goes to the trim log
	cutadapt -j20 \
		-e 0.20 --revcomp --minimum-length 200 \
		-g TGTTGCGCAGGTYTGTGT...TCGTGYTTYTGCCTATGYTGC \
		-g TGTTGCGCAGGTYTGTGT...TTCYTCGTGYTTYTGCCTATG \
		-o "trim_seqs/${barcode}_trimmed.fastq.gz" \
		--untrimmed-output "trim_seqs/${barcode}_untrimmed.fastq.gz" \
		"${barcode}_merged.fastq.gz" > "trim_seqs/${barcode}_trim.log" 2>&1
done 3<HEV2mapping.lst

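#This gives a "trim log", "trimmed" and "untrimmed" file for each barcode. Reads shorter than 200 bases after trimming are discarded; add --too-short-output <file> to the cutadapt command if a "too short" file is also wanted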

#View some interesting read results for each barcode, using this command (e.g.):
#Prints the first 20 lines of the cutadapt report for two example barcodes
for example in barcode04 barcode07
do
        head -n 20 trim_seqs/${example}_trim.log
done

#Upload the HEV_reference.fasta file that we need to do the alignment
#Make a directory to store alignments (-p: no error if it already exists)
mkdir -p Minimap_Vs_All_Ref_Seqs/

#Align the sequences against the reference sequences for HEV (from Smith et al 2021)
#Load the conda shell hook and the minimap2 environment once; samtools is run from that same environment
eval "$(conda shell.bash hook)"
conda activate minimap2

#Every directory named barcode* in the current folder is treated as one barcode
for folder in ./barcode*/
do
        #If nothing matched, the pattern is left unexpanded and is not a directory
        [[ -d "$folder" ]] || continue

        #./barcode01/ -> ./barcode01 -> barcode01
        folder=${folder%/}
        barcode=${folder#./}
        echo "$folder"
        echo "$barcode"

        if [[ ! -s "${barcode}_merged.fastq.gz" ]]
        then
                echo "${barcode}_merged.fastq.gz is missing or empty - skipping" >&2
                continue
        fi

        bam="Minimap_Vs_All_Ref_Seqs/alignment_${barcode}.bam"

        #Map the merged reads with the Nanopore preset, without secondary alignments,
        #and write a coordinate-sorted BAM. pipefail is enabled only inside the subshell
        #so that a minimap2 failure is not hidden by a successful samtools sort.
        if ! (
                set -o pipefail
                minimap2 --secondary=no -t18 -ax map-ont HEV_reference.fasta "${barcode}_merged.fastq.gz" |
                        samtools sort -@18 -o "$bam"
        )
        then
                echo "Alignment failed for ${barcode} - skipping its reports" >&2
                continue
        fi

        samtools index -@ 18 "$bam"
        samtools flagstat "$bam" > "Minimap_Vs_All_Ref_Seqs/alignment_${barcode}_flagstat.report"
        #Keep the rows whose 4th column (number of reads) is above 0, highest read count first
        samtools coverage "$bam" | awk '$4 >0' | sort -rnk4 > "Minimap_Vs_All_Ref_Seqs/alignment_${barcode}_coverage.report"

done

#This gives us a bam, bam.bai, coverage report, and flagstat report for each barcode

#Create two more directories for creating our initial consensus sequences
#(-p creates assemble/ as well and does not complain if the directories already exist)
mkdir -p assemble/intermediate_files/
#Create the consensus sequences for sequences with >1000 reads. Possible to change this "baseline" parameter depending on what is deemed acceptable.
#Load the conda shell hook once; the environments are switched inside the loop
eval "$(conda shell.bash hook)"
#The barcode list is read on file descriptor 3 so that no program started inside the loop
#can consume the remaining lines from standard input.
#"|| [[ -n ... ]]" keeps the last barcode even when the list has no trailing newline.
while read -r -u 3 folderName || [[ -n "$folderName" ]]
do
        #Accept "barcode04", "./barcode04" or "barcode04/" in the list
        barcode=${folderName#./}
        barcode=${barcode%/}
        #Skip blank lines
        [[ -n "$barcode" ]] || continue
        echo "$barcode"

        report="Minimap_Vs_All_Ref_Seqs/alignment_${barcode}_coverage.report"
        if [[ ! -f "$report" ]]
        then
                echo "${report} not found - skipping ${barcode}" >&2
                continue
        fi

        #Names (column 1) of the references with more than 1000 reads (column 4);
        #lines starting with "#" (the report header) are ignored
        references=$(awk -F'\t' '$1 !~ /^#/ && $4 > 1000 { print $1 }' "$report")

        #Word splitting of $references is intentional: one reference name per word
        for reference in $references
        do
                reads="assemble/intermediate_files/${barcode}_${reference}.fastq"
                assembly="${barcode}_${reference}_assembly"

                #Pull the reads aligned to this reference out of the indexed BAM and rebuild
                #fastq records from SAM columns 1 (name), 10 (sequence) and 11 (qualities)
                conda activate minimap2
                samtools view "Minimap_Vs_All_Ref_Seqs/alignment_${barcode}.bam" "$reference" | awk '{ print "@" $1 "\n" $10 "\n+\n" $11 }' > "$reads"

                #Run only canu's read-correction stage (-correct) on those reads;
                #all canu output, including errors, is captured in the per-reference log
                conda activate canu
                canu -p "$assembly" -d "assemble/intermediate_files/${assembly}" -correct \
                -nanopore "$reads" \
                genomeSize=1k \
                corOutCoverage=1000000 \
                useGrid=false \
                maxThreads=20 maxMemory=100 \
                minReadLength=200 minOverlapLength=100 corMinCoverage=30 \
                corMhapOptions="--threshold 0.8 --ordered-sketch-size 1000 --ordered-kmer-size 14" correctedErrorRate=0.105 > "assemble/intermediate_files/${assembly}.log" 2>&1 ||
                        echo "canu failed for ${barcode} / ${reference} - see assemble/intermediate_files/${assembly}.log" >&2

        done
done 3<HEV2mapping.lst

#This has given us a folder for each consensus generated for each barcode
#It has also given us a list of the consensus fasta files for each barcode and a log of the assembly for each consensus

#Make a contigs folder within assemble folder (skipped quietly if it is already there)
[[ -d assemble/contigs ]] || mkdir assemble/contigs
#cd into assemble directory
cd assemble
#Pick out consensus sequences from error-corrected reads using canu and map them against the reference database
#For every barcode: take one corrected read per reference as its contig, rename it to
#<barcode>_<reference>, collect the contigs in contigs/<barcode>_seqs.fasta and align them with mafft.
#Load the conda shell hook once; the environments are switched inside the loop
eval "$(conda shell.bash hook)"
#The barcode list is read on file descriptor 3 so that no program started inside the loop
#can consume the remaining lines from standard input.
#"|| [[ -n ... ]]" keeps the last barcode even when the list has no trailing newline.
while read -r -u 3 folderName || [[ -n "$folderName" ]]
do
    #Accept "barcode04", "./barcode04" or "barcode04/" in the list
    barcode=${folderName#./}
    barcode=${barcode%/}
    #Skip blank lines
    [[ -n "$barcode" ]] || continue
    echo "$barcode"

    report="../Minimap_Vs_All_Ref_Seqs/alignment_${barcode}_coverage.report"
    if [[ ! -f "$report" ]]
    then
        echo "${report} not found - skipping ${barcode}" >&2
        continue
    fi

    #Names (column 1) of the references with more than 1000 reads (column 4);
    #lines starting with "#" (the report header) are ignored
    references=$(awk -F'\t' '$1 !~ /^#/ && $4 > 1000 { print $1 }' "$report")

    #Start from scratch so that a re-run does not append duplicates (-f: fine if the file is absent)
    rm -f -- "contigs/${barcode}_seqs.fasta"

    conda activate seqtk
    #Word splitting of $references is intentional: one reference name per word
    for reference in $references
    do
        corrected="intermediate_files/${barcode}_${reference}_assembly/${barcode}_${reference}_assembly.correctedReads.fasta.gz"
        contig="contigs/${barcode}_${reference}_contigs.fasta"

        if [[ ! -s "$corrected" ]]
        then
            echo "${corrected} is missing or empty - skipping ${reference}" >&2
            continue
        fi

        #Take the first corrected read of at least 200 bases (header line + sequence line) and
        #replace its header with <barcode>_<reference>. The header is rewritten by awk rather than
        #substituted into a sed pattern, so read names containing "/" or regex characters are safe.
        seqtk seq -L 200 "$corrected" |
            awk -v name="${barcode}_${reference}" 'NR == 1 { print ">" name; next } NR == 2 { print; exit }' > "$contig"

        if [[ ! -s "$contig" ]]
        then
            echo "No corrected read of at least 200 bases for ${barcode} / ${reference}" >&2
            continue
        fi

        cat -- "$contig" >> "contigs/${barcode}_seqs.fasta"
    done

    if [[ ! -s "contigs/${barcode}_seqs.fasta" ]]
    then
        echo "No contigs collected for ${barcode} - skipping the mafft alignment" >&2
        continue
    fi

    #Align this barcode's contigs with each other; --adjustdirection lets mafft
    #reverse-complement sequences where needed, --reorder groups similar sequences together
    conda activate mafft
    mafft --adjustdirection --reorder "contigs/${barcode}_seqs.fasta" > "contigs/${barcode}_aligned.fasta"
done 3<../HEV2mapping.lst


#Download the aligned.fasta files from the contigs folder to local pc
#Open Ugene (Windows version 40)
#Click open file and select the alignment files
#Where necessary, when opening the alignment, select the 'Join sequences into alignment and open in multiple alignment viewer' option
#Trim the beginning and ends of the sequences if there are lots of gaps or the sequences are too long
#Go to the Actions tab > Statistics > Generate similarity matrix
#It will compare the sequences together in a hamming dissimilarity matrix
#If the dissimilarity count between two sequences is 10 or below, then they are very similar and you should only keep one of them
#If sequences have been removed due to similarities, remove the relevant sequences from the seqs.fasta files in the contigs directory
#Rename the sequences in the seqs.fasta file to seq1, seq2, seq3 etc.

#Create a new directory in the assemble folder (-p: no error if it already exists)
mkdir -p align_against_consensus_seqs

#To realign the chosen sequences against the reference again, use:
#Here the merged reads of each barcode are mapped against that barcode's own contigs/<barcode>_seqs.fasta
#Load the conda shell hook and the minimap2 environment once; samtools is run from that same environment
eval "$(conda shell.bash hook)"
conda activate minimap2
#The barcode list is read on file descriptor 3 so that no program started inside the loop
#can consume the remaining lines from standard input.
#"|| [[ -n ... ]]" keeps the last barcode even when the list has no trailing newline.
while read -r -u 3 folder || [[ -n "$folder" ]]
do
    #Accept "barcode04", "./barcode04" or "barcode04/" in the list
    barcode=${folder#./}
    barcode=${barcode%/}
    #Skip blank lines
    [[ -n "$barcode" ]] || continue
    echo "$folder"
    echo "$barcode"

    if [[ ! -s "contigs/${barcode}_seqs.fasta" ]]
    then
        echo "contigs/${barcode}_seqs.fasta is missing or empty - skipping ${barcode}" >&2
        continue
    fi

    bam="align_against_consensus_seqs/alignment_${barcode}.bam"

    #pipefail is enabled only inside the subshell so that a minimap2 failure
    #is not hidden by a successful samtools sort
    if ! (
        set -o pipefail
        minimap2 --secondary=no -t18 -ax map-ont "./contigs/${barcode}_seqs.fasta" "../${barcode}_merged.fastq.gz" |
            samtools sort -@18 -o "$bam"
    )
    then
        echo "Alignment failed for ${barcode} - skipping its reports" >&2
        continue
    fi

    samtools index -@ 18 "$bam"
    samtools flagstat "$bam" > "align_against_consensus_seqs/alignment_${barcode}_flagstat.report"
    #Keep the rows whose 4th column (number of reads) is above 0, highest read count first
    samtools coverage "$bam" | awk '$4 >0' | sort -rnk4 > "align_against_consensus_seqs/alignment_${barcode}_coverage.report"

done 3<../HEV2mapping.lst

#Make the following directories:
#The working directory is already assemble/ (see the earlier "cd assemble"; the loop below uses ../ paths),
#so the directories are created relative to it. -p: no error if they already exist.
mkdir -p medaka_consensus/ medaka_contigs/

#Create consensus sequences using medaka:
#Load the conda shell hook and the medaka environment once, not on every iteration
eval "$(conda shell.bash hook)"
conda activate medaka_cpu
#The barcode list is read on file descriptor 3 so that no program started inside the loop
#can consume the remaining lines from standard input.
#"|| [[ -n ... ]]" keeps the last barcode even when the list has no trailing newline.
while read -r -u 3 folder || [[ -n "$folder" ]]
do
        #Accept "barcode04", "./barcode04" or "barcode04/" in the list
        barcode=${folder#./}
        barcode=${barcode%/}
        #Skip blank lines
        [[ -n "$barcode" ]] || continue
        echo "$folder"
        echo "$barcode"

        #Polish the chosen contigs (-d) with the primer-trimmed reads (-i) using the given medaka model (-m);
        #results are written to medaka_consensus/<barcode>_consensus
        medaka_consensus -i "../trim_seqs/${barcode}_trimmed.fastq.gz" -d "contigs/${barcode}_seqs.fasta" -o "medaka_consensus/${barcode}_consensus" -t 20 -m r941_min_high_g360

        #Keep a copy of the polished sequences under a per-barcode name
        if [[ -s "medaka_consensus/${barcode}_consensus/consensus.fasta" ]]
        then
                cp -- "medaka_consensus/${barcode}_consensus/consensus.fasta" "medaka_contigs/${barcode}_polished_seqs.fasta"
        else
                echo "medaka produced no consensus.fasta for ${barcode}" >&2
        fi

done 3<../HEV2mapping.lst

#Create a new folder in the assemble directory (-p: no error if it already exists)
mkdir -p medaka_variant_calling/

#Use medaka again to do variant calling for snps
#Load the conda shell hook once; the environments are switched inside the loop
eval "$(conda shell.bash hook)"
#The barcode list is read on file descriptor 3 so that no program started inside the loop
#can consume the remaining lines from standard input.
#"|| [[ -n ... ]]" keeps the last barcode even when the list has no trailing newline.
while read -r -u 3 folder || [[ -n "$folder" ]]
do
        #Accept "barcode04", "./barcode04" or "barcode04/" in the list
        barcode=${folder#./}
        barcode=${barcode%/}
        #Skip blank lines
        [[ -n "$barcode" ]] || continue
        echo "$folder"
        echo "$barcode"

        polished="medaka_contigs/${barcode}_polished_seqs.fasta"
        bam="medaka_variant_calling/alignment_${barcode}.bam"

        if [[ ! -s "$polished" ]]
        then
                echo "${polished} is missing or empty - skipping ${barcode}" >&2
                continue
        fi

        #Map the primer-trimmed reads back onto the polished sequences.
        #pipefail is enabled only inside the subshell so that a minimap2 failure
        #is not hidden by a successful samtools sort
        conda activate minimap2
        if ! (
                set -o pipefail
                minimap2 --secondary=no -t18 -ax map-ont "./${polished}" "../trim_seqs/${barcode}_trimmed.fastq.gz" |
                        samtools sort -@18 -o "$bam"
        )
        then
                echo "Alignment failed for ${barcode} - skipping variant calling" >&2
                continue
        fi

        samtools index -@ 18 "$bam"
        samtools flagstat "$bam" > "medaka_variant_calling/alignment_${barcode}_flagstat.report"
        #Keep the rows whose 4th column (number of reads) is above 0, highest read count first
        samtools coverage "$bam" | awk '$4 >0' | sort -rnk4 > "medaka_variant_calling/alignment_${barcode}_coverage.report"

        #Call variants from that alignment against the polished sequences (-f);
        #output goes to medaka_variant_calling/<barcode>_variants
        #NOTE(review): the -m/-s model options depend on the installed medaka version - confirm
        conda activate medaka_cpu
        medaka_variant -i "$bam" -m r941_min_high_g360 -s r941_min_high_g360 -f "$polished" -o "medaka_variant_calling/${barcode}_variants"
done 3<../HEV2mapping.lst

#We would then view the vcf files and see which ones have passed the quality score to see which snps are reliable and which are likely to be artifacts