######### 2019-04-10 ######### Contact: early@broadinstitute.org ######### ######### ######### ## Script for calculating pi (pairwise nucleotide diversity) per genomic region (eg, gene, transcript) from an allele frequency file. ## Takes output of pop_gen_stats_per_nt.pl as input ## use warnings; use strict; use POSIX; use Fcntl qw(:flock); my $gene = ""; ## Gene/transcript name or ID my $chrom= ""; ## Chromosome name (not used in calculations) my $trans_site_number = ; ## Number sites of class $snp_eff_code in $gene my $snp_eff_code = ""; ## missense_variant OR synonymous_variant; Originally from column 5 of INPUT_example.txt my @populations = (qw//); # List of population names. MUST BE IN SAME ORDER AS IN INPUT FILE my $min_coverage = ; # Minimum proportion of sites with at least 0.25 samples having a call. Default suggestion: 0.9 my @coverage = (); #For each population in @populations, the estimated proportion of sites covered by sequencing data. For low coverage genomes, this is used to correct estimates across regions for the absence of variant calls do to low coverage. MUST BE IN SAME ORDER AS POPS IN @populations ARRAY my $input_file = ""; ## Input file in format produced by script pop_gen_stats_per_nt.pl. ORIGINAL HEADERS MUST BE INTACT my $out_file = ""; ## Output file name ### Calculate "corrected length" for gene based on estimates of total coverage my %WL; foreach my $index (0..$#populations) { my $pop = $populations[$index]; my $cover = $coverage[$index]; my $corrected_site_number = $trans_site_number*$cover; $WL{$pop} = $corrected_site_number; } #### Make output file header open OUT, ">$out_file" or die; print OUT "GeneID\tChrom\tVarClass\tTotalSites"; foreach my $pop (@populations) { print OUT "\t$pop"."_S\t$pop"."_pi"; } print OUT "\n"; print OUT "$gene\t$chrom\t$snp_eff_code\t$trans_site_number"; my @fst_parts_columns; my @pi_columns; my @ThetaW_columns; my @Sample_Coverage_columns; my @Alt_Count_columns; open SNPS, "$input_file" or (print "No variant file $input_file for gene $gene\n" and die); my %fst_a_sum = my %fst_b_sum = my %fst_c_sum = my %snp_count = my %pi_sum = my %ThetaW_sum = my %a1_sum = my %a2_sum = my %b1_sum = my %b2_sum = my %c2_sum = my %ThetaH_sum = my %ThetaL_sum = my %FayWuN_sum = my %bn1_sum = my %vd_sum = my %ud_sum = my %ne_sum = my %KST_sum = my %Snn_sum = my %Sample_Coverage = (); my $effect_col; my $filter_col; my $total_snps = 0; while () { chomp; my @dl2 = split("\t", $_); if ($.==1) { foreach my $col (0..$#dl2) { if ($dl2[$col] =~ /_PARTS/) { @fst_parts_columns = (@fst_parts_columns, $col); } if ($dl2[$col] =~ /_pi/) { @pi_columns = (@pi_columns, $col); } if ($dl2[$col] =~ /CALLS_/) { @Sample_Coverage_columns = (@Sample_Coverage_columns, $col); } if ($dl2[$col] =~ /ALT_COUNT_/) { @Alt_Count_columns = (@Alt_Count_columns, $col); } if ($dl2[$col] =~ /FILTER/) { $filter_col = $col; } if ($dl2[$col] =~ /EFFECT/) { $effect_col = $col; } } next; } if (($dl2[$filter_col] ne "PASS") || ($dl2[$effect_col] ne $snp_eff_code)) { next; } ## Count number of segregating sites ++$total_snps; ## Make Hash of pi parts foreach my $pi (@pi_columns) { if (!exists $pi_sum{$pi}){ $pi_sum{$pi} = 0; $snp_count{$pi} = 0; } if ($dl2[$pi] ne "NA") { $pi_sum{$pi} = ($pi_sum{$pi} + $dl2[$pi]); if ($dl2[$pi] > 0) { ++$snp_count{$pi}; } } } ## Count cumulative sample coverage across the sites foreach my $index (0..$#pi_columns) { my $pop = $Sample_Coverage_columns[$index]; if (!exists $Sample_Coverage{$pop}) { $Sample_Coverage{$pop} = 0; } if ($dl2[$pi_columns[$index]] ne "NA") { if ($dl2[$pi_columns[$index]] > 0) { $Sample_Coverage{$pop} = $Sample_Coverage{$pop} + $dl2[$pop]; } } } } close SNPS; foreach my $index (0..$#pi_columns) { ## cycle through individual population statistics if ($WL{$populations[$index]} == 0) { print OUT "\tNA\tNA\tNA\tNA"; next; } my $avg_pi = 0; my $ThetaW = 0; my $TD = "NA"; ## Calculate Average pi my $pi_col = $pi_columns[$index]; my $Watterson_col = $ThetaW_columns[$index]; my $coverage_col = $Sample_Coverage_columns[$index]; if (!exists $pi_sum{$pi_col}) { $pi_sum{$pi_col} = 0; } $avg_pi = ($pi_sum{$pi_col}/$WL{$populations[$index]}); ## Print individual population stats if ($coverage[$index] > $min_coverage) { print OUT "\t$snp_count{$pi_col}\t$avg_pi"; } else { print OUT "\tNA\tNA"; } } print OUT "\n"; close OUT;