#!perl -w use strict; use Math::Matrix; $" = ","; ############################################################################################################### # variable declarations ############################################################################################################### my ( $data_set, # name of file containing the original data set $map_order, # name of file containing markers in map order $uni_file, # name of file containing the redundancy-filtered data set $n_samples, # number of samples in data set $rec_threshold, # recombination fraction threshold @breakpoint, # array of marker positions immediately before breack points $n_u_samples, # number of unique samples in data set @sample_list, # list of unique samples %cons_red_samples, # hash: replicated samples => score consensus $marker_name, # name of column heading containing the marker names %marker_features, # hash: markers => marker features $n_markers, # number of markers $n_u_markers, # number of unique markers @marker_list, # list of unique markers %cons_red_markers, # hash: replicated markers => score consensus $score_type, # type of scores (0/1 or A/B/C/D/H) %score, # hash of hashes: samples => markers => scores @map_order, # array of markers arranged according to map order @group, # array of linkage group names assoicated with markers in the @map_order array @group_order, # array of markers belonging to a single linkage group arranged according to map order @group_mapdist, # cumulative Kosambi map distances for markers of currently processed linkage groups @cumul_mapdist, # cumulative Kosambi map distances of all markers, computed by simply adding distances of adjacent markers (simulates MapManager output) @opt_group_mapdist, # optimised cumulative Kosambi map distances for markers of currently processed linkage groups, computed using the least-square fitting procedure used by JoinMap 3.0 (simulates JoinMap output) @opt_cumul_mapdist, # optimised cumulative Kosambi map distances, computed using the least-square fitting procedure used by JoinMap 3.0 (simulates JoinMap output) @rec_fract, # array of arrays of rows of cells of a matrix containing paiwaise recombination fractions @dist_matrix, # array of arrays of rows of cells of a matrix containing paiwaise Kosambi map distances @lod_matrix, # array of arrays of rows of cells of a matrix containing paiwaise LOD scores (computed as in JoinMap 3.0) @opt_mapdist, # array of optimised map distances (ordered according to map order) $total_length, # sum of map lengths of all linkage groups $sum_neg_mapdist, # sum of negative map distances between adjecent markers $row, # currently processed row of a matrix $cell, # currently processed cell of a row of a matrix $i, # looping index $start_group, # number of marker in map order at which the currently processed linkage group starts $end_group, # number of marker in map order at which the currently processed linkage group ends $tmp, # temporary variable $pop_type, # population type (at this stage only DH or selfed RILs) $stdev_cM, @n_scores, $h, $comp, %LOD_matrix, @data_file, $n_lines, %pop_type, $maporder_file, $population, @comp_matrix, @kosambi_list, @kosambi_row, $number, $sum_factor, $sum_wgt, @new_locus, @new_distance, $loop, @lod_list, @comp_list, @lod_row, @comp_row, $line, $j, %distance, @new_lod, @new_comp, %lod, %comp, $number_lines, $sum_comparisons, $lod, $k, $dist, %factor, %wgt, $rec, @missing, $round, @rowC, @orig_opt_dist, $iteration, %first_ori_dist, $exp, $ref, $min_exp, $neg_thresh, $locus, $incr, $max_exp, $quality, @adj_mapdist, %first_opt_dist, $cumul_factor, $sum, $gini, $previous, %opt_mapdist, %sum_neg_mapdist, %stdev_cM, %rel_stdev_cM, $min_lod, $max_rec, %stdev_zero, $most_negative, @opt_dist, $avg_zero_dist, $stdev_zero_dist, $original_avg_resid, $avg_resid, $from, $to, @dist_matrix_new, %stretch, @lod_matrix_new, @comp_matrix_new, $matrix_size, @original_dist, $most_neg, $rank, $avg_res, @subset_order, %found, @to_print, $cosegr, $negative, $found, $sum_squares, $removed, $sample, $original_length, %lines, $adjusted_length, %cM, $locus_i, $locus_j, $found_i, $found_j, $all_lines, $mean_rec, $var, @fitted, $pop, %cumul_stretch, %fitted_cM, @opt_dist_new, %cM_matrix, %cM_list, $sum_avg_dist, $avg, $first, $ratio, $avg_dev, @factor, @avg_dist, @wgt, $count, $min_stretch_rec, $gini_tolerance, @ratio, ); $" = ','; ############################################################################################################### # program settings ############################################################################################################### $rec_threshold = 35; # for initial splitting of within-population linkage groups into subgroups $min_lod = 1; # parameters used to compute map distances $max_rec = 35; $exp = 2; ############################################################################################################### # user input ############################################################################################################### print "\n\n\n\n\n"; print " \\\\\\\\\\\\ \\\\\\\\\\\\\n"; print " >>>>> DArTscript3 >>>>---- >>>>> DArTscript3 >>>>----\n"; print " ////// //////\n\n\n"; print " Copyright: Peter Wenzl 2005-06, DArT P/L (www.DiversityArrays.com)\n"; print "\n !!! WARNING !!!\n"; print " This is a purpose-built script written by someone\n"; print " without previous coding experience. There may now be a\n"; print " better version available. Please contact Peter Wenzl\n"; print " at peter\@DiversityArrays.com\n\n"; print " ------------------------------------------------------------------------\n"; print " [Map distances are calculated for a given order of loci from the\n"; print " segregation data of multiple populations. Only a single linkage group\n"; print " can be processed per run. Two types of input files are required. Files\n"; print " containing the segregation data of each of the populations should be\n"; print " tab or comma-separated text files. They should have a single row with\n"; print " the column headings. They can have multiple columns on the left side,\n"; print " at least one of which has to contain the names of the loci. The other\n"; print " type of file contains a single column (without column heading) with all\n"; print " loci of all populations arranged in consensus map order. User has the\n"; print " option to let the program flip the order of pairs of adjacent loci with\n"; print " negative distance estimates.]\n"; print " ------------------------------------------------------------------------\n"; # reads data sets of for a single linkage group acros several populations print "Files with data sets for a SINGLE linkage group across multiple populations:\n"; do { print " > "; chomp ( $_ = ); push @data_file, $_; if ( $data_file[ -1 ] eq "" ) { print " -> Hey! You've typed 'Enter' instead of a file name. Try again...\n"; pop @data_file; } if ( @data_file && @data_file == 1 && $data_file[ -1 ] =~ /\bdone\b/i ) { print " -> Hey! You haven't'yet provided a single file name! Try again...\n"; pop @data_file; } for ( @data_file[ 0 .. $#data_file - 1 ] ) { if ( $data_file[ -1 ] eq $_ ) { print " -> Hey! You've already typed this file name. Type another one...\n"; pop @data_file; } } } until ( @data_file && $data_file[ -1 ] =~ /\bdone\b/i ); pop @data_file; # asks user to provide information on population types print "\n > DH (\"d\") or selfed RIL (\"r\") populations?\n"; for ( @data_file ) { do { print " $_: "; chomp ( $pop_type{ $_ } = ); } until ( $pop_type{ $_ } =~ /^[dr]$/i ); if ( $pop_type{ $_ } =~ /d/i ) { $pop_type{ $_ } = 'DH'; } elsif ( $pop_type{ $_ } =~ /r/i ) { $pop_type{ $_ } = 'RIL'; } } # reads and opens map order file print "\n > File with map order: "; do { chomp ( $maporder_file = ); } until ( $maporder_file =~ /^.+\.?.*$/ ); open ( ORDER, "$maporder_file" ) or die "\n-> Program terminated: can't open $maporder_file: $!\n\n"; chomp ( @map_order = ); close ORDER or warn "\nCan't close $maporder_file: $!\n\n"; # reads program mode print "\n > Flip locus pairs with negative distance (y) or not (n): "; do { chomp ( $neg_thresh = ); } until ( $neg_thresh =~ /^[yn]$/i ); if ( $neg_thresh eq 'y' ) { print "\n > negative distance threshold in cM (0 if all neg. distances to be removed): "; do { chomp ( $neg_thresh = ); } until ( $neg_thresh =~ /^-\d+\.*\d*$/ or $neg_thresh == 0 ); } else { $neg_thresh = -1000; } ############################################################################################################### # processes one population after the after and produces matrices containing two-marker distances and LOD scores ############################################################################################################### for $population ( @data_file ) { # opens and redundancy-filters score file from each population @dist_matrix = @lod_matrix = @comp_matrix = @subset_order = ( ); print "\n**************\nProcessing $population...\n"; @_ = &filter_redundant ( $population ); $uni_file = shift @_; $n_samples = shift @_; $n_u_samples = shift @_; @sample_list = sort @{ shift @_ }; %cons_red_samples = %{ shift @_ }; $marker_name = shift @_; %marker_features = %{ shift @_ }; $n_markers = shift @_; $n_u_markers = shift @_; @marker_list = sort @{ shift @_ }; %cons_red_markers = %{ shift @_ }; $score_type = shift @_; %score = %{ shift @_ }; unless ( $score_type eq 'letters' ) { # confirms that data set contains A/B/C/D/H type of scores print "\n-----------------------------\n", "\n=> Program terminated. The data sets does not contain A/B/C/D/H scores. Press\n", " 'Enter' to exit..."; < STDIN >; exit; } $lines{ $population } = $n_u_samples; # registers the number of lines for each popuation @subset_order = @map_order; for ( $i = $#map_order; $i >= 0; $i -- ) { # removes those marker entries from map order list which have not been scored for the currently processed population unless ( defined ${ $score{ $sample_list[ 0 ] } }{ $map_order[ $i ] } ) { splice ( @subset_order, $i, 1 ); $removed ++; } else { $found{ $map_order[ $i ] } = 1; } } for ( $i = $#marker_list; $i >= 0; $i -- ) { # removes entries from marker_list and score hash which are for markers that are not in the map_order list $found = "no"; for ( @subset_order ) { if ( $marker_list[ $i ] eq $_ ) { $found = "yes"; last; } } if ( $found eq "no" ) { $removed = splice ( @marker_list, $i, 1 ); for $sample ( @sample_list ) { delete ${ $score{ $sample } }{ $removed }; } } } if ( $#subset_order != $#marker_list ) { # checks that map_order list has an equal number of markers as the scoring table print "Program terminated: unqual number of markers in the scoring table and the map_order file!\n"; ; exit; } @_ = &find_breakpoints ( $rec_threshold, $score_type, \@subset_order, \@sample_list, \%score ); # finds breakpoints (two-marker intervals with a greater-than rec_threshold recombination frequency @breakpoint = @{ shift @_ }; if ( @breakpoint > 1 ) { print "\n\"$population\" split into " . ( ( scalar @breakpoint ) ) . " groups (REC<$rec_threshold):\n"; } else { print "\n\"$population\" contains a single linkage group (REC<$rec_threshold):\n"; } $to = -1; for $i ( 1 .. @breakpoint ) { # loops through all subgroups of a linkage group and computes within-subgroup map distances $from = $to + 1; unless ( $i == @breakpoint ) { $to = $breakpoint[ $i ]; } else { $to = $#marker_list; } @_ = &compare_markers ( $population, $pop_type{ $population }, $score_type, [ @subset_order[ $from .. $to ] ], \@sample_list, \%score, $max_rec, $min_lod ); @dist_matrix_new = @{ shift @_ }; @lod_matrix_new = @{ shift @_ }; @adj_mapdist = @{ shift @_ }; print "\nLoci " . ( $from + 1 ) . " to " . ( $to + 1 ) . ": "; @_ = &optimise_map_distances ( \@dist_matrix_new, \@lod_matrix_new, $exp ); # least-square minimisation procedure @original_dist = @{ shift @_ }; # simulates JoinMap output (crude) @opt_dist_new = @{ shift @_ }; # simulates JoinMap output (negative distances removed) $original_length = shift @_; $adjusted_length = shift @_; $sum_neg_mapdist = shift @_; $avg_zero_dist = shift @_; $stdev_zero_dist = shift @_; $original_avg_resid = shift @_; $avg_resid = shift @_; $cosegr = shift @_; $negative = shift @_; if ( $adjusted_length > 500 ) { print "\n\nProgram terminated: unrealist length of linkage group (%.0f cM!).\nChange settings and re-run.\nPress \"Enter\" to exit...", $adjusted_length; ; exit; } printf "%.1f cM (w/o neg. dist; originally %.1f cM; sum of adjacent dist. %.1f cM)", $adjusted_length, $original_length, $adj_mapdist[ -1 ]; printf "\n - sum of negative distances = %.2f cM", $sum_neg_mapdist if ( $sum_neg_mapdist ); printf "\n - distance between cosegregating markers: %.2f +/- %.2f cM (mean +/- SD)", $avg_zero_dist, $stdev_zero_dist unless ( $avg_zero_dist eq "-" || $stdev_zero_dist eq "-" ); #printf "\n - change in average residual due to removal of negative distances: from %.2f to %.2f cM\n", $original_avg_resid, $avg_resid; #for $j ( 2 .. $#dist_matrix_new ) { # substitutes measured with fitted map distances # for $k ( 1 .. $j - 1 ) { # ${ $dist_matrix_new[ $j ] }[ $k ] = $opt_dist_new[ $j - 1 ] - $opt_dist_new[ $k - 1 ]; # } #} if ( @dist_matrix ) { # joins distance and LOD matrices from different subgroups of the linkage group $matrix_size = $#dist_matrix; for $j ( 1 .. $#dist_matrix_new ) { push ( @{ $dist_matrix[ 0 ] }, ${ $dist_matrix_new[ 0 ] }[ $j ] ); # adds new marker names into first row push ( @{ $lod_matrix[ 0 ] }, ${ $lod_matrix_new[ 0 ] }[ $j ] ); # adds new marker names into first row push ( @{ $dist_matrix[ $matrix_size + $j ] }, ${ $dist_matrix_new[ 0 ] }[ $j ] ); # adds new marker names into first column push ( @{ $lod_matrix[ $matrix_size + $j ] }, ${ $lod_matrix_new[ 0 ] }[ $j ] ); # adds new marker names into first column } for ( 1 .. $matrix_size - 1 ) { for $j ( 1 .. $#dist_matrix_new ) { push @{ $dist_matrix[ $matrix_size + $j ] }, "-"; push @{ $lod_matrix[ $matrix_size + $j ] }, "-"; } } for $j ( 1 .. $#dist_matrix_new ) { push @{ $dist_matrix[ $matrix_size + $j ] }, ( "-", @{ $dist_matrix_new[ $j ] }[ 1 .. $#{ $dist_matrix_new[ $j ] } ] ); push @{ $lod_matrix[ $matrix_size + $j ] }, ( "-", @{ $lod_matrix_new[ $j ] }[ 1 .. $#{ $lod_matrix_new[ $j ] } ] ); } push @opt_dist, ( @opt_dist_new ); } else { @dist_matrix = @dist_matrix_new; @lod_matrix = @lod_matrix_new; @opt_dist = @opt_dist_new; } } $population =~ /^(.+)\..*$/; open ( DIST, ">$1.kos" ) or die "=> Program terminated: couldn't open \"$1.kos\": $!\n"; for ( @dist_matrix ) { print DIST "@{ $_ }\n"; } close DIST or die "=> Program terminated: couldn't close \"$1.lod\": $!\n"; open ( LOD, ">$1.lod" ) or die "=> Program terminated: couldn't open \"$1.lod\": $!\n"; for ( @lod_matrix ) { print LOD "@{ $_ }\n"; } close LOD or die "=> Program terminated: couldn't close \"$1.lod\": $!\n"; $cM_matrix{ $population } = [ @dist_matrix ]; $LOD_matrix{ $population } = [ @lod_matrix ]; $cM_list{ $population } = [ @opt_dist ]; } for ( values %lines ) { $number_lines += $_; } ############################################################################################################### # equalises diferences between recombination frequencies of pairs of populations ############################################################################################################### #print "\nEqualising recombination rates of populations...\n"; #@_ = &equalise_rec ( \%cM_matrix, \@data_file, $min_stretch_rec, $gini_tolerance ); # %cM_matrix = %{ shift @_ }; # %stretch = %{ shift @_ }; # $gini = shift @_; #print " Factors to equalise recombination rates (Gini coefficient = $gini):\n"; # reports adjustment factors on the screen #for ( @data_file ) { # print " - $_: $stretch{ $_ }\n"; #} ############################################################################################################### # removes markers from the map order list that have not been scored in any of the popuations ############################################################################################################### for ( $i = $#map_order; $i >= 0; $i -- ) { unless ( defined $found{ $map_order[ $i ] } ) { splice ( @map_order, $i, 1 ); } } ############################################################################################################### # prints distances of individual maps ############################################################################################################### open ( IND_DIST, ">DistIndPop.csv" ) or die "=> Program terminated: couldn't open \"DistIndPop.csv\": $!\n"; print IND_DIST "Locus"; for $population ( sort keys %cM_list ) { print IND_DIST ",$population"; } print IND_DIST "\n"; for $locus ( @map_order ) { print IND_DIST "$locus"; for $population ( sort keys %cM_list ) { $found = "no"; for $i ( 1 .. $#{ ${ $LOD_matrix{ $population } }[ 0 ] } ) { if ( $locus eq ${ ${ $LOD_matrix{ $population } }[ 0 ] }[ $i ] ) { $found = $i; last; } } unless ( $found eq "no" ) { print IND_DIST ",${ $cM_list{ $population } }[ $found - 1 ]"; } else { print IND_DIST ",-"; } } print IND_DIST "\n"; } close IND_DIST or die "=> Program terminated: couldn't close \"DistIndPop.csv\": $!\n"; ############################################################################################################### # merges distance matrices by computing weighted average distances ############################################################################################################### print "\nMerging population-based map distance estimates...\n"; for $population ( keys %cM_matrix ) { # registers all fitted distances between adjacent markers in the different populations @new_locus = @{ ${ $cM_matrix{ $population } }[ 0 ] }; for $i ( 2 .. $#new_locus ) { for $j ( 1 .. $i - 1 ) { unless ( ${ ${ $cM_matrix{ $population } }[ $i ] }[ $j ] eq "-" ) { push @{ ${ $distance{ $new_locus[ $i ] } }{ $new_locus[ $j ] } }, $lines{ $population } * ${ ${ $cM_matrix{ $population } }[ $i ] }[ $j ]; push @{ ${ $lod{ $new_locus[ $i ] } }{ $new_locus[ $j ] } }, $lines{ $population } * ${ ${ $LOD_matrix{ $population } }[ $i ] }[ $j ]; push @{ ${ $comp{ $new_locus[ $i ] } }{ $new_locus[ $j ] } }, $lines{ $population }; # sloppy! call rate not taken into account push @{ ${ $distance{ $new_locus[ $j ] } }{ $new_locus[ $i ] } }, $lines{ $population } * ${ ${ $cM_matrix{ $population } }[ $i ] }[ $j ]; push @{ ${ $lod{ $new_locus[ $j ] } }{ $new_locus[ $i ] } }, $lines{ $population } * ${ ${ $LOD_matrix{ $population } }[ $i ] }[ $j ]; push @{ ${ $comp{ $new_locus[ $j ] } }{ $new_locus[ $i ] } }, $lines{ $population }; } } } } @kosambi_row = @new_locus = @new_distance = @lod_row = @new_lod = @comp_row = @new_comp = ( ); for $i ( 1 .. $#map_order ) { # computes weighted average map distances for $j ( 0 .. $i - 1 ) { if ( exists $distance{ $map_order[ $i ] }{ $map_order[ $j ] } ) { if ( @{ ${ $distance{ $map_order[ $i ] } }{ $map_order[ $j ] } } > 1 ) { $dist = $lod = $comp = $sum_squares = $count = 0; for $k ( 0 .. $#{ ${ $distance{ $map_order[ $i ] } }{ $map_order[ $j ] } } ) { $dist += ${ ${ $distance{ $map_order[ $i ] } }{ $map_order[ $j ] } }[ $k ]; $lod += ${ ${ $lod{ $map_order[ $i ] } }{ $map_order[ $j ] } }[ $k ]; $comp += ${ ${ $comp{ $map_order[ $i ] } }{ $map_order[ $j ] } }[ $k ]; $count ++; } for $k ( 0 .. $#{ ${ $distance{ $map_order[ $i ] } }{ $map_order[ $j ] } } ) { # computes stdev of distance estimates $sum_squares += ( ( $dist / $count ) - ${ ${ $distance{ $map_order[ $i ] } }{ $map_order[ $j ] } }[ $k ] ) ** 2; } unless ( $sum_squares == 0 ) { $var = $sum_squares / ( $count - 1 ); ${ $lod{ $map_order[ $i ] } }{ $map_order[ $j ] } = ( 1 / $var ) * $lod / $comp; # LOD weight = weighted average LOD x root of the sum of squares of the distance estimate } else { ${ $lod{ $map_order[ $i ] } }{ $map_order[ $j ] } = ( 1 / 2 ) * $lod / $comp; } ${ $distance{ $map_order[ $i ] } }{ $map_order[ $j ] } = $dist / $comp; # = weighted average distance } else { ${ $distance{ $map_order[ $i ] } }{ $map_order[ $j ] } = ${ ${ $distance{ $map_order[ $i ] } }{ $map_order[ $j ] } }[ 0 ] / ${ ${ $comp{ $map_order[ $i ] } }{ $map_order[ $j ] } }[ 0 ]; ${ $lod{ $map_order[ $i ] } }{ $map_order[ $j ] } = ( 1 / 20 ) * ${ ${ $lod{ $map_order[ $i ] } }{ $map_order[ $j ] } }[ 0 ] / ${ ${ $comp{ $map_order[ $i ] } }{ $map_order[ $j ] } }[ 0 ]; } } else { ${ $distance{ $map_order[ $i ] } }{ $map_order[ $j ] } = "-"; ${ $lod{ $map_order[ $i ] } }{ $map_order[ $j ] } = "-"; } } } @dist_matrix = @lod_matrix = ( ); # creates merged matrices with average distances and LOD weights push @{ $dist_matrix[ 0 ] }, "Average Kosambi distances (cM)"; push @{ $lod_matrix[ 0 ] }, "LOD weights"; for $i ( 0 .. $#map_order ) { push ( @{ $dist_matrix[ 0 ] }, $map_order[ $i ] ); push ( @{ $lod_matrix[ 0 ] }, $map_order[ $i ] ); push ( @{ $dist_matrix[ $i + 1 ] }, $map_order[ $i ] ); push ( @{ $lod_matrix[ $i + 1 ] }, $map_order[ $i ] ); for $j ( 0 .. $i - 1 ) { push ( @{ $dist_matrix[ $i + 1 ] }, ${ $distance{ $map_order[ $i ] } }{ $map_order[ $j ] } ); push ( @{ $lod_matrix[ $i + 1 ] }, ${ $lod{ $map_order[ $i ] } }{ $map_order[ $j ] } ); } } %distance = %lod = ( ); for $i ( 1 .. $#dist_matrix ) { # mirrors the lower half of the matrices across the diagonal to create the upper half of the matrix ${ $dist_matrix[ $i ] }[ $i ] = 0; ${ $lod_matrix[ $i ] }[ $i ] = "-"; for $j ( 0 .. $i - 1 ) { ${ $dist_matrix[ $j ] }[ $i ] = ${ $dist_matrix[ $i ] }[ $j ]; ${ $lod_matrix[ $j ] }[ $i ] = ${ $lod_matrix[ $i ] }[ $j ]; } } ############################################################################################################### # computes map distances for consensus map ############################################################################################################### $loop = "yes"; print "\nFlipping marker pairs with negative distances...\n"; $round = 1; print "\n*Round $round:\n"; print " - start: "; $i = 1; $iteration = 1; until ( $loop eq "no" ) { @_ = &optimise_map_distances ( \@dist_matrix, \@lod_matrix, $exp ); # least-square minimisation procedure @original_dist = @{ shift @_ }; # simulates JoinMap output (crude) @opt_dist = @{ shift @_ }; # simulates JoinMap output (negative distances removed) $original_length = shift @_; $adjusted_length = shift @_; $sum_neg_mapdist = shift @_; $avg_zero_dist = shift @_; $stdev_zero_dist = shift @_; $original_avg_resid = shift @_; $avg_resid = shift @_; $cosegr = shift @_; $negative = shift @_; if ( $iteration == 1 ) { for ( 1.. $#dist_matrix ) { $first_ori_dist{ ${ $dist_matrix[ $_ ] }[ 0 ] } = $original_dist[ $_ - 1 ]; $first_opt_dist{ ${ $dist_matrix[ $_ ] }[ 0 ] } = $opt_dist[ $_ - 1 ]; } } printf "sum neg. dist. = %.2f cM [length = %.1f cM - originally %.1f cM]\n", $sum_neg_mapdist, $adjusted_length, $original_length if ( $sum_neg_mapdist ); unless ( $i == $#original_dist ) { $i += 1; } else { $i = 1; $round ++; print "\n*Round $round:\n"; } until ( $original_dist[ $i ] - $original_dist[ $i - 1 ] < $neg_thresh || $i == $#original_dist ) { $i ++; } unless ( $original_dist[ $i ] - $original_dist[ $i - 1 ] >= $neg_thresh ) { printf " - flip pos. $i (%.2f cM): ", ($original_dist[ $i ] - $original_dist[ $i - 1 ]); splice ( @dist_matrix, $i, 0, splice ( @dist_matrix, $i + 1, 1 ) ); splice ( @lod_matrix, $i, 0, splice ( @lod_matrix, $i + 1, 1 ) ); for $row ( @dist_matrix ) { splice ( @{ $row }, $i, 0, splice ( @{ $row }, $i + 1, 1 ) ); } for $row ( @lod_matrix ) { splice ( @{ $row }, $i, 0, splice ( @{ $row }, $i + 1, 1 ) ); } } else { $found = "no"; for $j ( 1 .. $#original_dist ) { if ( $original_dist[ $j ] - $original_dist[ $j - 1 ] < $neg_thresh ) { $found = "yes"; last; } } if ( $found eq "no" ) { $loop = "no"; } } $iteration ++; } ############################################################################################################### # saves distance and lod matrices with modified marker order (after getting rid of negative distances) and reports result ############################################################################################################### open ( DIST, ">cons.dis" ) or die "=> Program terminated: couldn't open \"cons.dis\": $!\n"; # saves the matrices with Kosambi map distances and LOD scores for ( @dist_matrix ) { print DIST "@{ $_ }\n"; } close DIST or die "=> Program terminated: couldn't close \"cons.dis\": $!\n"; open LOD, ">cons.lod" or die "=> Program terminated: couldn't open \"cons.lod\": $!\n"; for ( @lod_matrix ) { print LOD "@{ $_ }\n"; } close LOD or die "=> Program terminated: couldn't close \"cons.lod\": $!\n"; open OPT_DIST, ">consdist.csv" or die "=> Program terminated: couldn't open \"consdist.csv\": $!\n"; print OPT_DIST "Locus,cM_orig_crude,cM_orig_w/o_neg,cM_reordered\n"; for $i ( 1 .. $#dist_matrix ) { print OPT_DIST "${ $dist_matrix[ $i ] }[ 0 ],$first_ori_dist{ ${ $dist_matrix[ $i ] }[ 0 ] },$first_opt_dist{ ${ $dist_matrix[ $i ] }[ 0 ] },$opt_dist[ $i - 1 ]\n"; } close OPT_DIST or die "=> Program terminated: couldn't close \"consdist.csv\": $!\n"; print "\n\nFINAL RESULT:\n"; printf "%.1f cM (w/o neg. dist; originally %.1f cM)", $adjusted_length, $original_length; printf "\n - sum of negative distances = %.2f cM", $sum_neg_mapdist if ( $sum_neg_mapdist ); printf "\n - distance between cosegregating markers: %.2f +/- %.2f cM (mean +/- SD)", $avg_zero_dist, $stdev_zero_dist unless ( $avg_zero_dist eq "-" || $stdev_zero_dist eq "-" ); #printf "\n - change in average residual due to removal of negative distances: from %.2f to %.2f cM\n", $original_avg_resid, $avg_resid; exit; # subroutines ############################################################################################################### ############################################################################################################### ############################################################################################################### ############################################################################################################### sub find_breakpoints { # identifies potential breack points in linkage group (= marker pairs with greater-than-threshold recombination frequencies) my $rec_threshold = shift @_; my $score_type = shift @_; my @map_order = @{ shift @_ }; my @sample_list = @{ shift @_ }; my %score = %{ shift @_ }; my ( $AA, $AB, $BA, $BB, @breakpoint, $i, $j, $sample, @score_sign, @temp_list, @locus, $recomb, $non_recomb, $merge, ); push @breakpoint, "0"; for $i ( 0 .. $#map_order - 1 ) { # compares adjacent marker pairs @score_sign = ( ); for $j ( $i, $i + 1 ) { @temp_list = ( ); for $sample ( @sample_list ) { push @temp_list, ${ $score{ $sample } }{ $map_order[ $j ] }; } push @score_sign, [ @temp_list ]; } unless ( scalar @{ $score_sign[ 0 ] } == scalar @{ $score_sign[ 1 ] } ) { # checks that the two score signatures of the currently compared markers have the same length print "\n-----------------------------\n\n", "-> Program terminated: there was an unequal number of scores in the signatures\n", " of markers '$locus[ 0 ]' and '$locus[ 1 ]'. Press 'Enter' to exit...\n"; < STDIN >; exit; } $AA = $BB = $AB = $BA = 0; for $j ( 0 .. $#{ $score_sign[ 0 ] } ) { if ( ${ $score_sign[ 0 ] }[ $j ] =~ /[CDH]/i || ${ $score_sign[ 1 ] }[ $j ] =~ /[CDH]/i ) { # script can currently only handle A/B scores! print "\n-----------------------------\n\n", "-> Program terminated: sorry, C/D/H scores can't yet be handled by this program!\n", " Press 'Enter' to exit...\n"; < STDIN >; exit; } if ( ${ $score_sign[ 0 ] }[ $j ] =~ /[AB]/i && ${ $score_sign[ 1 ] }[ $j ] =~ /[AB]/i ) { # creates a consensus of the two scores if both have defined values $merge = &merge_scores ( $score_type, ${ $score_sign[ 0 ] }[ $j ], ${ $score_sign[ 1 ] }[ $j ] ); if ( $merge =~ /X/i ) { $AB ++ if ( ${ $score_sign[ 0 ] }[ $j ] =~/A/i ); $BA ++ if ( ${ $score_sign[ 1 ] }[ $j ] =~/A/i ); } elsif ( $merge =~ /A/i ) { $AA ++; } elsif ( $merge =~ /B/i ) { $BB ++; } } } unless ( $AA + $BB + $AB + $BA == 0 ) { push ( @breakpoint, $i ) if ( 100 * ( $AB + $BA ) / ( $AA + $AB + $BA + $BB ) > $rec_threshold ); # registers breack point if recombination frequency > threshold } } return \@breakpoint; } ################################################################################################################### sub compare_markers { # computes two matrices: (1) a matrix with pairwise Kosambi map distances (in cM) between markers and (2) a matrix with pairwise LOD scores (defined as in Joinmap 3.0) my $data_set = shift @_; # name of file containing data set my $pop_type = shift @_; my $score_type = shift @_; # score type (0/1 or A/B/C/D) my @marker_list = @{ shift @_ }; # array of marker names my @sample_list = @{ shift @_ }; # array of sample names my %score = %{ shift @_ }; # hash of hashes: samples => markers => scores my $max_rec = shift @_; my $min_lod = shift @_; my ( $i, # looping index $j, # looping index $k, # looping index @rowK, # currently processed row of Kosambi map distance matrix @rowL, # currently processed row of LOD score matrix @dist_matrix, # matrix of Kosambi map distances: array of rows of cells @comp_matrix, # matrix of Kosambi map distances: array of rows of cells @cumul_mapdist, # cumulative Kosambi map distances for the marker alligned according to map order (non-optimised values computed by adding nearest-neighbour estimates) @LOD_matrix, # matrix of LOD scores: array of rows of cells @locus, # array of the names of the two loci that are currently being compared @score_sign, # array of two arrays with the score signatures of the two currently compared markers @temp_list, # temporary list of scores $merge, # consensus of two currently compared scores $sameAA, # number of A vs A comparisons for the two currently processed marker signatures $sameBB, # number of B vs B comparisons for the two currently processed marker signatures $recombAB, # number A vs B comparisons (eg, a recombination event) between the two currently processed marker signatures $recombBA, # number B vs A comparisons (eg, a recombination event) between the two currently processed marker signatures $row1, # total of first row of a (virtual) genotype contingency table for a pair of markers $row2, # total of second row of a (virtual) genotype contingency table for a pair of markers $col1, # total of first column of a (virtual) genotype contingency table for a pair of markers $col2, # total of second column of a (virtual) genotype contingency table for a pair of markers $total, # grand total of a (virtual) genotype contingency table for a pair of markers $G1, # AA component of G statistics for a (virtual) genotype contingency table for a pair of markers $G2, # AB component of G statistics for a (virtual) genotype contingency table for a pair of markers $G3, # BA component of G statistics for a (virtual) genotype contingency table for a pair of markers $G4, # BB component of G statistics for a (virtual) genotype contingency table for a pair of markers $LOD, # log10-based LOD score for the currently compared pair of markers (as defined in JoinMap 3.0) $dist, # Kosambi map distance between the two currently compared lists of scores $marker, # currently processed marker $sample, # currently processed sample $rec_fract, # recombination fraction between two markers ); push @dist_matrix, [ ( 'Kosambi map distances (cM)', @marker_list ) ]; push @LOD_matrix, [ ( 'LOD scores', @marker_list ) ]; # computes Kosambi map distance and LOD score for each pair of markers, filling up both halfs (triangles) of two matrices #print "creating matrices for $data_set\n"; #print "|@marker_list|\n"; push @cumul_mapdist, 0; for $i ( 0 .. $#marker_list ) { @rowK = @rowL = @rowC = ( ); push @rowK, $marker_list[ $i ]; # adds marker names to the first columns of the Kosambi distance matrix push @rowL, $marker_list[ $i ]; # adds marker names to the first columns of the LOD score matrix for $j ( 0 .. $i - 1 ) { # computes Kosambi distance and LOD score of one marker ($i) vs. all other markers ($j) which, in the @marker_list array, come after the current marker @locus = ( @marker_list[ $i, $j ] ); @score_sign = ( ); for $marker ( @locus ) { @temp_list = ( ); for $sample ( @sample_list ) { push @temp_list, ${ $score{ $sample } }{ $marker }; } push @score_sign, [ @temp_list ]; } unless ( scalar @{ $score_sign[ 0 ] } == scalar @{ $score_sign[ 1 ] } ) { # checks that the two score signatures of the currently compared markers have the same length print "\n-----------------------------\n\n", "-> Program terminated: there was an unequal number of scores in the signatures\n", " of markers '$locus[ 0 ]' and '$locus[ 1 ]'. Press 'Enter' to exit...\n"; < STDIN >; exit; } $sameAA = $sameBB = $recombAB = $recombBA = 0; # compares individual scores of the two signatures of a marker pair and counts number of recombinants and non-recombinants for $k ( 0 .. $#{ $score_sign[ 0 ] } ) { if ( ${ $score_sign[ 0 ] }[ $k ] =~ /[CDH]/i || ${ $score_sign[ 1 ] }[ $k ] =~ /[CDH]/i ) { # script can currently only handle A/B scores! print "\n-----------------------------\n\n", "-> Program terminated: sorry, C/D/H scores can't yet be handled by this program!\n", " Press 'Enter' to exit...\n"; < STDIN >; exit; } #print "${ $score_sign[ 0 ] }[ $k ]${ $score_sign[ 1 ] }[ $k ] "; $merge = &merge_scores ( $score_type, ${ $score_sign[ 0 ] }[ $k ], ${ $score_sign[ 1 ] }[ $k ] ); # creates a consensus of the two scores if ( $merge =~ /X/i ) { if ( ${ $score_sign[ 0 ] }[ $k ] =~ /A/i ) { $recombAB ++; } else { $recombBA ++; } } elsif ( $merge =~ /[AB]/i ) { unless ( ${ $score_sign[ 0 ] }[ $k ] eq '-' || ${ $score_sign[ 1 ] }[ $k ] eq '-' ) { # excludes comparisons where one of the two scores is missing if ( ${ $score_sign[ 0 ] }[ $k ] =~ /A/i ) { $sameAA ++; } else { $sameBB ++; } } } } $dist = $LOD = "-"; unless ( $sameAA * $sameBB + $recombAB * $recombBA == 0 ) { # computes Kosambi map distance and LOD score from recombination data if recombination rate is less than 50 % $rec_fract = ( $recombAB + $recombBA ) / ( $sameAA + $sameBB + $recombAB + $recombBA ); #$rec_fract = sqrt( $recombAB * $recombBA ) / ( sqrt( $sameAA * $sameBB ) + sqrt( $recombAB * $recombBA ) ); # computes recombination frequency (using the formula from MapManager manual p. 153 which allows for segregation distortion) $rec_fract = $rec_fract / ( 2 - 2 * $rec_fract ) if ( $pop_type eq "RIL" ); # convers, for the RIL lines, their cumulative recombination frequency during RIL generation into recombination frequency per meisis according to Haldene and Waddington (1931) Genetics 16: 357-374 unless ( $rec_fract > $max_rec / 100 ) { $dist = 25 * log ( ( 1 + 2 * $rec_fract ) / ( 1 - 2 * $rec_fract ) ); # computes map distance according to Kosambi formula unless ( $i == 0 ) { # adds the distance between two adjacent markers to the list of cumulative map distances of all markers alligned according to map positions if ( $j == $i - 1 ) { push @cumul_mapdist, $dist + $cumul_mapdist[ -1 ]; } } $G1 = $G2 = $G3 = $G4 = 0; # re-sets contributions to G statistics (see below) to zero $total = $sameAA + $sameBB + $recombAB + $recombBA; # grand total of contingency table $row1 = $sameAA + $recombAB; # total of first row of contingency table $row2 = $recombBA + $sameBB; # total of second row of contingency table $col1 = $sameAA + $recombBA; # total of first col of contingency table $col2 = $recombAB + $sameBB; # total of second col of contingency table unless ( $sameAA == 0 ) { # builds up a G statistics for a (virtual) genotype contingency table for a pair of markers $G1 = 2 * ( $sameAA * log ( $sameAA / ( $row1 * $col1 / $total ) ) ); } unless ( $recombAB == 0 ) { $G2 = 2 * ( $recombAB * log ( $recombAB / ( $row1 * $col2 / $total ) ) ); } unless ( $recombBA == 0 ) { $G3 = 2 * ( $recombBA * log ( $recombBA / ( $row2 * $col1 / $total ) ) ); } unless ( $sameBB == 0 ) { $G4 = 2 * ( $sameBB * log ( $sameBB / ( $row2 * $col2 / $total ) ) ); } $LOD = ( $G1 + $G2 + $G3 + $G4) / ( 2 * log ( 10 ) ); } } if ( $LOD eq "-" || $LOD < $min_lod ) { $dist = $LOD = "-"; } push @rowK, $dist; push @rowL, $LOD; } push @dist_matrix, [ @rowK ]; # adds a row to the matrix with Kosambi map distances push @LOD_matrix, [ @rowL ]; # adds a row to the matrix with LOD scores } return ( \@dist_matrix, \@LOD_matrix, \@cumul_mapdist ); } ################################################################################################################### sub optimise_map_distances { # optimises map distances based on the least-square fitting procedure used by JoinMap 3.0 my @mapdist_matrix = @{ shift @_ }; # matrix of Kosambi map distances: array of rows of cells my @lod_matrix = @{ shift @_ }; # matrix of LOD scores: array of rows of cells my $exp = shift @_; my ( @coefficient, # array of arrays containing the coefficients of a matrix describing a system of linear equations used to optise map distances (as in JoinMap 3.0) $equation_set, # matrix representation of the @coeficient matrix $opt_mapdist, # solution vector representation of equation system containing the optimised map distances between markers (ordered according to the map) @opt_mapdist, # array containing the cumulative Kosambi distances derived from $opt_mapdist @opt_cumul_mapdist, # array containing the cumulative Kosambi distances derived from $opt_mapdist $i, # start of map interval $j, # end of map interval $x, # left-hand marker position within map interval $y, # right-hand marker position within map interval $sum_squares, $original_sum_squares, $rel_sum_squares, $count, $avg_resid, $original_avg_resid, $rel_stdev, $above, $below, $sum_neg_mapdist, @original_opt_mapdist, $sum, $avg_zero_dist, $stdev_zero_dist, $n, $cosegr, $negative, @temp1, @temp2, @fitted, ); $" = ','; for $i ( 0 .. ( $#mapdist_matrix - 2 ) ) { # pre-sets all coefficients of the equation system matrix to zero for $j ( 0 .. ( $#mapdist_matrix - 1 ) ) { ${ $coefficient[ $i ] }[ $j ] = 0; } } # creates an array of array (marix) of the coefficients of the set of equations that need to be solved to minimise the overall sum of squared differences between pairs of "measured" and true map distances for $i ( 1 .. $#lod_matrix ) { # defines start of map interval for $j ( $i + 1 .. $#lod_matrix ) { # defines end of map interval unless ( ${ $mapdist_matrix[ $j ] }[ $i ] eq "-" || ${ $lod_matrix[ $j ] }[ $i ] eq "-" ) { for $x ( $i .. $j - 1 ) { # runs through all two-marker sub-intervals within the boundaries of the selected interval ${ $coefficient[ $x - 1 ] }[ $x - 1 ] += ${ $lod_matrix[ $j ] }[ $i ] ** $exp; # builds up the coefficients in the diagonal of the matrix representing the equation system to be solved (= the two-marker intervals for which the first derivation of the overall sum of squares had been calculated) ${ $coefficient[ $x - 1 ] }[ $#lod_matrix - 1 ] += ( ${ $lod_matrix[ $j ] }[ $i ] ** $exp ) * ( ${ $mapdist_matrix[ $j ] }[ $i ] ) # builds up the last column of the matrix (containing the coefficients on the right-hand side of the equations) } for $x ( $i .. $j - 2 ) { # runs through all possible pairs of two-marker sub-intervals within the boundaries of the selected interval for $y ( $x + 1 .. $j - 1 ) { ${ $coefficient[ $y - 1 ] }[ $x - 1 ] += ${ $lod_matrix[ $j ] }[ $i ] ** $exp; # builds up the off-diagonal coefficients of the matrix representing the equation system to be solved } } } } } for $i ( 1 .. $#lod_matrix - 2 ) { # mirrors the lower half of the matrix across the diagonal to create the upper half of the matrix for $j ( 0 .. $i - 1 ) { ${ $coefficient[ $j ] }[ $i ] = ${ $coefficient[ $i ] }[ $j ]; } } # solves the equation system given by the @coefficient matrix $equation_set = new Math::Matrix ( @coefficient ); $opt_mapdist = $equation_set -> solve; @opt_mapdist = split ( '\n', $opt_mapdist ); $sum_neg_mapdist = 0; for ( @opt_mapdist ) { # removes leading and trailing whitespace characters from map distance values s/^\s+//g; s/\s+$//g; if ( $_ < 0 ) { $sum_neg_mapdist += $_; } } push @original_opt_mapdist, @opt_mapdist; # creates a copy of the crude map distances before starting to modify them # corrects the consecutive two-point distances (computed with the multipoint algorithm), based on the consecutive two-point estimates $cosegr = $negative = 0; $avg_zero_dist = $stdev_zero_dist = $sum = $sum_squares = $n = 0; for $i ( 0 .. $#opt_mapdist ) { #if ( ${ $mapdist_matrix[ $i + 2 ] }[ $i + 1 ] ne "-" && ${ $mapdist_matrix[ $i + 2 ] }[ $i + 1 ] == 0 ) { # only a minority of marker pairs that co-segregated will result in 0 cM distance multipoint estimates! # $sum += $opt_mapdist[ $i ]; # $sum_squares += ( $opt_mapdist[ $i ] ) ** 2; # $n ++; # if ( $opt_mapdist[ $i ] != 0 ) { # $opt_mapdist[ $i ] = 0; # $cosegr ++; # } #} if ( $opt_mapdist[ $i ] < 0 ) { # > 95% of negative multipoint estimates for adjacent markers are in fact 0 cM distances (co-segregating markers) #unless ( ${ $mapdist_matrix[ $i + 2 ] }[ $i + 1 ] eq "-" ) { # $opt_mapdist[ $i ] = ${ $mapdist_matrix[ $i + 2 ] }[ $i + 1 ]; # $negative ++; #} else { $opt_mapdist[ $i ] = 0.001; $negative ++; #} } } if ( $n > 0 ) { # computes average and stdev of the multipoint estimates for 0-cM two-point point distances between pairs of adjacent markers $avg_zero_dist = $sum / $n; } else { $avg_zero_dist = "-"; } if ( $n > 1 ) { $stdev_zero_dist = sqrt( ( $sum_squares - ( 1/ $n ) * ( $sum ) ** 2 ) / ( $n - 1 ) ); } else { $stdev_zero_dist = "-"; } # computes the average residuals (estimated vs. measured map distances) $sum_squares = $original_sum_squares = $count = 0; for $i ( 2 .. $#mapdist_matrix - 1 ) { for $j ( 1 .. $i - 1 ) { unless ( ${ $mapdist_matrix[ $i ] }[ $j ] eq "-" ) { $sum_squares += ( ${ $mapdist_matrix[ $i ] }[ $j ] - ( $opt_mapdist[ $i - 1 ] - $opt_mapdist[ $j - 1 ] ) ) ** 2; $original_sum_squares += ( ${ $mapdist_matrix[ $i ] }[ $j ] - ( $original_opt_mapdist[ $i - 1 ] - $original_opt_mapdist[ $j - 1 ] ) ) ** 2; $count ++; } } } if ( $count > 0 ) { $avg_resid = sqrt ( $sum_squares / $count ); $original_avg_resid = sqrt ( $original_sum_squares / $count ); } else { $avg_resid = "-"; $original_avg_resid = "-"; } # converts two-point distances into cumulative distances push @temp1, 0; push @temp2, 0; for $i ( 1 .. $#opt_mapdist + 1 ) { $temp1[ $i ] = $temp1[ $i - 1 ] + $original_opt_mapdist[ $i - 1]; $temp2[ $i ] = $temp2[ $i - 1 ] + $opt_mapdist[ $i - 1]; } return ( \@temp1, \@temp2, $temp1[ -1 ], $temp2[ -1 ], $sum_neg_mapdist, $avg_zero_dist, $stdev_zero_dist, $original_avg_resid, $avg_resid, $cosegr, $negative ); } ################################################################################################################### sub filter_redundant { # reads, quality-controls and finds scores in a file containing marker features and scores my $file = shift @_; # = name of file to read my ( $new_file, # = name of new files into which non-redundant samples and marker entries are written $format, # = format of the file (comma or tab) $score_type, # = type of scores (0/1) vs. (A/B/C/D/H) @titles, # = array of column titles $row, # = currently processed row @row_cells, # = array of arrays containing cells of columns $sample, # = name of currently processed sample $marker, # = name of currently processed marker $n_markers, # = number of markers (rows) $nu_markers, # = number of unique markers (rows) $n_cols, # = number of columns $markercol, # = number of column containing marker names $marker_name, # = title of column containing marker names $scol1, # = number of first column containing scores $n_samples, # = number of samples (score columns) $nu_samples, # = number of unique samples (score columns) $i, # = looping index $j, # = looping index %marker_scores, # = hash of scores of all markers of the currently processed sample %scores, # = hash of hashs of scores of all markers of all samples %feature_values, # = hash of data for all marker features of the currently processed marker %marker_features, # = hash of hashes of data of marker features of all markers %cons_red_markers, # = hash of scroe consensus values (in %) of redundant markers %cons_red_samples, # = hash of scroe consensus values (in %) of redundant samples @markers, # = array of marker names in alphabetical order @samples ); # = array of sample names in alphabetical order @_ = &read_file ( $file ); # reads a file and outputs: file format / number of columns / number of rows / array of column titles / array of rows $format = shift @_; $n_cols = shift @_; $n_markers = shift @_; @titles = @{ shift @_ }; @row_cells = @{ shift @_ }; for ( @row_cells ) { unless ( $n_cols == @{ $_ } ) { # checks whether all rows have the same number of cells print "\n-----------------------------\n\n", "-> Program terminated: the table in '$file' has at least\n", " one row that has a different number of cells than the first row with the\n", " column titles. Go to Excel and revise the file as follows:\n\n", " 1. Check whether the row with the column titles is complete.\n\n", " 2. Check whether there's no empty row separating the data into two blocks.\n\n", " 3. Check whether all columns with scoring data are filled with scores.\n\n", " 4. Select all empty columns to the right of the table -> right-click\n", " and select 'Delete'.\n\n", " 5. Select all empty rows below the table -> right-click and select 'Delete'.\n\n", " 6. Save file in the csv or txt format and run DArTAmanager again.\n\n\n", " Press 'Enter' to exit...\n"; ; exit; } } $scol1 = &find_scores ( $file, \@titles, \@row_cells ); # finds first column (sample) with scores and counts number of samples $n_samples = $n_cols - $scol1 + 1; for $i ( 0 .. $#row_cells ) { # identifies type of scores for $j ( $scol1 - 1 .. $#titles ) { $_ = &score_type ( $file, ${$row_cells[ $i ] }[ $j ] ); ${$row_cells[ $i ] }[ $j ] = uc ( ${$row_cells[ $i ] }[ $j ] ); # converts scores into upper case ${$row_cells[ $i ] }[ $j ] =~ s/[Xx]/\-/g; # subsitutes unnkown scores denoted by "X" with "-" if ( $_ eq 'numbers' || $_ eq 'letters' ) { if ( $score_type ) { unless ( $_ eq $score_type ) { print "\n-----------------------------\n\n", "-> Program terminated: '$file' contains both '0/1' and\n", " 'A/B/C/D/H' scores. Please check this file and try again!\n", " Press 'Enter' to exit..."; ; exit; } } else { $score_type = $_; } } } } if ( $score_type ne 'numbers' && $score_type ne 'letters' ) { print "\n > Couldn't determine the type of scores in '$file' Type '1' if the\n", " data set contains 0/1 scores or 'A' if it contains A/B/C/D/H scores: "; do { chomp ( $score_type = ); $score_type = 'numbers' if ( $score_type =~ /\b1\b/ ); $score_type = 'letters' if ( $score_type =~ /\bA\b/i ); } until ( $score_type && ( $score_type eq 'numbers' || $score_type eq 'letters') ) } $markercol = &find_marker_names ( $file, $scol1, \@titles, \@row_cells ); # identifies the column that contains marker names $marker_name = $titles[ $markercol - 1 ]; @_ = &remove_red_markers ( $file, $score_type, $markercol, $scol1, \@titles, \@row_cells ); # removes redundant marker entries (retaining consensus scores for each group of redundant markers) @row_cells = @{ shift @_ }; %cons_red_markers = %{ shift @_ }; $nu_markers = @row_cells; @_ = &remove_red_samples ( $file, $score_type, $scol1, \@titles, \@row_cells ); # removes replicate sample entries (retaining consensus scores for each group of replicate samples) $nu_samples = shift @_; @titles = @{ shift @_ }; @row_cells = @{ shift @_ }; %cons_red_samples = %{ shift @_ }; for $j ( $scol1 - 1 .. $#titles ) { # creates a hash of hashes containing the scores of all markers of all samples for $i ( 0 .. $#row_cells ) { $marker_scores{ ${ $row_cells[ $i ] }[ $markercol - 1 ] } = ${ $row_cells[ $i ] }[ $j ]; } $scores{ $titles[ $j ] } = { %marker_scores }; } @samples = sort keys %scores; for $i ( 0 .. $#row_cells ) { # creates a hash of hashes containing the features of each feature type of all markers %feature_values = ( ); for $j ( 0 .. $scol1 - 2 ) { unless ( $j == $markercol - 1 ) { $feature_values{ $titles[ $j ] } = ${ $row_cells[ $i ] }[ $j ]; } } $marker_features{ @{ $row_cells[ $i ] }[ $markercol - 1 ] } = { %feature_values }; } @markers = sort keys %marker_features; unless ( ( $file =~ /.+\.uni$/i ) && ( $n_markers == $nu_markers ) && ( $n_samples == $nu_samples ) ) { $file =~ /^(.+)\..*$/; # creates a file to ouput non-redundant score entries $new_file = "$1.uni"; open ( UNIQUE, ">$new_file" ) or die "\n-> Program terminated: can't create '$new_file' to output\n the scoring table with non-redundant entries: $!\n"; print UNIQUE "Scoring table from file '$file' with redundant sample and marker entries collapsed into consensus entries\n\n"; unless ( $n_markers == $nu_markers ) { # outputs information on redundant entries found print UNIQUE " -> There where $nu_markers unique marker"; print UNIQUE "s" if ( $nu_markers > 1 ); print UNIQUE " among a total of $n_markers markers\n"; } else { print UNIQUE " -> There where no redundant marker entries\n"; } unless ( $n_samples == $nu_samples ) { print UNIQUE " -> There where $nu_samples unique sample"; print UNIQUE "s" if ( $nu_samples > 1 ); print UNIQUE " among a total of $n_samples samples\n\n"; } else { print UNIQUE " -> There where no redundant sample entries\n\n"; } unless ( ( $n_markers == $nu_markers ) && ( $n_samples == $nu_samples ) ) { print UNIQUE "Here's the redundancy-filtered scoring table (redundant markers and/or samples collapsed into consensus entries):\n\n"; } else { print UNIQUE "Here's the original scoring table which did not contain any redundant entries:\n\n" ; } if ( $n_samples != $nu_samples ) { # prints a row containing % score-consensus values for redundant sample entries if ( $n_markers != $nu_markers ) { print UNIQUE ","; } for ( keys %{ $marker_features{ $markers[ 0 ] } } ) { print UNIQUE ","; } print UNIQUE "Consensus(%) ->"; for ( sort keys %scores ) { if ( $cons_red_samples{ $_ } ) { print UNIQUE ",$cons_red_samples{ $_ }"; } else { print UNIQUE ",-"; } } print UNIQUE "\n"; } if ( $n_markers != $nu_markers ) { # prints row containing column titles print UNIQUE "Consensus(%),"; } print UNIQUE "$marker_name"; for ( sort keys %{ $marker_features{ $markers[ 0 ] } } ) { print UNIQUE ",$_"; } print UNIQUE ","; print UNIQUE "@samples\n"; for $marker ( sort keys %marker_features ) { # prints rows containing marker features and scores if ( $n_markers != $nu_markers ) { if ( $cons_red_markers{ $marker } ) { print UNIQUE "$cons_red_markers{ $marker },"; } else { print UNIQUE "-,"; } } print UNIQUE "$marker"; for ( sort keys %{ $marker_features{ $marker } } ) { print UNIQUE ",${ $marker_features{ $marker } }{ $_ }"; } for $sample ( sort keys %scores ) { print UNIQUE ",${ $scores{ $sample } }{ $marker }"; } print UNIQUE "\n"; } if ( $n_markers != $nu_markers ) { # prints information on marker and sample redundancy on screen print "\n ==> ", $n_markers - $nu_markers," redundant marker"; print "s" if ( $n_markers - $nu_markers > 1 ); print " transferred to '$1_red$marker_name"."s".".csv'.\n"; } else { print "\n ==> There were no redundant markers.\n"; } if ( $n_samples != $nu_samples ) { print "\n ==> ", $n_samples - $nu_samples, " redundant sample"; print "s" if ( $n_samples - $nu_samples > 1 ); print " transferred to '$1_redSamples.csv'.\n"; } else { print "\n ==> There were no redundant samples.\n"; } print "\n ==> The non-redundant data set with ", $nu_samples, " unique sample"; print "s" if ( $nu_samples > 1 ); print " and ", $nu_markers, " unique marker"; print "s" if ( $nu_markers > 1 ); print "\n was saved as '$1.uni'.\n"; # outputs information on redundant samples and markers found in file close UNIQUE or die "\n-> Program terminated: can't close '$new_file'.\n\n"; } else { print "\n ==> There were no redundant marker or samples.\n"; } return ( $new_file, $n_samples, $nu_samples, \@samples, \%cons_red_samples, $marker_name, \%marker_features, $n_markers, $nu_markers, \@markers, \%cons_red_markers, $score_type, \%scores ); } ################################################################################################################### sub read_file { # opens a text file, checks its format and reads column titles and rows my $file = shift @_; # = name of file to read my ( @header, # = array containing header (= leading rows before the row with the column titles) $title_row, # = row containing column titles @titles, # = array of column titles @rows, # = array of rows excluding title row and rows above title row @row_cells, # = array of arrays containing cells of columns $comma, # = number of commas in title row $tab, # = number of tabs in title row $format, # = delimiter used in file (comma or tab) $i, # = looping index $j ); # = looping index open ( FILE, "$file" ) or die "\n-> Program terminated: can't open $_[0]: $!\n\n"; # reads file content push @rows, ; chomp @rows; close FILE or warn "\nCan't close $file: $!\n"; for ( @rows ) { s/\s+$//g; s/^",+(.+)",/$1,/; s/^"(.+),+",/$1,/; s/^"(.+),+(.+)",/$1;$2,/; s/,",+(.*)",/,$1,/g; s/,"(.*),+",/,$1,/g; s/,"(.*),+(.*)",/,$1;$2,/g; } $i = $#rows; # identifies and excises header rows above the row that contains column titles do { unless ( $rows[ $i ] =~ /[,\t][01ABCDHX\-]$/i ) { $rows[ $i ] =~ /^([^,^\t]+)[,\t]/; print "\n > It looks like as if the column titles are in row no. ", $i + 1; if ( $1 ) { print " (first cell = \n '$1'). "; } else { print ".\n "; } print "Type 'n' if unsure: "; chomp ( $_ = ); if ( /\bn\b/i ) { print "\n > Here's a list of the leftmost cells from the first rows of the file: \n\n"; for $j ( 1 .. $i + 4 ) { $rows[ $j - 1 ] =~ /^([^,^\t]+)[,\t]/; if ( $rows[ $j - 1 ] ne "" ) { if ( $1 ) { printf "%2d: %s", $j, $1; print "\t<- row with column titles?" if ( $j == $i + 1 ); print "\n"; } else { printf "%2d: %s\n", $j, "empty"; } } else { printf "%2d: %s\n", $j, "empty"; } } print "\n > Please select the number of the row that has the column titles: "; chomp ( $_ = ); until ( ( $_ =~ /\d+/ ) && ( $_ >= 1 ) && ( $_ <= $#rows ) ) { print "\n > This is outside of the range of possible row numbers. Try again: "; chomp ( $_ = ); }; $i = $_ - 1; } @header = splice ( @rows, 0, $i ) unless ( $i == 0 ); } $i--; } until ( @header || $i <= 1 ); if ( $i == $#rows ) { print "\n-----------------------------\n", "\n-> Program terminated: the bottom/right corner of '$file'\n", " doesn't contain a score. Go to Excel and revise the file as follows:\n\n", " 1. Make sure that the scores are in the bottom/right part of the table.\n\n", " 2. Make sure that there are no additional data below the scoring table.\n\n", " 3. Select all empty columns to the right of the table -> right-click\n", " and 'Delete'.\n\n", " 4. Select all empty rows below the table -> right-click and 'Delete'.\n\n", " 5. Save file in the csv or txt format and run DArTAmanager again.\n\n\n", " Press 'Enter' to exit...\n"; ; exit; } $title_row = splice ( @rows, 0, 1 ); # excises row containing column titles $comma = ( $title_row =~ tr/,// ); # checks whether comma-separated or tab-delimited file $tab = ( $title_row =~ tr/\t// ); if ( ( $tab == 0 ) && ( $comma > 0 ) ) { $format = ","; } elsif ( ( $tab > 0 ) && ( $comma < $tab ) ) { $format = "\t"; } else { print "\n-----------------------------\n\n", "-> Program terminated: '$file' doesn't appear to be a\n", " comma or tab-delimited text file! (Or perhaps you've selected the wrong\n", " row with column titles?)\n", " Press 'Enter' to exit..."; ; exit; } push @titles, split ( /$format/, $title_row ); # creates array with column titles and removes trailing spaces at the end of individual titles for ( @titles ) { # removes leading and trailing whitspace from column titles s/^\s+//g; s/\s+$//g; } for ( @rows ) { # converts each row into an array of cells, and creates an array of arrays containing cells of rows push @row_cells, [ split ( /$format/, $_ ) ]; for ( @{ $row_cells[ -1 ] } ) { s/^\s+//g; # removes leading and trailing whitspace from all cells s/\s+$//g; } } return ( $format, scalar @titles, scalar @rows, \@titles, \@row_cells, \@header ); # returns the format of the file, number of columns, number of rows, array of column titles and array of rows } ################################################################################################################### sub find_scores { # identifies first column with scoring data in each table my $file = shift @_; # = name of file my @titles = @{ shift @_ }; # = array of column titles my @row_cells = @{ shift @_ }; # = array of arrays of cells of rows my ( $scol1, # = number of first column with scores $i, # = looping index for columns $j, # = looping index for rows $found, # = "non-score" once the first cell from the right end of the table with a non-score has been found $where, # = number of row in last column where a non-valid score entry was found $unsure, # if = "n" then user can define first score column $to_print ); # = number modifier for printing for ( $i = @titles - 1; $i >= 0 ; $i-- ) { # searches cell values of columns, starting from the right end for $j ( 0 .. @row_cells - 1 ) { unless ( ${ $row_cells[ $j ] }[ $i ] =~ /^[01ABCDHX\-]$/i ) { # checks whether a particular cell contains a score $found = "yes"; # indicates that a non-score value has been found $where = $j + 1; } } if ( $found ) { # decides what to do if once the first non-score from the right has been found if ( $i == @titles - 1 ) { if ( $where == 1 ) { $to_print = "st"; } elsif ( $where == 2 ) { $to_print = "nd"; } elsif ( $where == 3 ) { $to_print = "rd"; } else { $to_print = "th"; } print "\n-----------------------------\n\n", "-> Program terminated: The ". $where ."$to_print data row of the scoring table in\n", " '$file' contains at least one non-valid score entry (valid\n", " entries are: 0,1,A,a,B,b,C,c,D,d,H,h,-,X or x).\n\n", " Please revise this file and try again. Press 'Enter' to exit..."; ; exit; } else { $scol1 = $i + 2; print "\n > It looks like as if '$titles[ $scol1 - 1 ]' (column no. $scol1) is the first \n", # asks user whether to accept first column with scores " column with scores. Type 'n' if unsure: "; chomp ( $unsure = ); if ( $unsure =~ /\bn\b/i ) { # allows user to modify first-score column selection $scol1 = 1; print "\n > Here's a list of column titles...\n\n"; for ( @titles ) { # prints column titles on the screen printf "%3d: %s", $scol1, $_; if ( $scol1 == $i + 1 ) { print "\t\t<- last column with marker features..?\n"; } elsif ( $scol1 == $i + 2 ) { print "\t\t<- first column with scores..?\n"; } else { print "\n"; } $scol1++; } $scol1 = 1; print "\n > Please select the number of the first column that contains scores: "; chomp ( $scol1 = ); while ( ( $scol1 > @titles ) || ( $scol1 < 2 ) ) { print "\n > This is outside the valid range of columns. Try again: "; # rejects too large and too small column numbers chomp ( $scol1 = ); } if ( $scol1 <= $i + 1 ) { print "\n-----------------------------\n\n", # rejects lower column numbers than that which has been recognized as the first column with scores "-> Program terminated: there is at least one non-valid score entry in the\n", " '$titles[ $i ]' column"; print " (and possibly more in previous columns)" unless ( $scol1 == $i + 1 ); print ".\n Please check your file and try again. Press 'Enter' to exit..."; ; exit; } } return $scol1; } } } print "\n-> Program terminated: '$file' appears to contain only scores \n", # terminates if all cells contain scores " but no marker information. Press 'Enter' to exit..."; ; exit; } ################################################################################################################### sub find_marker_names { # finds column containing marker names my $file = shift @_; # = name of file my $scol1 = shift @_; # = number of first column with scores my @titles = @{ shift @_ }; # = array of column titles my @row_cells = @{ shift @_ }; # = array of arrays containing cells of rows my $i = 0; # = looping index my $j = 0; # = looping index my $accept = "?"; # = 'n' if user doesn't accept the marker name column offered by the program my $markercol; # = number of column with marker names do { for $j ( 0 .. $scol1 - 2 ) { if ( ${ $row_cells[ $i ] }[ $j ] =~ /^[a-z]{1,2}[A-Z][A-Za-z]{1,3}-\d{4}[AB]?$/ ) { # searches for DArT marker names $markercol = $j + 1; print "\n > Does the '$titles[ $j ]' column (no. ", $j + 1, ") contain the marker names? (y/n): "; chomp ( $accept = ); unless ( $accept =~ /\bn\b/i ) { return $markercol; } } } $i++; } until ( $i == $#row_cells || $accept =~ /\bn\b/i ); for ( @titles[ 0 .. $scol1 - 2 ] ) { # searches for a titles of a column with marker features that contains the string "marker" if ( /markers*/i ) { unless ( ( $markercol ) && ( $_ eq $titles[ $markercol - 1 ] ) ) { print "\n > Does the '$_' column contain the marker names? (y/n): "; chomp ( $accept = ); unless ( $accept =~ /\bn\b/i ) { return ( $markercol = $j + 1 ); } } } } $i = 0; do { for $j ( 0 .. $scol1 - 2 ) { if ( ${ $row_cells[ $i ] }[ $j ] =~ /^\d{12}_[A-P]_\d{1,2}$/i ) { # searches for DArT clone names as an alternative to marker names print "\n > Do you want to use DArT clone names in the '$titles[ $j ]' column as\n", " unique identifiers? (y/n): "; chomp ( $accept = ); unless ( $accept =~ /\bn\b/i ) { return ( $markercol = $j + 1 ); } } } $i++; } until ( $i == $#row_cells || $accept =~ /\bn\b/i ); if ( $scol1 == 2 ) { print "\n ==> Assumed that marker names are in column no. 1.\n"; return ( $markercol = 1 ); } print "\n > Here's a list of the titles of columns that contain marker features...\n\n"; # asks user to identify column with marker names or an alternative unique identifier for $i ( 1 .. $scol1 - 1 ) { # prints column titles on the screen if ( $titles[ $i - 1 ] ) { printf "%3d: %s\n", $i, $titles[ $i - 1 ]; } else { printf "%3d: %s\n", $i, 'empty'; } } print "\n > Please select the number of the column that contains marker names or any\n", " other unique identifier you wish to use: "; chomp ( $markercol = ); while ( ( $markercol >= $scol1 ) || ( $markercol < 1 ) ) { # rejects too big and too small column numbers print "\n > This is outside the valid range of columns. Try again: "; chomp ( $markercol = ); } return $markercol; } ################################################################################################################### sub remove_red_markers { # removes groups of redundant marker entries into a separate file, retaining one entry per group with the consensus scores my $file = shift @_; # = file name my $score_type = shift @_; # = type of scores (0/1) vs. (A/B/C/D/H) my $markercol = shift @_; # = number of column with marker names my $scol1 = shift @_; # = number of first column containing scores my @titles = @{ shift @_ }; # = array of column titles my @row_cells = @{ shift @_ }; # = array of arrays of cells of rows my $i = 0; # = looping index for processing of individual rows my $j = 0; # = nested looping index for second row to be compared to first row ($i) my $discordant = 0; # = counts number of discordant calls in a comparison of two or more score lists my $missing = 0; # = counts number of missing calls in a comparison of two or more score lists my $entry = "new"; # = indicates whether or not a particular marker has already been compared against my $n_markers = @row_cells; # = number of markers (= rows) my $nu_markers = $n_markers; # = number of unique markers (= rows) (starting value = total number) my ( $new_file, # = name of new file containing redundant marker entries @red_row, # = array of cells of a row belonging to a redundant marker entry @consensus, # = running consensus scores for the currently processed group of redundant marker entries %score_consensus ); # = hash of scoring consensus of all markers with multiple entries in % ( = 100 - % of scores that are discordant in at least one pair of score lists in a group of score lists for a redundant entry) for ( @row_cells ) { # loops through all rows, starting at row 0 @consensus = @{ $_ }[ $scol1 - 1 .. $#titles ]; for ( $j = $#row_cells; $j >= $i + 1; $j-- ) { # loops, for each row, through all other rows with higher indices if ( ${ $row_cells[ $i ] }[ $markercol - 1 ] eq ${ $row_cells[ $j ] }[ $markercol - 1 ] ) { # checks whether two rows contain data for the same entry (marker) if ( $nu_markers == $n_markers ) { $file =~ /^(.+)\..*$/; $new_file = "$1_red$titles[ $markercol - 1 ]s.csv"; open ( RED_ROWS, ">$new_file" ) or die "\n-> Program terminated: can't create '$new_file' to output\n groups of redundant markers: $!\n"; print RED_ROWS ",Consensus(%),@titles\n"; } print RED_ROWS ">>>>>>>>,>>>>>>>>\n" unless ( ( $entry eq "rep" ) || ( $nu_markers == $n_markers ) ); print RED_ROWS ",,@{ $row_cells[ $i ] }\n" unless ( $entry eq "rep" ); # writes first copy of redundant entry into new file @red_row = @{ splice ( @row_cells, $j, 1 ) }; # removes row with redundant entry print RED_ROWS ",,@red_row\n"; # writes next copy of redundant entry into new file @_ = &merge_score_lists ( $score_type, ${ $row_cells[ $i ] }[ $markercol - 1 ], $red_row[ $markercol - 1 ], \@consensus, [ @red_row[ $scol1 - 1 .. $#titles ] ] ); # builds the consensus scores for a marker with redundant entries @consensus = @{ $_[ 4 ] }; for ( @consensus ) { s/X/discordant/; # assigns "discordant" label to a pair of discordant scores so that it is remembered over several loops of merging pairs of redundant lists of scores } $nu_markers--; $entry = "rep"; } } if ( $entry eq "rep" ) { for ( @consensus ) { $discordant ++ if ( $_ eq "discordant" ); $missing ++ if ( $_ eq "-" ); s/discordant/X/; } $score_consensus{ ${ $row_cells[ $i ] }[ $markercol - 1 ] } = 100 * ( @consensus - $discordant - $missing ) / ( @consensus - $missing ); $discordant = $missing = 0; @{ $row_cells[ $i ] }[ $scol1 - 1 .. $#titles ] = @consensus; # overwrites the single remaining entry of a marker with (originally) multiple entries with the consensus scores print RED_ROWS "Consensus ->,$score_consensus{ ${ $row_cells[ $i ] }[ $markercol - 1 ] },@{ $row_cells[ $i ] }\n"; # writes the consensus scores for a marker into the separate file containing reduandant markers } $entry = "new"; $i++; } unless ( $nu_markers == $n_markers ) { close RED_ROWS or die "\n-> Program terminated: can't close '$new_file'\n\n"; } return ( \@row_cells, \%score_consensus ); } ################################################################################################################### sub remove_red_samples { # removes groups of redundant samples (columns) into a separate file, retaining one column per group with the consensus scores my $file = shift @_; # = file name my $score_type = shift @_; # = type of scores (0/1) vs. (A/B/C/D/H) my $scol1 = shift @_; # = number of first column containing scores my @titles = @{ shift @_ }; # = array of column titles my @row_cells = @{ shift @_ }; # = array of arrays of cells of rows (excluding first row with column titles) my $n_samples = @titles - $scol1 + 1; # = number of samples (columns with scores) my $nu_samples = $n_samples; # = number of unique samples my @red_titles = @titles[0 .. $scol1 - 2 ]; # = array of titles of redundant columns (and the leading columns containing marker features) my $status = "new"; # = indicates whether a particular sample (column) has already been found to be redundant my $n = 1; # = array index indicating the number of columns of a particluar sample with redundant entries my $discordant = 0; # = counts number of discordant calls in a comparison of two or more score lists my $missing = 0; # = counts number of missing calls in a comparison of two or more score lists my ( $new_file, # = name of new file containing redundant sample entries $i, # = looping index $j, # = looping index $sample, # = name of currently processed sample %red_data, # = hash of an array of arrays containing scores of redundant entries of individual samples @consensus, # = array of consensus scores for a group of replicates of a particular sample @separator, # = array of empty entries (number of entries = number of rows) @red_samples, # = array of names of redundant samples @new_col, # = new column with redundant scores @to_print, # = array of empty of spacer cells to print into file %score_consensus ); # = hash of scoring consensus of all markers with multiple entries in % ( = 100 - % of scores that are discordant in at least one pair of score lists in a group of score lists for a redundant entry) for $i ( $scol1 - 1 .. $#titles ) { # loops through all score columns, starting at the first column containing scores for ( $j = $#titles; $j > $i; $j-- ) { # loops, for each score column, through all other score columns with higher indices if ( $titles[ $i ] eq $titles[ $j ] ) { # checks whether two columns contain scores for the same sample if ( $status eq "new" ) { @{ $red_data{ $titles[ $i ] } }[ 0 ] = [ ( ) ]; for ( @row_cells ) { push @{ @{ $red_data{ $titles[ $i ] } }[ 0 ] }, ${ $_ }[ $i ]; # saves the scores of the first entry of a particular sample push @consensus, ${ $_ }[ $i ]; # creates an array containing the running-consensus scores } } @{ $red_data{ $titles[ $i ] } }[ $n ] = [ ( ) ]; for ( @row_cells ) { # removes scores from redundant column and transfers them into separate array push @{ @{ $red_data{ $titles[ $i ] } }[ $n ] }, splice ( @{ $_ }, $j, 1 ); } $nu_samples--; @_ = &merge_score_lists ( $score_type, $titles[ $i ], $titles[ $j ], \@{ @{ $red_data{ $titles[ $i ] } }[ $n ] }, \@consensus ); # builds the consensus scores for a sample represented by redundant columns @consensus = @{ $_[ 4 ] }; for ( @consensus ) { s/X/discordant/; # assigns "discordant" label to a pair of discordant scores so that it is remembered over several loops of merging pairs of redundant lists of scores } splice ( @titles, $j, 1 ); $status = "rep"; $n++; } } if ( $status eq "rep" ) { for ( @consensus ) { # converts "discordant" label into "X" for output into file $discordant ++ if ( $_ eq "discordant" ); $missing ++ if ( $_ eq "-" ); s/discordant/X/; } $score_consensus{ $titles[ $i ] } = 100 * ( @consensus - $discordant - $missing ) / ( @consensus - $missing ); $discordant = $missing = 0; @{ $red_data{ $titles[ $i ] } }[ $n ] = [ ( ) ]; for ( @consensus ) { push @{ @{ $red_data{ $titles[ $i ] } }[ $n ] }, $_; # writes the consensus scores (and spacer) into the array containing groups of redundant samples } for ( @row_cells ) { push @separator, ""; } ${ $red_data{ $titles[ $i ] } }[ $n + 1 ] = \@separator; push @red_samples, $titles[ $i ]; for ( @row_cells ) { ${ $_ }[ $i ] = shift @consensus; # overwrites the single remaining column representing a sample with originally multiple entries with the consensus scores } } $status = "new"; $n = 1; } unless ( $n_samples == $nu_samples ) { # outputs redudant sample entries plus the consensus scores into a separate file @red_samples = sort ( @red_samples ); # creates column titles for redundant sample entries for ( @red_samples ) { push @red_titles, ( $_, $_, "Consensus", "||" ); } pop @red_titles; $file =~ /^(.+)\..*$/; $new_file = "$1_redSamples.csv"; open ( RED_COLS, ">$new_file" ) or die "\n-> Program terminated: can't create '$new_file' to output\n groups of redundant samples: $!\n"; for ( 1 .. $scol1 - 2 ) { print RED_COLS ","; } print RED_COLS "Consensus(%) ->"; $i = "first sample"; for $sample ( sort keys %red_data ) { for ( 1 .. $#{ $red_data{ $sample } } - 1 ) { push @to_print, "-" ; } print RED_COLS ",||" if ( $i eq "subsequent sample" ); print RED_COLS ",@to_print,$score_consensus{ $sample }"; @to_print = ( ); $i = "subsequent sample"; } print RED_COLS "\n"; print RED_COLS "@red_titles\n"; for $i ( 0 .. $#row_cells ) { print RED_COLS "@{ $row_cells[ $i ] }[ 0 .. $scol1 - 2 ],"; for $sample ( sort keys ( %red_data ) ) { for ( @{ $red_data{ $sample } } ) { print RED_COLS "${ $_ }[ $i ],"; } } print RED_COLS "\n"; } close RED_COLS or die "\n-> Program terminated: can't close '$new_file': $!\n\n"; } return ( $nu_samples, \@titles, \@row_cells, \%score_consensus ); } ################################################################################################################### sub merge_tables { # merges two tables (data sets) into a single strict-consensus data set my $score_type = shift @_; # = type of scores (0/1) or (A/B/C/D/H) my @previous_files = @{ shift @_ }; # = list of previous files contained in current consensus table my $new_file = shift @_; # = name of new file to be merged with current consensus data set my %cons_scores = %{ shift @_ }; # = hash of hashes: samples => markers => scores (in the current consensus table) my %scores = %{ shift @_ }; # = hash of hashes: samples => markers => scores (in the next table to be merged with the current consensus table) my ( $marker, # = currently processed marker $sample, # = currently processed sample $overlap ); # = number of genotype calls (scores) overlapping between the two data sets my $discordant = 0; # = number of discordant genotype call overlaps for $sample ( keys %scores ) { # adds scores for sample x marker pairs from the new table to the consensus table in case there're no values for these sample x marker pairs in the consensus table for $marker ( keys %{ $scores{ $sample } } ) { if ( exists ${ $cons_scores{ $sample } }{ $marker } ) { $overlap ++; @_ = &merge_scores ( $score_type, ${ $cons_scores{ $sample } }{ $marker }, ${ $scores{ $sample } }{ $marker } ); if ( $_[ 0 ] =~ /^X$/i ) { ${ $cons_scores{ $sample } }{ $marker } = 'discordant'; $discordant ++; } else { ${ $cons_scores{ $sample } }{ $marker } = $_[ 0 ]; } } else { ${ $cons_scores{ $sample } }{ $marker } = ${ $scores{ $sample } }{ $marker }; } } } return \%cons_scores, $overlap, $discordant; } ################################################################################################################### sub merge_score_lists { # merges scores from two 1-dimensional lists (eg, one marker scored for many samples) my $score_type = shift @_; # type of scores my @lists = @_[ 0 .. 1 ]; # array of names of the two lists of scores my @scores = @_[ 2 .. 3 ]; # array of the two arrays of scores my $concord = 0; # number of concordant genotype calls my $discord = 0; # number of discordant genotype calls my $single = 0; # number of genotype calls that are unknown in one table but defined in the other my $missing = 0; # number of genotype calls that are unknown in both tables my $j = 0; # looping index my ( $i, # looping index @con_scores # list of consensus scores (merged scores) ); # checks whether the lengths of the two score lists are identical unless ( scalar @{ $scores[ 0 ] } == scalar @{ $scores[ 1 ] } ) { print "\n-----------------------------\n\n", "-> Program terminated: there was an unequal number of scores in two lists\n", " of scores to be merged. Press 'Enter' to exit...\n"; < STDIN >; exit; } # merges the two lists of scores, counts number of concordant, discordant, single and missing scores and # converts "discordant" label of score pairs highlighted to be discordant in a previous round of merging into "X" for $i ( 0 .. $#{ $scores[ 0 ] } ) { push @con_scores, &merge_scores ( $score_type, ${ $scores[ 0 ] }[ $i ], ${ $scores[ 1 ] }[ $i ] ); if ( $con_scores[ $i ] =~ /X/i ) { $discord ++; s/discordant/X/; } elsif ( $con_scores[ $i ] eq '-' ) { $missing ++; } elsif ( $con_scores[ $i ] =~ /[01ABCDH]/i ) { unless ( ${ $scores[ 0 ] }[ $i ] eq '-' || ${ $scores[ 1 ] }[ $i ] eq '-' ) { $concord ++; } else { $single++; } } } return ( $concord, $discord, $single, $missing, \@con_scores ); } ################################################################################################################### sub merge_scores { # merges a pair of scores into a consensus score my $score_type = shift @_; # = type of scores (0/1) or (A/B/C/D/H) my $scoreA = shift @_; # = replicate score no. 1 my $scoreB = shift @_; # = replicate score no. 2 my $consensus; # = consensus score if ( !$score_type || ( $score_type ne 'numbers' && $score_type ne 'letters' ) ) { # tries to identify the type of scores in cases it is not known yet $score_type = &score_type ( 'A pair of scores', $scoreA, $scoreB ); } if ( $score_type eq 'numbers' ) { # checks scoring consistency for a pair of lists with 0/1 scores if ( !( $scoreA =~ /[01X\-]/i ) && !( $scoreB =~ /[01X\-]/i ) ) { print "\n-----------------------------\n\n", "-> Program terminated: one of your data sets appears to contain non-valid\n", " scores (instead of the 0/1 scores that were expected). Press 'Enter' to exit..."; ; exit; } elsif ( ( $scoreA eq "discordant" ) || ( $scoreB eq "discordant" ) ) { $consensus = "X"; } else { if ( $scoreA =~ /^[X\-]$/i && $scoreB =~ /^[X\-]$/i ) { $consensus = "-"; } else { if ( $scoreA eq $scoreB ) { $consensus = uc ( $scoreA ); } elsif ( $scoreA =~ /^[X\-]$/i ) { $consensus = uc ( $scoreB ); } elsif ( $scoreB =~ /^[X\-]$/i ) { $consensus = uc ( $scoreA ); } else { $consensus = "X"; } } } } elsif ( $score_type eq 'letters' ) { # checks scoring consistency for table pairs with A/B/C/D/H type of scores if ( !( $scoreA =~ /[ABCDHX\-]/i ) && !( $scoreB =~ /[ABCDHX\-]/i ) ) { print "\n-----------------------------\n\n", "-> Program terminated: one of your data sets appears to contain non-valid\n", " scores (instead of the A/B/C/D/H scores that were expected).\n", " Press 'Enter' to exit..."; ; exit; } elsif ( ( $scoreA eq "discordant" ) || ( $scoreA eq "discordant" ) ) { $consensus = "X"; } else { if ( $scoreA =~ /^[X\-]$/i && $scoreB =~ /^[X\-]$/i ) { $consensus = "-"; } else { if ( uc ( $scoreA ) eq uc ( $scoreB ) ) { $consensus = uc ( $scoreA ); } elsif ( $scoreA =~ /^[\-X]$/i ) { $consensus = uc ( $scoreB ); } elsif ( $scoreB =~ /^[X\-]$/i ) { $consensus = uc( $scoreA ); } elsif ( ( $scoreA =~ /^H$/i ) || ( $scoreB =~ /^H$/i ) ) { if ( ( $scoreA =~ /^[AB]$/i) || ( $scoreB =~ /^[AB]$/i ) ) { $consensus = "X"; } else { $consensus = "H"; } } elsif ( $scoreA =~ /^C$/i ) { if ( $scoreB =~ /^B$/i ) { $consensus = "B"; } else { $consensus = "X"; } } elsif ( $scoreA =~ /^D$/i ) { if ( $scoreB =~ /^A$/i ) { $consensus = "A"; } else { $consensus = "X"; } } elsif ( $scoreB =~ /^C$/i ) { if ( $scoreA =~ /^B$/i ) { $consensus = "B"; } else { $consensus = "X"; } } elsif ( $scoreB =~ /^D$/i ) { if ( $scoreA =~ /^A$/i ) { $consensus = "A"; } else { $consensus = "X"; } } else { $consensus = "X"; } } } } return $consensus; } ################################################################################################################### sub score_type { # identifies type of scores my $source = shift @_; # = name of source (eg, file) from which the list of scores are coming from my @scores = @_; # = list of scores of supposedly the same type my $score_type; # = type of scores for ( @scores ) { if ( /[01]/ ) { $score_type = 'numbers'; last; } elsif ( /[ABCDH]/i ) { $score_type = 'letters'; last; } elsif ( /[X\-]/i ) { $score_type = 'unknown'; } else { print "\n-----------------------------\n\n", "-> Program terminated: 'source' appears to contain non-valid scores\n", " (scores other then 0/1 or A/B/C/D/H). Press 'Enter' to exit..."; ; exit; } } return $score_type; } ################################################################################################################### sub print_con_file { # prints csv file with consensus table derived from merging individual data sets of unique and quality-filtered markers and samples my $consensus_file = shift @_; # = name of file into which consensus table shall be written my @file_list = @{ shift @_ }; # = list of individual files (data sets) from which the consensus data set was produced my $min_marker_callrate = shift @_; # = call rate threshold for markers my $min_sample_callrate = shift @_; # = call rate threshold for samples my %n_g_markers = %{ shift @_ }; # = array of numbers of good (= quality-filtered) markers in each of the data sets my %n_g_samples = %{ shift @_ }; # = array of numbers of good (= quality-filtered) samples in each of the data sets my %typed_in_table = %{ shift @_ }; # = hash of arrays: sample => list of files in which sample was typed my %scored_in_table = %{ shift @_ }; # = hash of arrays: marker => list of files in which marker was scored my %cons_sample_callrate = %{ shift @_ }; # = hash: sample => call rate (denominated by total number of samples in consensus table) my %cons_marker_callrate = %{ shift @_ }; # = hash: marker => call rate (denominated by total number of markers in consensus table) my %sample_consensus = %{ shift @_ }; # = hash: sample => consensus ("consistency") value: defined scores (0,1,A,B,C,D,H) as % of existing scores (= defined or discordant) (denominator excludes missing scores!) my %marker_consensus = %{ shift @_ }; # = hash: marker => consensus ("consistency") value: defined scores (0,1,A,B,C,D,H) as % of existing scores (= defined or discordant) (denominator excludes missing scores!) my %global_marker_features = %{ shift @_ }; # = hash of hashes of hashes: files (data sets) => markers => marker features => values of marker features in file my %cons_scores = %{ shift @_ }; # = hash of hashes: samples => markers => scores of consensus table my ( $i, # = looping variable $marker, # = currently processed marker $sample, # = currently processed sample $file, # = currently looped-through file name $found, # = + or - depending on wether a particular sample or marker was present in a particular data set (file) %n_features, # = hash file => number of columns with marker features in each of the files $tot_n_features, # = total number of marker features in all files $feature, # = name of currently processed marker feature @table ); # = array of arrays of rows of cells my $global_callrate = 0; # = overall call rate of consensus table for $sample ( keys %cons_sample_callrate ) { # computes the global call rate for the enire consensus score table $global_callrate = $global_callrate + $cons_sample_callrate{ $sample }; } $global_callrate = $global_callrate / scalar ( keys %cons_sample_callrate ); for $file ( @file_list ) { # computes the total number of columns derived from all individual files that contain marker features $n_features{ $file } = 0; for ( keys %{ $global_marker_features{ $file } } ) { for ( sort keys %{ ${ $global_marker_features{ $file } }{ $_ } } ) { $n_features{ $file } ++; $tot_n_features ++; } last; } } open ( CON, ">$consensus_file" ) or die "\n-> Program terminated: can't create '$consensus_file' to output\n the consensus score table: $!\n"; print CON "Consensus data set ("; print CON scalar keys %cons_scores, " sample"; print CON "s" if ( scalar keys %cons_scores > 1 ); print CON " x ", scalar keys %scored_in_table, " marker"; print CON "s" if ( scalar keys %scored_in_table > 1 ); print CON ") produced by merging the data sets from the following files:\n"; for ( 0 .. $#file_list ) { print CON " - '$file_list[ $_ ]' ($n_g_samples{ $file_list[ $_ ] } sample"; print CON "s" if ( $n_g_samples{ $file_list[ $_ ] } > 1 ); print CON " x $n_g_markers{ $file_list[ $_ ] } marker"; print CON "s" if ( $n_g_markers{ $file_list[ $_ ] } > 1 ); print CON ")\n"; } print CON "\n (The 'qua' files contain the individual data sets pre-filtered for redundancy and call rate: marker threshold = $min_marker_callrate % & sample threshold = $min_sample_callrate %.)\n\n"; printf CON " => The global call rate in the consensus table is %2.2f ", $global_callrate; print CON "%.\n\n"; print CON "The sample and marker consensus values are the % of successfully scored calls divided by the number of successful plus discordant calls.\n\n"; for $file ( @file_list ) { # prints information on which data sets contained each of the samples for ( 0 .. @file_list + $tot_n_features + 1 ) { print CON ","; } print CON "$file:"; $found = 'n'; for $sample ( sort keys %typed_in_table ) { for ( @{ $typed_in_table{ $sample } } ) { if ( $_ eq $file ) { $found = "y"; last; } } print CON ",$found"; $found = 'n'; } print CON "\n"; } for ( 0 .. @file_list + $tot_n_features + 1 ) { # prints row with sample consensus values print CON ","; } print CON "SampleConsensus(%) =>"; for $sample ( sort keys %typed_in_table ) { print CON ",$sample_consensus{ $sample }"; } print CON "\n"; for ( 0 .. @file_list + $tot_n_features + 1 ) { # prints row with sample call rate values print CON ","; } print CON "SampleCallRate(%) =>"; for $sample ( sort keys %typed_in_table ) { print CON ",$cons_sample_callrate{ $sample }"; } print CON "\n"; for ( 0 .. @file_list + 2 ) { # prints row with info from which files the individual marker features are derived print CON ","; } for $file ( @file_list ) { $found = ''; until ( $found ) { # searches for a marker scored in the currently processed file for $marker ( sort keys %scored_in_table ) { for ( @{ $scored_in_table{ $marker } } ) { if ( $_ eq $file ) { $found = $marker; } } } } print CON "|| '$file'=>," if ( scalar ( keys %{ ${ $global_marker_features{ $file } }{ $found } } ) > 0 ); # checks whether marker features exist in a particular file for ( keys %{ $global_marker_features{ $file } } ) { for ( 1 .. ( scalar keys %{ ${ $global_marker_features{ $file } }{ $_ } } ) - 1 ) { print CON "*,"; } last; } } print CON "|"; for ( 1 .. ( keys %typed_in_table ) - 1 ) { print CON ",|"; } print CON "\n"; for $file ( @file_list ) { # prints row with names of marker features and sample print CON "$file,"; } print CON "MarkerConsensus(%),MarkerCallRate(%),Marker"; for $file ( @file_list ) { for ( keys %{ $global_marker_features{ $file } } ) { for ( sort keys %{ ${ $global_marker_features{ $file } }{ $_ } } ) { print CON ",$_"; } last; } } for $sample ( sort keys %typed_in_table ) { print CON ",$sample"; } print CON "\n"; for $marker ( sort keys %scored_in_table ) { # prints rows with marker features and scores $found = 'n'; for $file ( @file_list ) { for ( @{ $scored_in_table{ $marker } } ) { if ( $_ eq $file ) { $found = 'y'; last; } } print CON "$found,"; $found = 'n'; } print CON "$marker_consensus{ $marker },$cons_marker_callrate{ $marker },$marker"; for $file ( @file_list ) { # prints values of marker features from individual files $found = 'n'; for ( @{ $scored_in_table{ $marker } } ) { if ( $_ eq $file ) { $found = 'y'; last; } } if ( $found eq 'y' ) { for $feature ( sort keys %{ ${ $global_marker_features{ $file } }{ $marker } } ) { print CON ",${ ${ $global_marker_features{ $file } }{ $marker } }{ $feature }"; } } else { for ( 1 .. $n_features{ $file } ) { print CON ",-"; } } } for $sample ( sort keys %typed_in_table ) { if ( exists ${ $cons_scores{ $sample } }{ $marker } ) { print CON ",${ $cons_scores{ $sample } }{ $marker }"; } else { print CON ",-"; } } print CON "\n"; } close "CON"; print "\n =====> The consensus data set with ", scalar keys %cons_scores, " sample"; print "s" if ( scalar keys %cons_scores > 1 ); print " and ", scalar keys %scored_in_table, " marker"; print "s" if ( scalar keys %scored_in_table > 1 ); print " was saved\n"; print " under '$consensus_file'.\n"; return; }