use strict; $" = ","; my ( $data_set, # name of file containing the original data set $map_order, # name of file containing markers in map order $uni_file, # name of file containing the redundancy-filtered data set $n_samples, # number of samples in data set $rec_threshold, # recombination fraction threshold @breakpoint, # array of marker positions immediately before breack points $n_u_samples, # number of unique samples in data set @sample_list, # list of unique samples %cons_red_samples, # hash: replicated samples => score consensus $marker_name, # name of column heading containing the marker names %marker_features, # hash: markers => marker features $n_markers, # number of markers $n_u_markers, # number of unique markers @marker_list, # list of unique markers %cons_red_markers, # hash: replicated markers => score consensus $score_type, # type of scores (0/1 or A/B/C/D/H) %score, # hash of hashes: samples => markers => scores @map_order, # array of markers arranged according to map order @group, # array of linkage group names assoicated with markers in the @map_order array @group_order, # array of markers belonging to a single linkage group arranged according to map order @group_mapdist, # cumulative Kosambi map distances for markers of currently processed linkage groups @cumul_mapdist, # cumulative Kosambi map distances of all markers, computed by simply adding distances of adjacent markers (simulates MapManager output) @opt_group_mapdist, # optimised cumulative Kosambi map distances for markers of currently processed linkage groups, computed using the least-square fitting procedure used by JoinMap 3.0 (simulates JoinMap output) @opt_cumul_mapdist, # optimised cumulative Kosambi map distances, computed using the least-square fitting procedure used by JoinMap 3.0 (simulates JoinMap output) @rec_fract, # array of arrays of rows of cells of a matrix containing paiwaise recombination fractions @mapdist_matrix, # array of arrays of rows of cells of a matrix containing paiwaise Kosambi map distances @lod_matrix, # array of arrays of rows of cells of a matrix containing paiwaise LOD scores (computed as in JoinMap 3.0) @opt_mapdist, # array of optimised map distances (ordered according to map order) $total_length, # sum of map lengths of all linkage groups $sum_neg_mapdist, # sum of negative map distances between adjecent markers $row, # currently processed row of a matrix $cell, # currently processed cell of a row of a matrix $i, # looping index $start_group, # number of marker in map order at which the currently processed linkage group starts $end_group, # number of marker in map order at which the currently processed linkage group ends $tmp, # temporary variable $pop_type, # population type (at this stage only DH or selfed RILs) $marker, $uni_file_new, @new_map, @to_distribute, $search, $previous_rank, $rank, $n_samples_new, $file, $n_u_samples_new, @sample_list_new, %cons_red_samples_new, $marker_name_new, %marker_features_new, $n_markers_new, $n_u_markers_new, @marker_list_new, @map_order_copy, @surrounding_map, %cons_red_markers_new, $score_type_new, %score_new, $new_markers, $single_co, $double_co, $no_co, $missing, $co_top, $co_bottom, $no_co_top, $no_co_bottom, $missing_top, $missing_bottom, %fit, $best, %best_locus, $number, $missmatch, $sample, $old, $new, $merged_upper, $merged_lower, $fw_score, $best_position, $double_co_top, $double_co_bottom, %signature, @best, %distributed, $best_fit, %locus_count, $n_locus, $n_marker, $increment, %rank, $n_intervals, $loop, %locus, @remaining_marker, @remaining_marker_copy, $from, $to,$fw, $index, $marker_type, ); print "\n\n\n\n\n"; print " \\\\\\\\\\\\ \\\\\\\\\\\\\n"; print " >>>>> DArTscript2 >>>>---- >>>>> DArTscript2 >>>>----\n"; print " ////// //////\n\n\n"; print " Copyright: Peter Wenzl 2005-06, DArT P/L (www.DiversityArrays.com)\n"; print "\n !!! WARNING !!!\n"; print " This is a purpose-built script written by someone\n"; print " without previous coding experience. There may now be a\n"; print " better version available. Please contact Peter Wenzl\n"; print " at peter\@DiversityArrays.com\n\n"; print " ------------------------------------------------------------------------\n"; print " [New loci are incorporated into a fixed framework map by minimizing the\n"; print " number of singletons. Three tab or comma-separated files are required:\n"; print " files 1 and 3 contain genotype data (A/B/C/D/H/-) of the framework map\n"; print " and the new loci, respectively. The files should have a single row with\n"; print " the column headings. They can have multiple columns on the left side, at\n"; print " least one of which has to contain the names of loci. File 2 should\n"; print " contain a single column (without column heading) with the full list\n"; print " of framework loci arranged in map order.]\n"; print " ------------------------------------------------------------------------\n"; print "\nFile with segregation data of framework map:\n"; # reads data set with scores of all markers across all samples do { print " >"; chomp ( $data_set = ); } until ( $data_set =~ /^.+\.?.*$/ ); print "File with framework map order:\n"; # reads map order do { print " >"; chomp ( $map_order = ); } until ( $map_order =~ /^.+\.?.*$/ ); print "File with new markers:\n"; # reads new markers do { print " >"; chomp ( $new_markers = ); } until ( $new_markers =~ /^.+\.?.*$/ ); @_ = &filter_redundant ( $data_set ); # opens and redundancy-filters the file with the FW map markers $uni_file = shift @_; $n_samples = shift @_; $n_u_samples = shift @_; @sample_list = sort @{ shift @_ }; %cons_red_samples = %{ shift @_ }; $marker_name = shift @_; %marker_features = %{ shift @_ }; $n_markers = shift @_; $n_u_markers = shift @_; @marker_list = sort @{ shift @_ }; %cons_red_markers = %{ shift @_ }; $score_type = shift @_; %score = %{ shift @_ }; @_ = &filter_redundant ( $new_markers ); # opens and redundancy-filters the file with the new markers $uni_file_new = shift @_; $n_samples_new = shift @_; $n_u_samples_new = shift @_; @sample_list_new = sort @{ shift @_ }; %cons_red_samples_new = %{ shift @_ }; $marker_name_new = shift @_; %marker_features_new = %{ shift @_ }; $n_markers_new = shift @_; $n_u_markers_new = shift @_; @marker_list_new = sort @{ shift @_ }; %cons_red_markers_new = %{ shift @_ }; $score_type_new = shift @_; %score_new = %{ shift @_ }; unless ( $score_type eq 'letters' ) { # confirms that data set contains A/B/C/D/H type of scores print "\n-----------------------------\n", "\n=> Program terminated. The data sets does not contain A/B/C/D/H scores. Press\n", " 'Enter' to exit..."; < STDIN >; exit; } $missmatch = 0; # checks if framework map and new markers were scored for the same set of lines for $number ( 0 .. $#sample_list ) { $missmatch ++ if ( $sample_list[ $number ] ne $sample_list_new[ $number ] ); } unless ( $score_type eq $score_type_new && $#sample_list == $#sample_list_new && $missmatch == 0 ) { print "\n-----------------------------\n", "\n=> Program terminated. The framework map and the new set of markers are incompatible! Press\n", " 'Enter' to exit..."; < STDIN >; exit; } open ( ORDER, "$map_order" ) or die "\n-> Program terminated: can't open $map_order: $!\n\n"; # opens and reads file containing the map order of markers chomp ( @map_order = ); close ORDER or warn "\nCan't close $map_order: $!\n\n"; unshift ( @map_order, "top telomer" ); # inserts telomers into array containing the framework markers in the correct order push ( @map_order, "bottom telomer" ); for $sample ( @sample_list ) { $score{ $sample }{ $map_order[ 0 ] } = "-"; $score{ $sample }{ $map_order[ -1 ] } = "-"; } @new_map = @map_order; # creates a copy of the framwork map into which new markers are going to be inserted for $old ( 0 .. $#map_order ) { # creates (i) a hash with rank numbers of markers of the framework map and (ii) a hash with the framework markers for each of the rank numbers $rank{ $map_order[ $old ] } = $old; $locus{ $old } = $map_order[ $old ]; } for $new ( 0 .. $#marker_list_new ) { # creates a hash with the score signature of the currently processed new marker for $sample ( @sample_list ) { ${ $signature{ $marker_list_new[ $new ] } }{ $sample } = ${ $score_new{ $sample } }{ $marker_list_new[ $new ] }; } } # integrates new markers into fixed framework map by minimising the number of double CO events print "\n### Integrating $n_markers_new markers into linkage group with " . scalar @map_order . " markers...\n"; @remaining_marker = @remaining_marker_copy = @marker_list_new; $loop = 0; until ( @remaining_marker == 0 ) { # finds the best positions in framework map intervals for each of the new markers $loop ++; %locus_count = ( ); print "\nRound No. $loop (" . scalar @remaining_marker . " marker"; print "s" if ( @remaining_marker > 1 ); print " left):\n"; for $new ( 0 .. $#remaining_marker ) { # evaluates goodness-of-fit for currenlty processed new marker at all framework map position if ( $loop == 1 ) { for ( @map_order ) { if ( $_ eq $remaining_marker[ $new ] ) { print "At least one of the markers to be distributed into the framework map (\"$_\") is already present in the framework map! Program terminated. Press 'Enter' to exit..."; ; exit; } } @{ $distributed{ $remaining_marker[ $new ] } } = &distribute_marker ( $remaining_marker[ $new ], \@sample_list, \%{ $signature{ $remaining_marker[ $new ] } }, \@map_order, 0, $#map_order, \%score ); # distributes a new marker into a rigid (dense) framework map by minimising the number of double CO (and if equal, the number of single CO events) unless ( ${ $distributed{ $remaining_marker[ $new ] } }[ 0 ] eq "not incorporated" ) { push @{ $locus_count{ ${ $distributed{ $remaining_marker[ $new ] } }[ 0 ] } }, $remaining_marker[ $new ]; # registers markers that have been placed at the same position } else { $rank{ $remaining_marker[ $new ] } = "not incorporated"; splice ( @remaining_marker_copy, $new, 1 ); } } else { # evaluates, for the currently processed new marker, the best goodness of fit at all positions within the previously identified best framework map interval (into which other new markers have already been inserted) $from = 0; $from ++ until ( $new_map[ $from ] eq ${ $distributed{ $remaining_marker[ $new ] } }[ 0 ] ); $to = $from + 1; until ( $new_map[ $to ] eq $locus{ int ( $rank{ $new_map[ $from ] } ) + 1 } ) { $to ++; } @{ $distributed{ $remaining_marker[ $new ] } } = &distribute_marker ( $remaining_marker[ $new ], \@sample_list, \%{ $signature{ $remaining_marker[ $new ] } }, \@new_map, $from, $to, \%score ); push @{ $locus_count{ ${ $distributed{ $remaining_marker[ $new ] } }[ 0 ] } }, $remaining_marker[ $new ]; } } if ( $loop == 1 ) { @remaining_marker = @remaining_marker_copy if ( @remaining_marker_copy < @remaining_marker ); @remaining_marker_copy = ( ); } #for ( @new_map ) { # if ( $locus_count{ $_ } && @{ $locus_count{ $_ } } > 1 ) { # print " Markers distributed to the same intervals:\n"; # last; # } #} @to_distribute = ( ); for $fw ( 0 .. $#new_map - 1 ) { # looks at all framework loci and the two telomers if ( $locus_count{ $new_map[ $fw ] } ) { if ( @{ $locus_count{ $new_map[ $fw ] } } > 1 ) { #print " -" . scalar @{ $locus_count{ $new_map[ $fw ] } } . " markers between $new_map[ $fw ] and $new_map[ $fw + 1 ]\n"; %fit = ( ); for ( @{ $locus_count{ $new_map[ $fw ] } } ) { @{ $fit{ $_ } } = @{ $distributed{ $_ } }[ 1 .. 4 ]; } $best_fit = &best_fitting_marker ( \@{ $locus_count{ $new_map[ $fw ] } }, $#sample_list, \%fit ); } else { $best_fit = ${ $locus_count{ $new_map[ $fw ] } }[ 0 ]; } $rank{ $best_fit } = ( $rank{ $new_map[ $fw ] } + $rank{ $new_map[ $fw + 1 ] } ) / 2; # creates a rank number of a newly integrated marker (= aretmethic mean between the rank numbers of the two adjacent markers) $locus{ $rank{ $best_fit } } = $best_fit; for ( 0 .. $#remaining_marker ) { if ( $best_fit eq $remaining_marker[ $_ ] ) { for $sample ( @sample_list ) { ${ $score{ $sample } }{ $remaining_marker[ $_] } = ${ $score_new{ $sample } }{ $remaining_marker[ $_] }; } push @to_distribute, splice ( @remaining_marker, $_, 1 ); # transfers new marker from list of new markers to the best position within the framework map last; } } } } print "\n Distributed:\n"; for $rank ( sort { $b <=> $a } @rank{ @to_distribute } ) { $search = $#new_map; $search -- until ( $rank{ $new_map[ $search ] } < $rank ); splice ( @new_map, $search + 1, 0, $locus{ $rank } ); print " $new_map[ $search + 1 ]: between $new_map[ $search ] and $new_map[ $search + 2 ]\n"; } } # re-calculates number of DCO after having distributed all new markers into the framework map print "\n### Recalculating number of double CO...\n"; for $fw ( 1 .. $#new_map - 1 ) { $marker = $new_map[ $fw ]; @surrounding_map = @new_map; splice ( @surrounding_map, $fw, 1 ); unless ( $signature{ $marker } ) { for $sample ( @sample_list ) { ${ $signature{ $marker } }{ $sample } = ${ $score{ $sample } }{ $marker }; } } @_ = &distribute_marker ( $marker, \@sample_list, \%{ $signature{ $marker } }, \@surrounding_map, $fw - 1, $fw, \%score ); if ( $distributed{ $marker } ) { splice ( @{ $distributed{ $marker } }, 4, 0, $_[ 3 ] ); } else { @{ $distributed{ $marker } } = @_; splice ( @{ $distributed{ $marker } }, 3, 0, "-" ); } } shift @new_map; # removes telomer entries from list of markers pop @new_map; # prints results into a csv file $new_markers =~ /^([^\.]+)\..*$/; $file = $1 . "_distributed.csv"; open ( DISTRIBUTED, ">$file" ) or die "\n-> Program terminated: can't open \"$file\": $!\n\n"; print DISTRIBUTED "Type,Locus,Rank,noCO,singleCO,doubleCOintegr,doubleCOfinal,Missing\n"; for $marker ( @new_map ) { $marker_type = "fw"; for $new ( @marker_list_new ) { if ( $new eq $marker ) { $marker_type = "new"; last; } } print DISTRIBUTED"$marker_type,$marker,$rank{ $marker},@{ $distributed{ $marker } }[ 1 .. 5 ]\n"; } close ( DISTRIBUTED ) or die "\n-> Program terminated: can't close \"$file\": $!\n\n"; print "\nSaved positions of distributed markers in \"$file\"\n"; print "\nFinished! Press \"Enter\" to exit...\n"; ; exit; sub distribute_marker { my $marker = shift; my @sample_list = @{ shift @_ }; my %signature = %{ shift @_ }; my @framework = @{ shift @_ }; my $from = shift; my $to = shift; my %score = %{ shift @_ }; my ( $fw, $single_co, $double_co, $no_co, $missing, $sample, $position, $fw_score, $merged_above, $merged_below, %fit, $double_co_top, $double_co_bottom, $co_top, $co_bottom, $no_co_top, $no_co_bottom, $missing_top, $missing_bottom, $best_position, $best_locus, @distributed, ); # tests all position in the framework map for $fw ( $from .. $to - 1 ) { # places each new marker into all intervals between two adjacent markers of the framework map #print "$marker between $framework[ $fw ] and $framework[ $fw + 1 ]:\n"; $single_co = $double_co = $no_co = $missing = 0; for $sample ( @sample_list ) { if ( $signature{ $sample } =~ /^[ABCDH]$/i ) { # only intends to compare score against scores of adjacent markers if genotype call is defined $position = $fw; # compares with score of next score above $merged_above = ""; $fw_score = ${ $score{ $sample } }{ $framework[ $position ] }; unless ( $fw_score =~ /^[ABCDH]$/i ) { until ( $fw_score =~ /^[ABCDH]$/i ) { $position --; if ( $position < 0 ) { $merged_above = "distal"; last; } $fw_score = ${ $score{ $sample } }{ $framework[ $position ] }; } } #print "$sample: $fw_score$signature{ $sample }"; $merged_above = &merge_scores ( "letters", $fw_score, $signature{ $sample } ) unless ( $merged_above && $merged_above eq "distal" ); # compares against upper marker $position = $fw + 1; # compares with score of next score below $merged_below = ""; $fw_score = ${ $score{ $sample } }{ $framework[ $position ] }; unless ( $fw_score =~ /^[ABCDH]$/i ) { until ( $fw_score =~ /^[ABCDH]$/i ) { $position ++; if ( $position > $#framework ) { $merged_below = "distal"; last; } $fw_score = ${ $score{ $sample } }{ $framework[ $position ] }; } } #print "$fw_score: "; $merged_below = &merge_scores ( "letters", $fw_score, $signature{ $sample } ) unless ( $merged_below && $merged_below eq "distal" ); # compares against upper marker unless ( $merged_above eq "distal" || $merged_below eq "distal" ) { # detects DCO events for internal markers if ( $merged_above =~ /^X$/i && $merged_below =~ /^X$/i ) { # double CO event $double_co ++; #print "doubleCO ***\n"; } elsif ( ( $merged_above =~ /^X$/i && $merged_below =~ /^[ABCDH]$/i ) || ( $merged_above =~ /^[ABCDH]$/i && $merged_below =~ /^X$/i ) ) { # single CO event $single_co ++; #print "singleCO\n"; } elsif ( $merged_above =~ /^[ABCDH]$/i && $merged_below =~ /^[ABCDH]$/i ) { # no CO event $no_co ++; #print "noCO\n"; } else { print "Inconsistency in score comparison. Program terminated. Press 'Enter' to exit..."; ; exit; } } else { # detects DCO events for marker close to the ends of the linkage group if ( $merged_above eq "distal" && $merged_below eq "distal" ) { $missing ++; #print "missing\n"; } elsif ( $merged_above =~ /^X$/i || $merged_below =~ /^X$/i ) { $single_co ++; $double_co = $double_co + 0.999; #print "CO at telomer +++\n"; } elsif ( $merged_above =~ /^[ABCDH]$/i || $merged_below =~ /^[ABCDH]$/i ) { $no_co ++; #print "noCO\n"; } else { print "Inconsistency in score comparison. Program terminated. Press 'Enter' to exit..."; ; exit; } } } else { $missing ++; #print "marker not scored for this sample\n"; } } $fit{ $framework[ $fw ] } = [ $no_co, $single_co, $double_co, $missing ]; # registers the number of no-crossover, single-crossover, double-crossover events for each position in the FW map #print "fitting $marker between $framework[ $fw ] and $framework[ $fw + 1 ]: $no_co fitting, $single_co singleCO, $double_co doubleCO, $missing missing\n"; } # finds the best position $best_locus = &best_fitting_marker ( [ @framework[ $from .. $to - 1 ] ], $#sample_list, \%fit ); # tries to find a position at which the new marker matches better than in the intially assigned position unless ( $best_locus eq "end" ) { @distributed = ( $best_locus, @{ $fit{ $best_locus } } ); # registers the array reference of the best locus for integration plus the number of no-CO, single-CO, double-CO and missing data events at the best locus print "- $marker below $best_locus: ${ $fit{ $best_locus } }[ 2 ] doubleCO, ${ $fit{ $best_locus } }[ 1 ] singleCO, ${ $fit{ $best_locus } }[ 0 ] fitting (${ $fit{ $best_locus } }[ 3 ] missing)\n"; } else { @distributed = ( "not incorporated", "-", "-", "-", "-" ); print "- $marker: not incorporated: no overlap with framework markers!\n"; } return ( @distributed ); } sub best_fitting_marker { my @marker = @{ shift @_ }; my $n_samples = shift; my %fit = %{ shift @_ }; my ( $best, $position, ); $position = 0; $position ++ until ( $position > $#marker || ${ $fit{ $marker[ $position ] } }[ 3 ] < $n_samples ); $best = $marker[ $position ]; if ( $position < $#marker ) { for ( @marker[ ( $position + 1 ) .. $#marker ] ) { if ( ${ $fit{ $_ } }[ 2 ] < ${ $fit{ $best } }[ 2 ] ) { $best = $_; } elsif ( ${ $fit{ $_ } }[ 2 ] == ${ $fit{ $best } }[ 2 ] ) { if ( ${ $fit{ $_ } }[ 1 ] < ${ $fit{ $best } }[ 1 ] ) { $best = $_; } elsif ( ${ $fit{ $_ } }[ 1 ] == ${ $fit{ $best } }[ 1 ] ) { if ( ${ $fit{ $_ } }[ 0 ] > ${ $fit{ $best } }[ 0 ] ) { $best = $_; } } } } return ( $best ); } elsif ( $position == $#marker ) { return ( $best ); } else { return ( "end" ); } } ################################################################################################################### sub merge_tables { # merges two tables (data sets) into a single strict-consensus data set my $score_type = shift @_; # = type of scores (0/1) or (A/B/C/D/H) my @previous_files = @{ shift @_ }; # = list of previous files contained in current consensus table my $new_file = shift @_; # = name of new file to be merged with current consensus data set my %cons_scores = %{ shift @_ }; # = hash of hashes: samples => markers => scores (in the current consensus table) my %scores = %{ shift @_ }; # = hash of hashes: samples => markers => scores (in the next table to be merged with the current consensus table) my ( $marker, # = currently processed marker $sample, # = currently processed sample $overlap ); # = number of genotype calls (scores) overlapping between the two data sets my $discordant = 0; # = number of discordant genotype call overlaps for $sample ( keys %scores ) { # adds scores for sample x marker pairs from the new table to the consensus table in case there're no values for these sample x marker pairs in the consensus table for $marker ( keys %{ $scores{ $sample } } ) { if ( exists ${ $cons_scores{ $sample } }{ $marker } ) { $overlap ++; @_ = &merge_scores ( $score_type, ${ $cons_scores{ $sample } }{ $marker }, ${ $scores{ $sample } }{ $marker } ); if ( $_[ 0 ] =~ /^X$/i ) { ${ $cons_scores{ $sample } }{ $marker } = 'discordant'; $discordant ++; } else { ${ $cons_scores{ $sample } }{ $marker } = $_[ 0 ]; } } else { ${ $cons_scores{ $sample } }{ $marker } = ${ $scores{ $sample } }{ $marker }; } } } return \%cons_scores, $overlap, $discordant; } ################################################################################################################### sub merge_score_lists { # merges scores from two 1-dimensional lists (eg, one marker scored for many samples) my $score_type = shift @_; # type of scores my @lists = @_[ 0 .. 1 ]; # array of names of the two lists of scores my @scores = @_[ 2 .. 3 ]; # array of the two arrays of scores my $concord = 0; # number of concordant genotype calls my $discord = 0; # number of discordant genotype calls my $single = 0; # number of genotype calls that are unknown in one table but defined in the other my $missing = 0; # number of genotype calls that are unknown in both tables my $j = 0; # looping index my ( $i, # looping index @con_scores # list of consensus scores (merged scores) ); # checks whether the lengths of the two score lists are identical unless ( scalar @{ $scores[ 0 ] } == scalar @{ $scores[ 1 ] } ) { print "\n-----------------------------\n\n", "-> Program terminated: there was an unequal number of scores in two lists\n", " of scores to be merged. Press 'Enter' to exit...\n"; < STDIN >; exit; } # merges the two lists of scores, counts number of concordant, discordant, single and missing scores and # converts "discordant" label of score pairs highlighted to be discordant in a previous round of merging into "X" for $i ( 0 .. $#{ $scores[ 0 ] } ) { push @con_scores, &merge_scores ( $score_type, ${ $scores[ 0 ] }[ $i ], ${ $scores[ 1 ] }[ $i ] ); if ( $con_scores[ $i ] =~ /X/i ) { $discord ++; s/discordant/X/; } elsif ( $con_scores[ $i ] eq '-' ) { $missing ++; } elsif ( $con_scores[ $i ] =~ /[01ABCDH]/i ) { unless ( ${ $scores[ 0 ] }[ $i ] eq '-' || ${ $scores[ 1 ] }[ $i ] eq '-' ) { $concord ++; } else { $single++; } } } return ( $concord, $discord, $single, $missing, \@con_scores ); } ################################################################################################################### sub merge_scores { # merges a pair of scores into a consensus score my $score_type = shift @_; # = type of scores (0/1) or (A/B/C/D/H) my $scoreA = shift @_; # = replicate score no. 1 my $scoreB = shift @_; # = replicate score no. 2 my $consensus; # = consensus score if ( !$score_type || ( $score_type ne 'numbers' && $score_type ne 'letters' ) ) { # tries to identify the type of scores in cases it is not known yet $score_type = &score_type ( 'A pair of scores', $scoreA, $scoreB ); } if ( $score_type eq 'numbers' ) { # checks scoring consistency for a pair of lists with 0/1 scores if ( !( $scoreA =~ /[01X\-]/i ) && !( $scoreB =~ /[01X\-]/i ) ) { print "\n-----------------------------\n\n", "-> Program terminated: one of your data sets appears to contain non-valid\n", " scores (instead of the 0/1 scores that were expected). Press 'Enter' to exit..."; ; exit; } elsif ( ( $scoreA eq "discordant" ) || ( $scoreB eq "discordant" ) ) { $consensus = "X"; } else { if ( $scoreA =~ /^[X\-]$/i && $scoreB =~ /^[X\-]$/i ) { $consensus = "-"; } else { if ( $scoreA eq $scoreB ) { $consensus = uc ( $scoreA ); } elsif ( $scoreA =~ /^[X\-]$/i ) { $consensus = uc ( $scoreB ); } elsif ( $scoreB =~ /^[X\-]$/i ) { $consensus = uc ( $scoreA ); } else { $consensus = "X"; } } } } elsif ( $score_type eq 'letters' ) { # checks scoring consistency for table pairs with A/B/C/D/H type of scores if ( !( $scoreA =~ /[ABCDHX\-]/i ) && !( $scoreB =~ /[ABCDHX\-]/i ) ) { print "\n-----------------------------\n\n", "-> Program terminated: one of your data sets appears to contain non-valid\n", " scores (instead of the A/B/C/D/H scores that were expected).\n", " Press 'Enter' to exit..."; ; exit; } elsif ( ( $scoreA eq "discordant" ) || ( $scoreA eq "discordant" ) ) { $consensus = "X"; } else { if ( $scoreA =~ /^[X\-]$/i && $scoreB =~ /^[X\-]$/i ) { $consensus = "-"; } else { if ( uc ( $scoreA ) eq uc ( $scoreB ) ) { $consensus = uc ( $scoreA ); } elsif ( $scoreA =~ /^[\-X]$/i ) { $consensus = uc ( $scoreB ); } elsif ( $scoreB =~ /^[X\-]$/i ) { $consensus = uc( $scoreA ); } elsif ( ( $scoreA =~ /^H$/i ) || ( $scoreB =~ /^H$/i ) ) { if ( ( $scoreA =~ /^[AB]$/i) || ( $scoreB =~ /^[AB]$/i ) ) { $consensus = "X"; } else { $consensus = "H"; } } elsif ( $scoreA =~ /^C$/i ) { if ( $scoreB =~ /^B$/i ) { $consensus = "B"; } else { $consensus = "X"; } } elsif ( $scoreA =~ /^D$/i ) { if ( $scoreB =~ /^A$/i ) { $consensus = "A"; } else { $consensus = "X"; } } elsif ( $scoreB =~ /^C$/i ) { if ( $scoreA =~ /^B$/i ) { $consensus = "B"; } else { $consensus = "X"; } } elsif ( $scoreB =~ /^D$/i ) { if ( $scoreA =~ /^A$/i ) { $consensus = "A"; } else { $consensus = "X"; } } else { $consensus = "X"; } } } } return $consensus; } ################################################################################################################### sub score_type { # identifies type of scores my $source = shift @_; # = name of source (eg, file) from which the list of scores are coming from my @scores = @_; # = list of scores of supposedly the same type my $score_type; # = type of scores for ( @scores ) { if ( /[01]/ ) { $score_type = 'numbers'; last; } elsif ( /[ABCDH]/i ) { $score_type = 'letters'; last; } elsif ( /[X\-]/i ) { $score_type = 'unknown'; } else { print "\n-----------------------------\n\n", "-> Program terminated: 'source' appears to contain non-valid scores\n", " (scores other then 0/1 or A/B/C/D/H). Press 'Enter' to exit..."; ; exit; } } return $score_type; } ################################################################################################################### sub print_con_file { # prints csv file with consensus table derived from merging individual data sets of unique and quality-filtered markers and samples my $consensus_file = shift @_; # = name of file into which consensus table shall be written my @file_list = @{ shift @_ }; # = list of individual files (data sets) from which the consensus data set was produced my $min_marker_callrate = shift @_; # = call rate threshold for markers my $min_sample_callrate = shift @_; # = call rate threshold for samples my %n_g_markers = %{ shift @_ }; # = array of numbers of good (= quality-filtered) markers in each of the data sets my %n_g_samples = %{ shift @_ }; # = array of numbers of good (= quality-filtered) samples in each of the data sets my %typed_in_table = %{ shift @_ }; # = hash of arrays: sample => list of files in which sample was typed my %scored_in_table = %{ shift @_ }; # = hash of arrays: marker => list of files in which marker was scored my %cons_sample_callrate = %{ shift @_ }; # = hash: sample => call rate (denominated by total number of samples in consensus table) my %cons_marker_callrate = %{ shift @_ }; # = hash: marker => call rate (denominated by total number of markers in consensus table) my %sample_consensus = %{ shift @_ }; # = hash: sample => consensus ("consistency") value: defined scores (0,1,A,B,C,D,H) as % of existing scores (= defined or discordant) (denominator excludes missing scores!) my %marker_consensus = %{ shift @_ }; # = hash: marker => consensus ("consistency") value: defined scores (0,1,A,B,C,D,H) as % of existing scores (= defined or discordant) (denominator excludes missing scores!) my %global_marker_features = %{ shift @_ }; # = hash of hashes of hashes: files (data sets) => markers => marker features => values of marker features in file my %cons_scores = %{ shift @_ }; # = hash of hashes: samples => markers => scores of consensus table my ( $i, # = looping variable $marker, # = currently processed marker $sample, # = currently processed sample $file, # = currently looped-through file name $found, # = + or - depending on wether a particular sample or marker was present in a particular data set (file) %n_features, # = hash file => number of columns with marker features in each of the files $tot_n_features, # = total number of marker features in all files $feature, # = name of currently processed marker feature @table ); # = array of arrays of rows of cells my $global_callrate = 0; # = overall call rate of consensus table for $sample ( keys %cons_sample_callrate ) { # computes the global call rate for the enire consensus score table $global_callrate = $global_callrate + $cons_sample_callrate{ $sample }; } $global_callrate = $global_callrate / scalar ( keys %cons_sample_callrate ); for $file ( @file_list ) { # computes the total number of columns derived from all individual files that contain marker features $n_features{ $file } = 0; for ( keys %{ $global_marker_features{ $file } } ) { for ( sort keys %{ ${ $global_marker_features{ $file } }{ $_ } } ) { $n_features{ $file } ++; $tot_n_features ++; } last; } } open ( CON, ">$consensus_file" ) or die "\n-> Program terminated: can't create '$consensus_file' to output\n the consensus score table: $!\n"; print CON "Consensus data set ("; print CON scalar keys %cons_scores, " sample"; print CON "s" if ( scalar keys %cons_scores > 1 ); print CON " x ", scalar keys %scored_in_table, " marker"; print CON "s" if ( scalar keys %scored_in_table > 1 ); print CON ") produced by merging the data sets from the following files:\n"; for ( 0 .. $#file_list ) { print CON " - '$file_list[ $_ ]' ($n_g_samples{ $file_list[ $_ ] } sample"; print CON "s" if ( $n_g_samples{ $file_list[ $_ ] } > 1 ); print CON " x $n_g_markers{ $file_list[ $_ ] } marker"; print CON "s" if ( $n_g_markers{ $file_list[ $_ ] } > 1 ); print CON ")\n"; } print CON "\n (The 'qua' files contain the individual data sets pre-filtered for redundancy and call rate: marker threshold = $min_marker_callrate % & sample threshold = $min_sample_callrate %.)\n\n"; printf CON " => The global call rate in the consensus table is %2.2f ", $global_callrate; print CON "%.\n\n"; print CON "The sample and marker consensus values are the % of successfully scored calls divided by the number of successful plus discordant calls.\n\n"; for $file ( @file_list ) { # prints information on which data sets contained each of the samples for ( 0 .. @file_list + $tot_n_features + 1 ) { print CON ","; } print CON "$file:"; $found = 'n'; for $sample ( sort keys %typed_in_table ) { for ( @{ $typed_in_table{ $sample } } ) { if ( $_ eq $file ) { $found = "y"; last; } } print CON ",$found"; $found = 'n'; } print CON "\n"; } for ( 0 .. @file_list + $tot_n_features + 1 ) { # prints row with sample consensus values print CON ","; } print CON "SampleConsensus(%) =>"; for $sample ( sort keys %typed_in_table ) { print CON ",$sample_consensus{ $sample }"; } print CON "\n"; for ( 0 .. @file_list + $tot_n_features + 1 ) { # prints row with sample call rate values print CON ","; } print CON "SampleCallRate(%) =>"; for $sample ( sort keys %typed_in_table ) { print CON ",$cons_sample_callrate{ $sample }"; } print CON "\n"; for ( 0 .. @file_list + 2 ) { # prints row with info from which files the individual marker features are derived print CON ","; } for $file ( @file_list ) { $found = ''; until ( $found ) { # searches for a marker scored in the currently processed file for $marker ( sort keys %scored_in_table ) { for ( @{ $scored_in_table{ $marker } } ) { if ( $_ eq $file ) { $found = $marker; } } } } print CON "|| '$file'=>," if ( scalar ( keys %{ ${ $global_marker_features{ $file } }{ $found } } ) > 0 ); # checks whether marker features exist in a particular file for ( keys %{ $global_marker_features{ $file } } ) { for ( 1 .. ( scalar keys %{ ${ $global_marker_features{ $file } }{ $_ } } ) - 1 ) { print CON "*,"; } last; } } print CON "|"; for ( 1 .. ( keys %typed_in_table ) - 1 ) { print CON ",|"; } print CON "\n"; for $file ( @file_list ) { # prints row with names of marker features and sample print CON "$file,"; } print CON "MarkerConsensus(%),MarkerCallRate(%),Marker"; for $file ( @file_list ) { for ( keys %{ $global_marker_features{ $file } } ) { for ( sort keys %{ ${ $global_marker_features{ $file } }{ $_ } } ) { print CON ",$_"; } last; } } for $sample ( sort keys %typed_in_table ) { print CON ",$sample"; } print CON "\n"; for $marker ( sort keys %scored_in_table ) { # prints rows with marker features and scores $found = 'n'; for $file ( @file_list ) { for ( @{ $scored_in_table{ $marker } } ) { if ( $_ eq $file ) { $found = 'y'; last; } } print CON "$found,"; $found = 'n'; } print CON "$marker_consensus{ $marker },$cons_marker_callrate{ $marker },$marker"; for $file ( @file_list ) { # prints values of marker features from individual files $found = 'n'; for ( @{ $scored_in_table{ $marker } } ) { if ( $_ eq $file ) { $found = 'y'; last; } } if ( $found eq 'y' ) { for $feature ( sort keys %{ ${ $global_marker_features{ $file } }{ $marker } } ) { print CON ",${ ${ $global_marker_features{ $file } }{ $marker } }{ $feature }"; } } else { for ( 1 .. $n_features{ $file } ) { print CON ",-"; } } } for $sample ( sort keys %typed_in_table ) { if ( exists ${ $cons_scores{ $sample } }{ $marker } ) { print CON ",${ $cons_scores{ $sample } }{ $marker }"; } else { print CON ",-"; } } print CON "\n"; } close "CON"; print "\n =====> The consensus data set with ", scalar keys %cons_scores, " sample"; print "s" if ( scalar keys %cons_scores > 1 ); print " and ", scalar keys %scored_in_table, " marker"; print "s" if ( scalar keys %scored_in_table > 1 ); print " was saved\n"; print " under '$consensus_file'.\n"; return; } ################################################################################################################### sub filter_redundant { # reads, quality-controls and finds scores in a file containing marker features and scores my $file = shift @_; # = name of file to read my ( $new_file, # = name of new files into which non-redundant samples and marker entries are written $format, # = format of the file (comma or tab) $score_type, # = type of scores (0/1) vs. (A/B/C/D/H) @titles, # = array of column titles $row, # = currently processed row @row_cells, # = array of arrays containing cells of columns $sample, # = name of currently processed sample $marker, # = name of currently processed marker $n_markers, # = number of markers (rows) $nu_markers, # = number of unique markers (rows) $n_cols, # = number of columns $markercol, # = number of column containing marker names $marker_name, # = title of column containing marker names $scol1, # = number of first column containing scores $n_samples, # = number of samples (score columns) $nu_samples, # = number of unique samples (score columns) $i, # = looping index $j, # = looping index %marker_scores, # = hash of scores of all markers of the currently processed sample %scores, # = hash of hashs of scores of all markers of all samples %feature_values, # = hash of data for all marker features of the currently processed marker %marker_features, # = hash of hashes of data of marker features of all markers %cons_red_markers, # = hash of scroe consensus values (in %) of redundant markers %cons_red_samples, # = hash of scroe consensus values (in %) of redundant samples @markers, # = array of marker names in alphabetical order @samples ); # = array of sample names in alphabetical order @_ = &read_file ( $file ); # reads a file and outputs: file format / number of columns / number of rows / array of column titles / array of rows $format = shift @_; $n_cols = shift @_; $n_markers = shift @_; @titles = @{ shift @_ }; @row_cells = @{ shift @_ }; for ( @row_cells ) { unless ( $n_cols == @{ $_ } ) { # checks whether all rows have the same number of cells print "\n-----------------------------\n\n", "-> Program terminated: the table in '$file' has at least\n", " one row that has a different number of cells than the first row with the\n", " column titles. Go to Excel and revise the file as follows:\n\n", " 1. Check whether the row with the column titles is complete.\n\n", " 2. Check whether there's no empty row separating the data into two blocks.\n\n", " 3. Check whether all columns with scoring data are filled with scores.\n\n", " 4. Select all empty columns to the right of the table -> right-click\n", " and select 'Delete'.\n\n", " 5. Select all empty rows below the table -> right-click and select 'Delete'.\n\n", " 6. Save file in the csv or txt format and run DArTAmanager again.\n\n\n", " Press 'Enter' to exit...\n"; ; exit; } } $scol1 = &find_scores ( $file, \@titles, \@row_cells ); # finds first column (sample) with scores and counts number of samples $n_samples = $n_cols - $scol1 + 1; for $i ( 0 .. $#row_cells ) { # identifies type of scores for $j ( $scol1 - 1 .. $#titles ) { $_ = &score_type ( $file, ${$row_cells[ $i ] }[ $j ] ); ${$row_cells[ $i ] }[ $j ] = uc ( ${$row_cells[ $i ] }[ $j ] ); # converts scores into upper case ${$row_cells[ $i ] }[ $j ] =~ s/[Xx]/\-/g; # subsitutes unnkown scores denoted by "X" with "-" if ( $_ eq 'numbers' || $_ eq 'letters' ) { if ( $score_type ) { unless ( $_ eq $score_type ) { print "\n-----------------------------\n\n", "-> Program terminated: '$file' contains both '0/1' and\n", " 'A/B/C/D/H' scores. Please check this file and try again!\n", " Press 'Enter' to exit..."; ; exit; } } else { $score_type = $_; } } } } if ( $score_type ne 'numbers' && $score_type ne 'letters' ) { print "\n > Couldn't determine the type of scores in '$file' Type '1' if the\n", " data set contains 0/1 scores or 'A' if it contains A/B/C/D/H scores: "; do { chomp ( $score_type = ); $score_type = 'numbers' if ( $score_type =~ /\b1\b/ ); $score_type = 'letters' if ( $score_type =~ /\bA\b/i ); } until ( $score_type && ( $score_type eq 'numbers' || $score_type eq 'letters') ) } $markercol = &find_marker_names ( $file, $scol1, \@titles, \@row_cells ); # identifies the column that contains marker names $marker_name = $titles[ $markercol - 1 ]; @_ = &remove_red_markers ( $file, $score_type, $markercol, $scol1, \@titles, \@row_cells ); # removes redundant marker entries (retaining consensus scores for each group of redundant markers) @row_cells = @{ shift @_ }; %cons_red_markers = %{ shift @_ }; $nu_markers = @row_cells; @_ = &remove_red_samples ( $file, $score_type, $scol1, \@titles, \@row_cells ); # removes replicate sample entries (retaining consensus scores for each group of replicate samples) $nu_samples = shift @_; @titles = @{ shift @_ }; @row_cells = @{ shift @_ }; %cons_red_samples = %{ shift @_ }; for $j ( $scol1 - 1 .. $#titles ) { # creates a hash of hashes containing the scores of all markers of all samples for $i ( 0 .. $#row_cells ) { $marker_scores{ ${ $row_cells[ $i ] }[ $markercol - 1 ] } = ${ $row_cells[ $i ] }[ $j ]; } $scores{ $titles[ $j ] } = { %marker_scores }; } @samples = sort keys %scores; for $i ( 0 .. $#row_cells ) { # creates a hash of hashes containing the features of each feature type of all markers %feature_values = ( ); for $j ( 0 .. $scol1 - 2 ) { unless ( $j == $markercol - 1 ) { $feature_values{ $titles[ $j ] } = ${ $row_cells[ $i ] }[ $j ]; } } $marker_features{ @{ $row_cells[ $i ] }[ $markercol - 1 ] } = { %feature_values }; } @markers = sort keys %marker_features; unless ( ( $file =~ /.+\.uni$/i ) && ( $n_markers == $nu_markers ) && ( $n_samples == $nu_samples ) ) { $file =~ /^(.+)\..*$/; # creates a file to ouput non-redundant score entries $new_file = "$1.uni"; open ( UNIQUE, ">$new_file" ) or die "\n-> Program terminated: can't create '$new_file' to output\n the scoring table with non-redundant entries: $!\n"; print UNIQUE "Scoring table from file '$file' with redundant sample and marker entries collapsed into consensus entries\n\n"; unless ( $n_markers == $nu_markers ) { # outputs information on redundant entries found print UNIQUE " -> There where $nu_markers unique marker"; print UNIQUE "s" if ( $nu_markers > 1 ); print UNIQUE " among a total of $n_markers markers\n"; } else { print UNIQUE " -> There where no redundant marker entries\n"; } unless ( $n_samples == $nu_samples ) { print UNIQUE " -> There where $nu_samples unique sample"; print UNIQUE "s" if ( $nu_samples > 1 ); print UNIQUE " among a total of $n_samples samples\n\n"; } else { print UNIQUE " -> There where no redundant sample entries\n\n"; } unless ( ( $n_markers == $nu_markers ) && ( $n_samples == $nu_samples ) ) { print UNIQUE "Here's the redundancy-filtered scoring table (redundant markers and/or samples collapsed into consensus entries):\n\n"; } else { print UNIQUE "Here's the original scoring table which did not contain any redundant entries:\n\n" ; } if ( $n_samples != $nu_samples ) { # prints a row containing % score-consensus values for redundant sample entries if ( $n_markers != $nu_markers ) { print UNIQUE ","; } for ( keys %{ $marker_features{ $markers[ 0 ] } } ) { print UNIQUE ","; } print UNIQUE "Consensus(%) ->"; for ( sort keys %scores ) { if ( $cons_red_samples{ $_ } ) { print UNIQUE ",$cons_red_samples{ $_ }"; } else { print UNIQUE ",-"; } } print UNIQUE "\n"; } if ( $n_markers != $nu_markers ) { # prints row containing column titles print UNIQUE "Consensus(%),"; } print UNIQUE "$marker_name"; for ( sort keys %{ $marker_features{ $markers[ 0 ] } } ) { print UNIQUE ",$_"; } print UNIQUE ","; print UNIQUE "@samples\n"; for $marker ( sort keys %marker_features ) { # prints rows containing marker features and scores if ( $n_markers != $nu_markers ) { if ( $cons_red_markers{ $marker } ) { print UNIQUE "$cons_red_markers{ $marker },"; } else { print UNIQUE "-,"; } } print UNIQUE "$marker"; for ( sort keys %{ $marker_features{ $marker } } ) { print UNIQUE ",${ $marker_features{ $marker } }{ $_ }"; } for $sample ( sort keys %scores ) { print UNIQUE ",${ $scores{ $sample } }{ $marker }"; } print UNIQUE "\n"; } if ( $n_markers != $nu_markers ) { # prints information on marker and sample redundancy on screen print "\n ==> ", $n_markers - $nu_markers," redundant marker"; print "s" if ( $n_markers - $nu_markers > 1 ); print " transferred to '$1_red$marker_name"."s".".csv'.\n"; } else { print "\n ==> There were no redundant markers.\n"; } if ( $n_samples != $nu_samples ) { print "\n ==> ", $n_samples - $nu_samples, " redundant sample"; print "s" if ( $n_samples - $nu_samples > 1 ); print " transferred to '$1_redSamples.csv'.\n"; } else { print "\n ==> There were no redundant samples.\n"; } print "\n ==> The non-redundant data set with ", $nu_samples, " unique sample"; print "s" if ( $nu_samples > 1 ); print " and ", $nu_markers, " unique marker"; print "s" if ( $nu_markers > 1 ); print "\n was saved as '$1.uni'.\n"; # outputs information on redundant samples and markers found in file close UNIQUE or die "\n-> Program terminated: can't close '$new_file'.\n\n"; } else { print "\n ==> There were no redundant marker or samples.\n"; } return ( $new_file, $n_samples, $nu_samples, \@samples, \%cons_red_samples, $marker_name, \%marker_features, $n_markers, $nu_markers, \@markers, \%cons_red_markers, $score_type, \%scores ); } ################################################################################################################### sub read_file { # opens a text file, checks its format and reads column titles and rows my $file = shift @_; # = name of file to read my ( @header, # = array containing header (= leading rows before the row with the column titles) $title_row, # = row containing column titles @titles, # = array of column titles @rows, # = array of rows excluding title row and rows above title row @row_cells, # = array of arrays containing cells of columns $comma, # = number of commas in title row $tab, # = number of tabs in title row $format, # = delimiter used in file (comma or tab) $i, # = looping index $j ); # = looping index open ( FILE, "$file" ) or die "\n-> Program terminated: can't open $_[0]: $!\n\n"; # reads file content push @rows, ; chomp @rows; close FILE or warn "\nCan't close $file: $!\n"; for ( @rows ) { s/\s+$//g; s/^",+(.+)",/$1,/; s/^"(.+),+",/$1,/; s/^"(.+),+(.+)",/$1;$2,/; s/,",+(.*)",/,$1,/g; s/,"(.*),+",/,$1,/g; s/,"(.*),+(.*)",/,$1;$2,/g; } $i = $#rows; # identifies and excises header rows above the row that contains column titles do { unless ( $rows[ $i ] =~ /[,\t][01ABCDHX\-]$/i ) { $rows[ $i ] =~ /^([^,^\t]+)[,\t]/; print "\n > It looks like as if the column titles are in row no. ", $i + 1; if ( $1 ) { print " (first cell = \n '$1'). "; } else { print ".\n "; } print "Type 'n' if unsure: "; chomp ( $_ = ); if ( /\bn\b/i ) { print "\n > Here's a list of the leftmost cells from the first rows of the file: \n\n"; for $j ( 1 .. $i + 4 ) { $rows[ $j - 1 ] =~ /^([^,^\t]+)[,\t]/; if ( $rows[ $j - 1 ] ne "" ) { if ( $1 ) { printf "%2d: %s", $j, $1; print "\t<- row with column titles?" if ( $j == $i + 1 ); print "\n"; } else { printf "%2d: %s\n", $j, "empty"; } } else { printf "%2d: %s\n", $j, "empty"; } } print "\n > Please select the number of the row that has the column titles: "; chomp ( $_ = ); until ( ( $_ =~ /\d+/ ) && ( $_ >= 1 ) && ( $_ <= $#rows ) ) { print "\n > This is outside of the range of possible row numbers. Try again: "; chomp ( $_ = ); }; $i = $_ - 1; } @header = splice ( @rows, 0, $i ) unless ( $i == 0 ); } $i--; } until ( @header || $i <= 1 ); if ( $i == $#rows ) { print "\n-----------------------------\n", "\n-> Program terminated: the bottom/right corner of '$file'\n", " doesn't contain a score. Go to Excel and revise the file as follows:\n\n", " 1. Make sure that the scores are in the bottom/right part of the table.\n\n", " 2. Make sure that there are no additional data below the scoring table.\n\n", " 3. Select all empty columns to the right of the table -> right-click\n", " and 'Delete'.\n\n", " 4. Select all empty rows below the table -> right-click and 'Delete'.\n\n", " 5. Save file in the csv or txt format and run DArTAmanager again.\n\n\n", " Press 'Enter' to exit...\n"; ; exit; } $title_row = splice ( @rows, 0, 1 ); # excises row containing column titles $comma = ( $title_row =~ tr/,// ); # checks whether comma-separated or tab-delimited file $tab = ( $title_row =~ tr/\t// ); if ( ( $tab == 0 ) && ( $comma > 0 ) ) { $format = ","; } elsif ( ( $tab > 0 ) && ( $comma < $tab ) ) { $format = "\t"; } else { print "\n-----------------------------\n\n", "-> Program terminated: '$file' doesn't appear to be a\n", " comma or tab-delimited text file! (Or perhaps you've selected the wrong\n", " row with column titles?)\n", " Press 'Enter' to exit..."; ; exit; } push @titles, split ( /$format/, $title_row ); # creates array with column titles and removes trailing spaces at the end of individual titles for ( @titles ) { # removes leading and trailing whitspace from column titles s/^\s+//g; s/\s+$//g; } for ( @rows ) { # converts each row into an array of cells, and creates an array of arrays containing cells of rows push @row_cells, [ split ( /$format/, $_ ) ]; for ( @{ $row_cells[ -1 ] } ) { s/^\s+//g; # removes leading and trailing whitspace from all cells s/\s+$//g; } } return ( $format, scalar @titles, scalar @rows, \@titles, \@row_cells, \@header ); # returns the format of the file, number of columns, number of rows, array of column titles and array of rows } ################################################################################################################### sub find_scores { # identifies first column with scoring data in each table my $file = shift @_; # = name of file my @titles = @{ shift @_ }; # = array of column titles my @row_cells = @{ shift @_ }; # = array of arrays of cells of rows my ( $scol1, # = number of first column with scores $i, # = looping index for columns $j, # = looping index for rows $found, # = "non-score" once the first cell from the right end of the table with a non-score has been found $where, # = number of row in last column where a non-valid score entry was found $unsure, # if = "n" then user can define first score column $to_print ); # = number modifier for printing for ( $i = @titles - 1; $i >= 0 ; $i-- ) { # searches cell values of columns, starting from the right end for $j ( 0 .. @row_cells - 1 ) { unless ( ${ $row_cells[ $j ] }[ $i ] =~ /^[01ABCDHX\-]$/i ) { # checks whether a particular cell contains a score $found = "yes"; # indicates that a non-score value has been found $where = $j + 1; } } if ( $found ) { # decides what to do if once the first non-score from the right has been found if ( $i == @titles - 1 ) { if ( $where == 1 ) { $to_print = "st"; } elsif ( $where == 2 ) { $to_print = "nd"; } elsif ( $where == 3 ) { $to_print = "rd"; } else { $to_print = "th"; } print "\n-----------------------------\n\n", "-> Program terminated: The ". $where ."$to_print data row of the scoring table in\n", " '$file' contains at least one non-valid score entry (valid\n", " entries are: 0,1,A,a,B,b,C,c,D,d,H,h,-,X or x).\n\n", " Please revise this file and try again. Press 'Enter' to exit..."; ; exit; } else { $scol1 = $i + 2; print "\n > It looks like as if '$titles[ $scol1 - 1 ]' (column no. $scol1) is the first \n", # asks user whether to accept first column with scores " column with scores. Type 'n' if unsure: "; chomp ( $unsure = ); if ( $unsure =~ /\bn\b/i ) { # allows user to modify first-score column selection $scol1 = 1; print "\n > Here's a list of column titles...\n\n"; for ( @titles ) { # prints column titles on the screen printf "%3d: %s", $scol1, $_; if ( $scol1 == $i + 1 ) { print "\t\t<- last column with marker features..?\n"; } elsif ( $scol1 == $i + 2 ) { print "\t\t<- first column with scores..?\n"; } else { print "\n"; } $scol1++; } $scol1 = 1; print "\n > Please select the number of the first column that contains scores: "; chomp ( $scol1 = ); while ( ( $scol1 > @titles ) || ( $scol1 < 2 ) ) { print "\n > This is outside the valid range of columns. Try again: "; # rejects too large and too small column numbers chomp ( $scol1 = ); } if ( $scol1 <= $i + 1 ) { print "\n-----------------------------\n\n", # rejects lower column numbers than that which has been recognized as the first column with scores "-> Program terminated: there is at least one non-valid score entry in the\n", " '$titles[ $i ]' column"; print " (and possibly more in previous columns)" unless ( $scol1 == $i + 1 ); print ".\n Please check your file and try again. Press 'Enter' to exit..."; ; exit; } } return $scol1; } } } print "\n-> Program terminated: '$file' appears to contain only scores \n", # terminates if all cells contain scores " but no marker information. Press 'Enter' to exit..."; ; exit; } ################################################################################################################### sub find_marker_names { # finds column containing marker names my $file = shift @_; # = name of file my $scol1 = shift @_; # = number of first column with scores my @titles = @{ shift @_ }; # = array of column titles my @row_cells = @{ shift @_ }; # = array of arrays containing cells of rows my $i = 0; # = looping index my $j = 0; # = looping index my $accept = "?"; # = 'n' if user doesn't accept the marker name column offered by the program my $markercol; # = number of column with marker names do { for $j ( 0 .. $scol1 - 2 ) { if ( ${ $row_cells[ $i ] }[ $j ] =~ /^[a-z]{1,2}[A-Z][A-Za-z]{1,3}-\d{4}[AB]?$/ ) { # searches for DArT marker names $markercol = $j + 1; print "\n > Does the '$titles[ $j ]' column (no. ", $j + 1, ") contain the marker names? (y/n): "; chomp ( $accept = ); unless ( $accept =~ /\bn\b/i ) { return $markercol; } } } $i++; } until ( $i == $#row_cells || $accept =~ /\bn\b/i ); for ( @titles[ 0 .. $scol1 - 2 ] ) { # searches for a titles of a column with marker features that contains the string "marker" if ( /markers*/i ) { unless ( ( $markercol ) && ( $_ eq $titles[ $markercol - 1 ] ) ) { print "\n > Does the '$_' column contain the marker names? (y/n): "; chomp ( $accept = ); unless ( $accept =~ /\bn\b/i ) { return ( $markercol = $j + 1 ); } } } } $i = 0; do { for $j ( 0 .. $scol1 - 2 ) { if ( ${ $row_cells[ $i ] }[ $j ] =~ /^\d{12}_[A-P]_\d{1,2}$/i ) { # searches for DArT clone names as an alternative to marker names print "\n > Do you want to use DArT clone names in the '$titles[ $j ]' column as\n", " unique identifiers? (y/n): "; chomp ( $accept = ); unless ( $accept =~ /\bn\b/i ) { return ( $markercol = $j + 1 ); } } } $i++; } until ( $i == $#row_cells || $accept =~ /\bn\b/i ); if ( $scol1 == 2 ) { print "\n ==> Assumed that marker names are in column no. 1.\n"; return ( $markercol = 1 ); } print "\n > Here's a list of the titles of columns that contain marker features...\n\n"; # asks user to identify column with marker names or an alternative unique identifier for $i ( 1 .. $scol1 - 1 ) { # prints column titles on the screen if ( $titles[ $i - 1 ] ) { printf "%3d: %s\n", $i, $titles[ $i - 1 ]; } else { printf "%3d: %s\n", $i, 'empty'; } } print "\n > Please select the number of the column that contains marker names or any\n", " other unique identifier you wish to use: "; chomp ( $markercol = ); while ( ( $markercol >= $scol1 ) || ( $markercol < 1 ) ) { # rejects too big and too small column numbers print "\n > This is outside the valid range of columns. Try again: "; chomp ( $markercol = ); } return $markercol; } ################################################################################################################### sub remove_red_markers { # removes groups of redundant marker entries into a separate file, retaining one entry per group with the consensus scores my $file = shift @_; # = file name my $score_type = shift @_; # = type of scores (0/1) vs. (A/B/C/D/H) my $markercol = shift @_; # = number of column with marker names my $scol1 = shift @_; # = number of first column containing scores my @titles = @{ shift @_ }; # = array of column titles my @row_cells = @{ shift @_ }; # = array of arrays of cells of rows my $i = 0; # = looping index for processing of individual rows my $j = 0; # = nested looping index for second row to be compared to first row ($i) my $discordant = 0; # = counts number of discordant calls in a comparison of two or more score lists my $missing = 0; # = counts number of missing calls in a comparison of two or more score lists my $entry = "new"; # = indicates whether or not a particular marker has already been compared against my $n_markers = @row_cells; # = number of markers (= rows) my $nu_markers = $n_markers; # = number of unique markers (= rows) (starting value = total number) my ( $new_file, # = name of new file containing redundant marker entries @red_row, # = array of cells of a row belonging to a redundant marker entry @consensus, # = running consensus scores for the currently processed group of redundant marker entries %score_consensus ); # = hash of scoring consensus of all markers with multiple entries in % ( = 100 - % of scores that are discordant in at least one pair of score lists in a group of score lists for a redundant entry) for ( @row_cells ) { # loops through all rows, starting at row 0 @consensus = @{ $_ }[ $scol1 - 1 .. $#titles ]; for ( $j = $#row_cells; $j >= $i + 1; $j-- ) { # loops, for each row, through all other rows with higher indices if ( ${ $row_cells[ $i ] }[ $markercol - 1 ] eq ${ $row_cells[ $j ] }[ $markercol - 1 ] ) { # checks whether two rows contain data for the same entry (marker) if ( $nu_markers == $n_markers ) { $file =~ /^(.+)\..*$/; $new_file = "$1_red$titles[ $markercol - 1 ]s.csv"; open ( RED_ROWS, ">$new_file" ) or die "\n-> Program terminated: can't create '$new_file' to output\n groups of redundant markers: $!\n"; print RED_ROWS ",Consensus(%),@titles\n"; } print RED_ROWS ">>>>>>>>,>>>>>>>>\n" unless ( ( $entry eq "rep" ) || ( $nu_markers == $n_markers ) ); print RED_ROWS ",,@{ $row_cells[ $i ] }\n" unless ( $entry eq "rep" ); # writes first copy of redundant entry into new file @red_row = @{ splice ( @row_cells, $j, 1 ) }; # removes row with redundant entry print RED_ROWS ",,@red_row\n"; # writes next copy of redundant entry into new file @_ = &merge_score_lists ( $score_type, ${ $row_cells[ $i ] }[ $markercol - 1 ], $red_row[ $markercol - 1 ], \@consensus, [ @red_row[ $scol1 - 1 .. $#titles ] ] ); # builds the consensus scores for a marker with redundant entries @consensus = @{ $_[ 4 ] }; for ( @consensus ) { s/X/discordant/; # assigns "discordant" label to a pair of discordant scores so that it is remembered over several loops of merging pairs of redundant lists of scores } $nu_markers--; $entry = "rep"; } } if ( $entry eq "rep" ) { for ( @consensus ) { $discordant ++ if ( $_ eq "discordant" ); $missing ++ if ( $_ eq "-" ); s/discordant/X/; } $score_consensus{ ${ $row_cells[ $i ] }[ $markercol - 1 ] } = 100 * ( @consensus - $discordant - $missing ) / ( @consensus - $missing ); $discordant = $missing = 0; @{ $row_cells[ $i ] }[ $scol1 - 1 .. $#titles ] = @consensus; # overwrites the single remaining entry of a marker with (originally) multiple entries with the consensus scores print RED_ROWS "Consensus ->,$score_consensus{ ${ $row_cells[ $i ] }[ $markercol - 1 ] },@{ $row_cells[ $i ] }\n"; # writes the consensus scores for a marker into the separate file containing reduandant markers } $entry = "new"; $i++; } unless ( $nu_markers == $n_markers ) { close RED_ROWS or die "\n-> Program terminated: can't close '$new_file'\n\n"; } return ( \@row_cells, \%score_consensus ); } ################################################################################################################### sub remove_red_samples { # removes groups of redundant samples (columns) into a separate file, retaining one column per group with the consensus scores my $file = shift @_; # = file name my $score_type = shift @_; # = type of scores (0/1) vs. (A/B/C/D/H) my $scol1 = shift @_; # = number of first column containing scores my @titles = @{ shift @_ }; # = array of column titles my @row_cells = @{ shift @_ }; # = array of arrays of cells of rows (excluding first row with column titles) my $n_samples = @titles - $scol1 + 1; # = number of samples (columns with scores) my $nu_samples = $n_samples; # = number of unique samples my @red_titles = @titles[0 .. $scol1 - 2 ]; # = array of titles of redundant columns (and the leading columns containing marker features) my $status = "new"; # = indicates whether a particular sample (column) has already been found to be redundant my $n = 1; # = array index indicating the number of columns of a particluar sample with redundant entries my $discordant = 0; # = counts number of discordant calls in a comparison of two or more score lists my $missing = 0; # = counts number of missing calls in a comparison of two or more score lists my ( $new_file, # = name of new file containing redundant sample entries $i, # = looping index $j, # = looping index $sample, # = name of currently processed sample %red_data, # = hash of an array of arrays containing scores of redundant entries of individual samples @consensus, # = array of consensus scores for a group of replicates of a particular sample @separator, # = array of empty entries (number of entries = number of rows) @red_samples, # = array of names of redundant samples @new_col, # = new column with redundant scores @to_print, # = array of empty of spacer cells to print into file %score_consensus ); # = hash of scoring consensus of all markers with multiple entries in % ( = 100 - % of scores that are discordant in at least one pair of score lists in a group of score lists for a redundant entry) for $i ( $scol1 - 1 .. $#titles ) { # loops through all score columns, starting at the first column containing scores for ( $j = $#titles; $j > $i; $j-- ) { # loops, for each score column, through all other score columns with higher indices if ( $titles[ $i ] eq $titles[ $j ] ) { # checks whether two columns contain scores for the same sample if ( $status eq "new" ) { @{ $red_data{ $titles[ $i ] } }[ 0 ] = [ ( ) ]; for ( @row_cells ) { push @{ @{ $red_data{ $titles[ $i ] } }[ 0 ] }, ${ $_ }[ $i ]; # saves the scores of the first entry of a particular sample push @consensus, ${ $_ }[ $i ]; # creates an array containing the running-consensus scores } } @{ $red_data{ $titles[ $i ] } }[ $n ] = [ ( ) ]; for ( @row_cells ) { # removes scores from redundant column and transfers them into separate array push @{ @{ $red_data{ $titles[ $i ] } }[ $n ] }, splice ( @{ $_ }, $j, 1 ); } $nu_samples--; @_ = &merge_score_lists ( $score_type, $titles[ $i ], $titles[ $j ], \@{ @{ $red_data{ $titles[ $i ] } }[ $n ] }, \@consensus ); # builds the consensus scores for a sample represented by redundant columns @consensus = @{ $_[ 4 ] }; for ( @consensus ) { s/X/discordant/; # assigns "discordant" label to a pair of discordant scores so that it is remembered over several loops of merging pairs of redundant lists of scores } splice ( @titles, $j, 1 ); $status = "rep"; $n++; } } if ( $status eq "rep" ) { for ( @consensus ) { # converts "discordant" label into "X" for output into file $discordant ++ if ( $_ eq "discordant" ); $missing ++ if ( $_ eq "-" ); s/discordant/X/; } $score_consensus{ $titles[ $i ] } = 100 * ( @consensus - $discordant - $missing ) / ( @consensus - $missing ); $discordant = $missing = 0; @{ $red_data{ $titles[ $i ] } }[ $n ] = [ ( ) ]; for ( @consensus ) { push @{ @{ $red_data{ $titles[ $i ] } }[ $n ] }, $_; # writes the consensus scores (and spacer) into the array containing groups of redundant samples } for ( @row_cells ) { push @separator, ""; } ${ $red_data{ $titles[ $i ] } }[ $n + 1 ] = \@separator; push @red_samples, $titles[ $i ]; for ( @row_cells ) { ${ $_ }[ $i ] = shift @consensus; # overwrites the single remaining column representing a sample with originally multiple entries with the consensus scores } } $status = "new"; $n = 1; } unless ( $n_samples == $nu_samples ) { # outputs redudant sample entries plus the consensus scores into a separate file @red_samples = sort ( @red_samples ); # creates column titles for redundant sample entries for ( @red_samples ) { push @red_titles, ( $_, $_, "Consensus", "||" ); } pop @red_titles; $file =~ /^(.+)\..*$/; $new_file = "$1_redSamples.csv"; open ( RED_COLS, ">$new_file" ) or die "\n-> Program terminated: can't create '$new_file' to output\n groups of redundant samples: $!\n"; for ( 1 .. $scol1 - 2 ) { print RED_COLS ","; } print RED_COLS "Consensus(%) ->"; $i = "first sample"; for $sample ( sort keys %red_data ) { for ( 1 .. $#{ $red_data{ $sample } } - 1 ) { push @to_print, "-" ; } print RED_COLS ",||" if ( $i eq "subsequent sample" ); print RED_COLS ",@to_print,$score_consensus{ $sample }"; @to_print = ( ); $i = "subsequent sample"; } print RED_COLS "\n"; print RED_COLS "@red_titles\n"; for $i ( 0 .. $#row_cells ) { print RED_COLS "@{ $row_cells[ $i ] }[ 0 .. $scol1 - 2 ],"; for $sample ( sort keys ( %red_data ) ) { for ( @{ $red_data{ $sample } } ) { print RED_COLS "${ $_ }[ $i ],"; } } print RED_COLS "\n"; } close RED_COLS or die "\n-> Program terminated: can't close '$new_file': $!\n\n"; } return ( $nu_samples, \@titles, \@row_cells, \%score_consensus ); }