#!/opt/local/bin/perl
# Copyright 2015 Geo Velikkakam James (geovjames@gmail.com)
# This script was used in Simon et al. 2015.
#
# This script is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# For a copy of the GNU General Public License see <http://www.gnu.org/licenses/>.
#
# Purpose: for every sequence in a FASTA or GenBank file, report how often and
# where each motif from a motif file occurs. Motifs are written with IUB
# nucleotide codes, one motif per line.
#
# Output: one tab-separated line per (sequence, motif) pair:
#   sequence_name <TAB> motif_regexp <TAB> match_count <TAB> 1-based start positions

use strict;
use warnings;
use Getopt::Std;

# IUB nucleotide code -> regular-expression fragment matching the bases it stands for.
my %iub2character_class = (
    A => 'A',
    C => 'C',
    G => 'G',
    T => 'T',
    R => '[GA]',
    Y => '[CT]',
    M => '[AC]',
    K => '[GT]',
    S => '[GC]',
    W => '[AT]',
    B => '[CGT]',
    D => '[AGT]',
    H => '[ACT]',
    V => '[ACG]',
    N => '[ACGT]',
);

# Run the program only when executed as a script, so the subroutines below
# can be loaded from another file (e.g. a test) without side effects.
exit main() unless caller;

##########################################################################################################

# Print the usage message to STDERR and terminate with a non-zero status.
sub usage {
    print STDERR "error in input \n\n"
        . "Usage: perl motif.pl -i fasta_file -m motif_file\n\n"
        . " Options:\n"
        . "In motif file motifs are given in each line\n"
        . "-f: file format of input sequence, default is \"FASTA\". \n"
        . "\"genbank\" is another option\n\n";
    exit 1;
}

# Program entry point: parse options, load motifs and sequences, print the report.
# Returns the process exit status (0 on success).
sub main {
    my %options;

    # getopts (not getopt) understands the "x:" syntax and reports unknown switches.
    getopts('i:m:f:', \%options) or usage();
    usage() unless defined $options{i} && defined $options{m};

    my $format = defined $options{f} ? $options{f} : 'fasta';

    # "genebank" was the spelling advertised by older help text; keep accepting it.
    usage() unless $format =~ /fasta/i || $format =~ /gene?bank/i;

    my @protein = read_sequences($options{i}, $format);
    my @motif   = read_motifs($options{m});

    # Replace each motif, in place, by its regular expression.
    IUB_to_regexp(\@motif);

    foreach my $record (@protein) {
        my ($seq_name, $sequence) = split_record($record);
        foreach my $motif (@motif) {
            my @position = match_positions($motif, $sequence);
            my $count    = scalar @position;
            print "$seq_name\t$motif\t$count\t@position\n";
        }
    }
    return 0;
}

# Open $path for reading and return the filehandle; "-" means standard input
# (this is what the historical two-argument open did). Dies if the file
# cannot be opened.
sub open_input {
    my ($path) = @_;
    return \*STDIN if $path eq '-';
    open(my $fh, '<', $path)
        or die "Cannot open file \"$path\": $!\n";
    return $fh;
}

# Read the motif file: one motif per line, upper-cased, blank lines skipped.
# All whitespace (including a CR from CRLF line endings) is removed.
sub read_motifs {
    my ($path) = @_;
    my $fh = open_input($path);
    my @motifs;
    while (my $line = <$fh>) {
        $line =~ s/\s+//g;
        next if $line eq '';
        push @motifs, uc $line;
    }
    close $fh;
    return @motifs;
}

# Read the sequence file and return one string per entry, each of the form
# "header line\nsequence...". $format selects FASTA (checked first) or GenBank.
sub read_sequences {
    my ($path, $format) = @_;
    my $fh = open_input($path);
    my @records;
    if ($format =~ /fasta/i) {
        # Read record-wise: every entry ends where the next ">" header begins.
        local $/ = "\n>";
        @records = <$fh>;
        chomp @records;    # drop the "\n>" separator instead of leaving ">" in the sequence
    }
    else {
        my @genbank = <$fh>;
        @records = genbank2seq(@genbank);
    }
    close $fh;
    return @records;
}

# Split one record ("[>]header\nsequence lines") into (name, sequence).
# The name is the leading run of word characters of the header line; the
# sequence has all whitespace removed. A record without any sequence line
# yields an empty sequence.
sub split_record {
    my ($record) = @_;
    my ($header, $body) = split /\n/, $record, 2;
    $header = '' unless defined $header;
    $body   = '' unless defined $body;
    my ($name) = $header =~ /\A>?(\w*)/;
    $body =~ s/\s+//g;
    return ($name, $body);
}

# Return the 1-based start positions of all non-overlapping, case-insensitive
# matches of $regexp in $sequence, in ascending order.
sub match_positions {
    my ($regexp, $sequence) = @_;
    my @positions = ();

    # Compile once; a qr// object is also never treated as the special
    # "empty pattern" that would re-use the last successful regex.
    my $compiled = qr/$regexp/i;
    while ($sequence =~ /$compiled/g) {
        push @positions, $-[0] + 1;    # $-[0] is the 0-based offset of the match start
    }
    return @positions;
}

# Convert every motif in the referenced array, in place, from IUB nucleotide
# codes to the equivalent regular expression (e.g. "RY" -> "[GA][CT]").
# Dies if a motif contains a character that is not an IUB code, because
# silently dropping it would search for a different motif.
sub IUB_to_regexp {
    my ($motif_ref) = @_;
    foreach my $motif (@$motif_ref) {    # $motif aliases the array element
        my $regular_expression = '';
        foreach my $code (split //, $motif) {
            my $class = $iub2character_class{ uc $code };
            die "Invalid IUB nucleotide code \"$code\" in motif \"$motif\"\n"
                unless defined $class;
            $regular_expression .= $class;
        }
        $motif = $regular_expression;
    }
    return;
}

# Convert the lines of a GenBank flat file into a list of strings of the form
# ">LOCUSNAME\nsequence", one per record. Whitespace and position numbers are
# removed from the sequence. Dies if an ORIGIN section appears before any
# LOCUS line.
sub genbank2seq {
    my @GenBankFile = @_;
    my @sequence    = ();
    my $dna         = "";
    my $locus       = "";
    my $in_sequence = 0;

    foreach my $line (@GenBankFile) {
        if ($line =~ /^LOCUS\s*(\w*)/) {
            $locus = ">" . $1 . "\n";
        }
        elsif ($line =~ m{^//\s*$}) {
            # End-of-record line; \s*$ also accepts CRLF and a missing final newline.
            if ($in_sequence) {
                push(@sequence, $dna);
            }
            elsif ($locus ne "") {
                # Record without ORIGIN: keep it, under its own name, with an empty sequence.
                push(@sequence, $locus);
            }
            $in_sequence = 0;
            $dna         = "";
            $locus       = "";
        }
        elsif ($in_sequence) {
            # Inside the ORIGIN section: strip whitespace and line numbers.
            $line =~ s/[\s0-9]//g;
            $dna .= $line;
        }
        elsif ($line =~ /^ORIGIN/) {
            if ($locus) {
                $dna .= $locus;
                $in_sequence = 1;
            }
            else {
                die "their is no locus name for the entry, Please check the genbank format\n";
            }
        }
    }

    # A final record that is not terminated by "//" must not be lost.
    push(@sequence, $dna) if $in_sequence;

    return @sequence;
}