#!/opt/local/bin/perl
# Copyright 2015 Geo Velikkakam James (geovjames@gmail.com)
# This script was used in Simon et al. 2015.
# This script is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# For a copy of the GNU General Public License see <http://www.gnu.org/licenses/>.
use Getopt::Std;
# ---------------------------------------------------------------------------
# Command-line handling.
#   -i  input sequence file (required)
#   -m  motif file, one motif per line (required)
#   -f  format of the input sequence file: "fasta" (default) or "genbank"
# Sets the package variables $dnafilename, $motiffile and $format, which the
# rest of the script reads.
# ---------------------------------------------------------------------------
our ($dnafilename, $motiffile, $format);
my %options;
my $opt_string = 'i:m:f:';
my $usage_text = "error in input \n\nUsage: perl motif.pl -i fasta_file -m motif_file\n\n Options:\nIn motif file motifs are given in each line\n-f: file format of input sequence, default is \"FASTA\". \"genbank\" is another option\n\n";

# getopts() is the Getopt::Std entry point that understands the "x:" notation
# and returns false on an unknown switch; getopt() does neither.
unless ( getopts($opt_string, \%options)
    && defined $options{i}
    && defined $options{m} )
{
    # Usage errors go to STDERR and give a non-zero exit status so that
    # calling shells and pipelines can detect the failure.
    print STDERR $usage_text;
    exit 1;
}

$dnafilename = $options{i};
$motiffile   = $options{m};
$format      = defined $options{f} ? $options{f} : "fasta";
# ---------------------------------------------------------------------------
# Load the inputs.
#   @motif   - upper-cased motifs, one per non-empty line of $motiffile
#   @protein - one element per sequence record of $dnafilename; every record
#              starts with a header line (the name), followed by the sequence
# ---------------------------------------------------------------------------
our (@motif, @protein);

open(my $dna_fh, '<', $dnafilename)
    or die "Cannot open file \"$dnafilename\": $!\n";
open(my $motif_fh, '<', $motiffile)
    or die "Cannot open file \"$motiffile\": $!\n";

while (my $motif_line = <$motif_fh>) {
    # Strip the line ending ("\n" as well as "\r\n") and trailing blanks.
    $motif_line =~ s/\s+\z//;
    next if $motif_line eq '';
    push @motif, uc $motif_line;
}
close($motif_fh);

if ($format =~ /fasta/i) {
    # Read one FASTA record per element by splitting the file on "\n>".
    # 'local' restores the normal line separator when this block ends.
    local $/ = "\n>";
    @protein = <$dna_fh>;
}
elsif ($format =~ /genbank/i) {
    my @genbank = <$dna_fh>;
    @protein = genbank2seq(@genbank);
}
else {
    # Without this check an unrecognised -f value produced no output at all.
    die "Unknown input format \"$format\": expected \"fasta\" or \"genbank\"\n";
}
close($dna_fh);
# Lookup table: single-letter IUB nucleotide code => regular-expression
# fragment that matches every base the code stands for.
my %iub2character_class = (
    # The four plain bases match only themselves.
    ( map { ( $_ => $_ ) } qw(A C G T) ),

    # Codes covering two bases.
    'R' => '[GA]',
    'Y' => '[CT]',
    'M' => '[AC]',
    'K' => '[GT]',
    'S' => '[GC]',
    'W' => '[AT]',

    # Codes covering three bases.
    'B' => '[CGT]',
    'D' => '[AGT]',
    'H' => '[ACT]',
    'V' => '[ACG]',

    # Any base.
    'N' => '[ACGT]',
);
# ---------------------------------------------------------------------------
# Search every sequence for every motif and print one tab-separated line per
# (sequence, motif) pair:
#     sequence name, motif pattern, number of matches, 1-based match positions
# Matching is case-insensitive and non-overlapping.
# ---------------------------------------------------------------------------

# Rewrite the IUB motifs in @motif as regular-expression source, in place.
IUB_to_regexp(\@motif);

# Compile each pattern once. A motif that translated to an empty string is
# skipped: Perl treats an empty match pattern as "reuse the last successful
# regex", which would report matches of some unrelated pattern.
my @searchable_motifs;
foreach my $pattern (@motif) {
    if (defined $pattern && length $pattern) {
        push @searchable_motifs, [ $pattern, qr/$pattern/i ];
    }
    else {
        warn "Skipping a motif that contains no recognised IUB symbols\n";
    }
}

foreach my $record (@protein) {
    # Cut the header line off the record and keep its leading word as the
    # name. The capture is only read when the substitution succeeded, so a
    # record without a header line cannot pick up a stale name.
    my $seq_name = '';
    if ($record =~ s/>?(\w*).*\n//) {
        $seq_name = $1;
    }

    # FASTA sequences span several lines; join them. Carriage returns are
    # removed too, so files with CRLF line endings give correct positions.
    $record =~ tr/\r\n//d if $format =~ /fasta/i;

    foreach my $entry (@searchable_motifs) {
        my ($pattern, $pattern_re) = @$entry;
        my @position;
        while ($record =~ /$pattern_re/g) {
            # $-[0] is the 0-based offset at which the match starts.
            push @position, $-[0] + 1;
        }
        my $count = @position;
        print "$seq_name\t$pattern\t$count\t@position\n";
    }
}
##########################################################################################################
# match_positions($regexp, $sequence)
#
# Returns the list of 1-based start positions of every match of $regexp in
# $sequence. Matching is case-insensitive; the /g scan continues after the
# end of each match, so overlapping occurrences are not reported.
sub match_positions {
    my ($regexp, $sequence) = @_;
    my @positions;

    # $-[0] holds the 0-based offset at which the current match begins.
    push @positions, $-[0] + 1 while $sequence =~ /$regexp/ig;

    return @positions;
}
# IUB_to_regexp(\@motifs)
#
# Rewrites every element of the referenced array in place: each character of
# a motif is replaced by its entry in %iub2character_class and the pieces are
# concatenated. A character without an entry in the table contributes nothing
# to the result.
sub IUB_to_regexp {
    my ($motif_ref) = @_;

    # The loop variable aliases the array element, so assigning to it updates
    # the caller's array.
    foreach my $motif (@$motif_ref) {
        $motif = join '', map { $iub2character_class{$_} } split //, $motif;
    }
}
# genbank2seq(@lines)
#
# Converts the lines of a GenBank flat file into a list of sequence records.
# Each returned element has the form ">LOCUSNAME\nsequence", where the
# sequence is the text of the lines following ORIGIN with whitespace and
# digits (the position numbers) removed.
#
# A record is closed by its "//" line; a trailing newline or carriage return
# on that line is optional. A final record that was started with ORIGIN but
# is not followed by "//" is still returned.
#
# Prints a message and exits when an ORIGIN line is met before any LOCUS line
# of the same entry.
sub genbank2seq {
    my @GenBankFile = @_;
    my @sequence    = ();
    my $dna         = "";
    my $locus       = "";
    # Lexical and initialised on every call, so a previous call that ended in
    # the middle of a sequence cannot influence this one.
    my $in_sequence = 0;

    foreach my $line (@GenBankFile) {
        if ($line =~ /^LOCUS\s*(\w*)/) {
            $locus = ">" . $1 . "\n";
        }
        elsif ($line =~ m{^//\s*$}) {
            # End-of-record line: store what was collected and reset.
            push(@sequence, $dna);
            $in_sequence = 0;
            $dna         = "";
            $locus       = "";
        }
        elsif ($in_sequence) {
            # Sequence line: drop the position number and the blanks.
            $line =~ s/[\s0-9]//g;
            $dna .= $line;
        }
        elsif ($line =~ /^ORIGIN/) {
            # Start of the sequence: the record begins with its header line.
            if ($locus) {
                $dna .= $locus;
                $in_sequence = 1;
            }
            else {
                print "their is no locus name for the entry, Please check the genbank format";
                exit();
            }
        }
    }

    # Input ended inside a sequence (no closing "//"): keep that record.
    push(@sequence, $dna) if $in_sequence;

    return @sequence;
}