648 lines
18 KiB
Perl
648 lines
18 KiB
Perl
#!/usr/bin/env perl
|
|
|
|
package main;
|
|
our $SEE;
|
|
|
|
package Nuc_translator;
|
|
|
|
use strict;
|
|
require Exporter;
|
|
use Carp;
|
|
|
|
|
|
our @ISA = qw (Exporter);
|
|
our @EXPORT = qw (get_genetic_codes translate_sequence get_protein reverse_complement);
|
|
|
|
use vars qw ($currentCode %codon_table $init_codon_table_subref $support_Thymine_and_Uracil_subref);
|
|
|
|
|
|
=head1 NAME
|
|
|
|
package Nuc_translator.pm
|
|
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
Nuc_translator::use_specified_genetic_code ("universal");
|
|
|
|
my $nuc_sequence = "atgaaagggccctga";
|
|
|
|
my $translation_frame = 1;
|
|
|
|
my $protein = &translate_sequence($nuc_sequence, $translation_frame);
|
|
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
Methods are provided to translate nucleotide sequences into protein sequences using a specified genetic code.
|
|
|
|
Available genetic codes include universal, Euplotes, Tetrahymena, Candida, Acetabularia
|
|
|
|
For info on these codes, visit:
|
|
|
|
http://golgi.harvard.edu/biolinks/gencode.html
|
|
(https://web.archive.org/web/20040216020103/http://golgi.harvard.edu/biolinks/gencode.html)
|
|
|
|
Methods exported by this package include:
|
|
|
|
translate_sequence()
|
|
|
|
get_protein()
|
|
|
|
reverse_complement()
|
|
|
|
To change the translation code, the following fully qualified method must be used:
|
|
|
|
Nuc_translator::use_specified_genetic_code()
|
|
|
|
|
|
=head1 Methods
|
|
|
|
|
|
=cut
|
|
|
|
|
|
|
|
## See http://golgi.harvard.edu/biolinks/gencode.html
|
|
my %SUPPORTED_GENETIC_CODES = ( Universal => 1,
|
|
|
|
Tetrahymena => 1,
|
|
Acetabularia => 1,
|
|
Ciliate => 1,
|
|
Dasycladacean => 1,
|
|
Hexamita => 1,
|
|
|
|
Candida => 1,
|
|
|
|
Euplotid => 1,
|
|
|
|
SR1_Gracilibacteria => 1,
|
|
|
|
Pachysolen_tannophilus => 1,
|
|
|
|
Mesodinium => 1,
|
|
|
|
Peritrich => 1,
|
|
|
|
|
|
|
|
'Mitochondrial-Vertebrates' => 1,
|
|
'Mitochondrial-Yeast' => 1,
|
|
'Mitochondrial-Invertebrates' => 1,
|
|
"Mitochondrial-Protozoan" => 1,
|
|
"Mitochondrial-Echinoderm" => 1,
|
|
"Mitochondrial-Ascidian" => 1,
|
|
"Mitochondrial-Flatworm" => 1,
|
|
"Mitochondrial-Chlorophycean" => 1,
|
|
"Mitochondrial-Trematode" => 1,
|
|
"Mitochondrial-Scenedesmus_obliquus" => 1,
|
|
"Mitochondrial-Thraustochytrium" => 1,
|
|
"Mitochondrial-Pterobranchia" => 1,
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
=item get_genetic_codes()
|
|
|
|
B<Description:> provides the list of supported genetic codes
|
|
|
|
B<Parameters:>
|
|
|
|
B<Returns:> list of genetic codes
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
####
|
|
sub get_genetic_codes {
|
|
return (sort keys %SUPPORTED_GENETIC_CODES);
|
|
}
|
|
|
|
|
|
|
|
####
|
|
sub show_translation_table {
|
|
my ($genetic_code) = @_;
|
|
|
|
&use_specified_genetic_code($genetic_code);
|
|
|
|
my @nucs = qw(T C A G);
|
|
|
|
|
|
my $translation_line = "";
|
|
my $c1_line = "";
|
|
my $c2_line = "";
|
|
my $c3_line = "";
|
|
|
|
for my $c1 (@nucs) {
|
|
for my $c2 (@nucs) {
|
|
for my $c3 (@nucs) {
|
|
$c1_line .= $c1;
|
|
$c2_line .= $c2;
|
|
$c3_line .= $c3;
|
|
|
|
my $codon = $c1 . $c2 . $c3;
|
|
my $translation = $codon_table{$codon};
|
|
$translation_line .= "$translation";
|
|
}
|
|
}
|
|
}
|
|
|
|
my $translation_table = join("\n", $translation_line, $c1_line, $c2_line, $c3_line) . "\n";
|
|
|
|
return($translation_table);
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item translate_sequence()
|
|
|
|
B<Description:> translates a nucleotide sequence given a specific frame 1-6.
|
|
|
|
B<Parameters:> $nuc_sequence, $frame
|
|
|
|
B<Returns:> $protein_sequence
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
|
|
sub translate_sequence {
|
|
my ($sequence, $frame) = @_;
|
|
|
|
$sequence = uc ($sequence);
|
|
$sequence =~ tr/U/T/;
|
|
my $seq_length = length ($sequence);
|
|
unless ($frame >= 1 and $frame <= 6) {
|
|
confess "Frame $frame is not allowed. Only between 1 and 6";
|
|
}
|
|
|
|
if ($frame > 3) {
|
|
# on reverse strand. Revcomp the sequence and reset the frame
|
|
$sequence = &reverse_complement($sequence);
|
|
if ($frame == 4) {
|
|
$frame = 1;
|
|
}
|
|
elsif ($frame == 5) {
|
|
$frame = 2;
|
|
}
|
|
elsif ($frame == 6) {
|
|
$frame = 3;
|
|
}
|
|
}
|
|
|
|
$sequence =~ tr/T/U/;
|
|
my $start_point = $frame - 1;
|
|
my $protein_sequence;
|
|
for (my $i = $start_point; $i < $seq_length; $i+=3) {
|
|
my $codon = substr($sequence, $i, 3);
|
|
my $amino_acid;
|
|
if (exists($codon_table{$codon})) {
|
|
$amino_acid = $codon_table{$codon};
|
|
} else {
|
|
if (length($codon) == 3) {
|
|
$amino_acid = 'X';
|
|
} else {
|
|
$amino_acid = "";
|
|
}
|
|
}
|
|
$protein_sequence .= $amino_acid;
|
|
}
|
|
return($protein_sequence);
|
|
}
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
=item get_protein()
|
|
|
|
B<Description:> translates nucleotide sequence into a protein sequence. All 3 forward translation frames are tried
|
|
and the first reading frame found to translate without stop codons is returned. If all 3 frames provide stop codons, the protein with the least number of stops is returned.
|
|
|
|
B<Parameters:> $nucleotide_sequence
|
|
|
|
B<Returns:> $protein_sequence
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
|
|
sub get_protein {
|
|
my ($sequence) = @_;
|
|
|
|
## Assume frame 1 unless multiple stops appear.
|
|
my $least_stops = undef();
|
|
my $least_stop_prot_seq = "";
|
|
foreach my $forward_frame (1, 2, 3) {
|
|
my $protein = &translate_sequence($sequence, $forward_frame);
|
|
my $num_stops = &count_stops_in_prot_seq($protein);
|
|
if ($num_stops == 0) {
|
|
return ($protein);
|
|
} else {
|
|
if (!defined($least_stops)) {
|
|
#initialize data
|
|
$least_stops = $num_stops;
|
|
$least_stop_prot_seq = $protein;
|
|
} elsif ($num_stops < $least_stops) {
|
|
$least_stops = $num_stops;
|
|
$least_stop_prot_seq = $protein;
|
|
} else {
|
|
#keeping original $num_stops and $least_stop_prot_seq
|
|
}
|
|
}
|
|
}
|
|
return ($least_stop_prot_seq);
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item reverse_complement()
|
|
|
|
B<Description:> reverse complements a nucleotide sequence
|
|
|
|
B<Parameters:> $nucleotide_sequence
|
|
|
|
B<Returns:> $nucleotide_sequence_rev_comped
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
|
|
sub reverse_complement {
|
|
my($s) = @_;
|
|
my ($rc);
|
|
$rc = reverse ($s);
|
|
$rc =~tr/ACGTacgtyrkmYRKMUu/TGCAtgcarymkRYMKAa/;
|
|
return($rc);
|
|
}
|
|
|
|
|
|
####
|
|
sub count_stops_in_prot_seq {
|
|
my ($prot_seq) = @_;
|
|
chop $prot_seq; #remove trailing stop.
|
|
my $stop_num = 0;
|
|
while ($prot_seq =~ /\*/g) {
|
|
$stop_num++;
|
|
}
|
|
return ($stop_num);
|
|
}
|
|
|
|
####
|
|
sub use_specified_genetic_code {
|
|
my ($special_code) = @_;
|
|
print STDERR "using special genetic code $special_code\n" if $SEE;
|
|
unless ($SUPPORTED_GENETIC_CODES{$special_code}) {
|
|
die "Sorry, $special_code is not currently supported or recognized.\n";
|
|
}
|
|
&$init_codon_table_subref(); ## Restore default universal code. Others are variations on this.
|
|
$currentCode = $special_code;
|
|
|
|
if ($special_code eq "Universal") {
|
|
# already set.
|
|
}
|
|
|
|
elsif ($special_code =~ /^(Tetrahymena|Acetabularia|Ciliate|Dasycladacean|Hexamita)$/) {
|
|
$codon_table{UAA} = "Q";
|
|
$codon_table{UAG} = "Q";
|
|
}
|
|
|
|
elsif ($special_code eq "Candida") {
|
|
$codon_table{CUG} = "S";
|
|
}
|
|
|
|
elsif ($special_code eq "Euplotid") {
|
|
$codon_table{UGA} = "C"; # not *
|
|
}
|
|
|
|
elsif ($special_code eq "SR1_Gracilibacteria") {
|
|
$codon_table{UGA} = "G"; # not *
|
|
}
|
|
|
|
elsif ($special_code eq "Pachysolen_tannophilus") {
|
|
$codon_table{CUG} = "A"; # not L
|
|
}
|
|
elsif ($special_code eq "Mesodinium") {
|
|
$codon_table{UAA} = "Y"; # not *
|
|
$codon_table{UAG} = "Y"; # not *
|
|
}
|
|
|
|
elsif ($special_code eq "Peritrich") {
|
|
$codon_table{UAA} = "E"; # not *
|
|
$codon_table{UAG} = "E"; # not *
|
|
}
|
|
|
|
elsif ($special_code =~ /Mitochondrial/) {
|
|
&_set_mitochondrial_code($special_code);
|
|
}
|
|
|
|
else {
|
|
## shouldn't ever get here anyway.
|
|
confess "Error, code $special_code is not recognized.\n";
|
|
}
|
|
|
|
|
|
&$support_Thymine_and_Uracil_subref();
|
|
|
|
}
|
|
|
|
|
|
####
|
|
sub _set_mitochondrial_code {
|
|
my $code = shift;
|
|
|
|
# see: https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi#SG2
|
|
|
|
if ($code eq "Mitochondrial-Vertebrates") {
|
|
$codon_table{UGA} = "W";
|
|
$codon_table{AUA} = "M";
|
|
$codon_table{AGA} = "*";
|
|
$codon_table{AGG} = "*";
|
|
}
|
|
elsif ($code eq "Mitochondrial-Yeast") {
|
|
$codon_table{AUA} = "M"; # instead of I
|
|
$codon_table{CUU} = $codon_table{CUC} = $codon_table{CUA} = $codon_table{CUG} = "T"; # instead of L
|
|
$codon_table{UGA} = "W"; # instead of *
|
|
}
|
|
elsif ($code eq "Mitochondrial-Invertebrates") {
|
|
$codon_table{AGA} = "S";
|
|
$codon_table{AGG} = "S";
|
|
$codon_table{AUA} = "M";
|
|
$codon_table{UGA} = "W";
|
|
}
|
|
elsif ($code eq "Mitochondrial-Protozoan") {
|
|
$codon_table{UGA} = "W";
|
|
}
|
|
elsif ($code eq "Mitochondrial-Echinoderm" || $code eq "Mitochondrial-Flatworm") {
|
|
$codon_table{AAA} = "N"; # not K
|
|
$codon_table{AGA} = "S"; # not R
|
|
$codon_table{AGG} = "S"; # not R
|
|
$codon_table{UGA} = "W"; # not *
|
|
}
|
|
elsif ($code eq "Mitochondrial-Ascidian") {
|
|
$codon_table{AGA} = "G"; # not R
|
|
$codon_table{AGG} = "G"; # not R
|
|
$codon_table{AUA} = "M"; # not I
|
|
$codon_table{UGA} = "W"; # not *
|
|
}
|
|
|
|
elsif ($code eq "Mitochondrial-Chlorophycean") {
|
|
$codon_table{UAG} = "L"; # not *
|
|
}
|
|
elsif ($code eq "Mitochondrial-Trematode") {
|
|
$codon_table{UGA} = "W"; # not *
|
|
$codon_table{AUA} = "M"; # not I
|
|
$codon_table{AGA} = "S"; # not R
|
|
$codon_table{AGG} = "S"; # not R
|
|
$codon_table{AAA} = "N"; # not K
|
|
}
|
|
elsif ($code eq "Mitochondrial-Scenedesmus_obliquus") {
|
|
$codon_table{UCA} = "*"; # not S
|
|
$codon_table{UAG} = "L"; # not *
|
|
}
|
|
elsif ($code eq "Mitochondrial-Thraustochytrium") {
|
|
$codon_table{UUA} = "*";
|
|
}
|
|
elsif ($code eq "Mitochondrial-Pterobranchia") {
|
|
$codon_table{AGA} = "S"; # not R
|
|
$codon_table{AGG} = "K"; # not R
|
|
$codon_table{UGA} = "W"; # not *
|
|
}
|
|
|
|
|
|
else {
|
|
confess "Sorry, $code hasn't been fully implemented yet.";
|
|
}
|
|
|
|
|
|
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
|
|
####
|
|
sub get_stop_codons {
|
|
my @stop_codons;
|
|
foreach my $codon (keys %codon_table) {
|
|
if ($codon_table{$codon} eq '*') {
|
|
push (@stop_codons, $codon);
|
|
}
|
|
}
|
|
#foreach my $codon (@stop_codons) {
|
|
# $codon =~ tr/U/T/;
|
|
#}
|
|
return (@stop_codons);
|
|
}
|
|
|
|
|
|
BEGIN {
|
|
$init_codon_table_subref = sub {
|
|
print STDERR "initing codon table.\n" if $SEE;
|
|
## Set to Universal Genetic Code
|
|
$currentCode = "universal";
|
|
|
|
%codon_table = ( UUU => 'F',
|
|
UUC => 'F',
|
|
UUA => 'L',
|
|
UUG => 'L',
|
|
|
|
CUU => 'L',
|
|
CUC => 'L',
|
|
CUA => 'L',
|
|
CUG => 'L',
|
|
|
|
AUU => 'I',
|
|
AUC => 'I',
|
|
AUA => 'I',
|
|
AUG => 'M',
|
|
|
|
GUU => 'V',
|
|
GUC => 'V',
|
|
GUA => 'V',
|
|
GUG => 'V',
|
|
|
|
UCU => 'S',
|
|
UCC => 'S',
|
|
UCA => 'S',
|
|
UCG => 'S',
|
|
|
|
CCU => 'P',
|
|
CCC => 'P',
|
|
CCA => 'P',
|
|
CCG => 'P',
|
|
|
|
ACU => 'T',
|
|
ACC => 'T',
|
|
ACA => 'T',
|
|
ACG => 'T',
|
|
|
|
GCU => 'A',
|
|
GCC => 'A',
|
|
GCA => 'A',
|
|
GCG => 'A',
|
|
|
|
UAU => 'Y',
|
|
UAC => 'Y',
|
|
UAA => '*',
|
|
UAG => '*',
|
|
|
|
CAU => 'H',
|
|
CAC => 'H',
|
|
CAA => 'Q',
|
|
CAG => 'Q',
|
|
|
|
AAU => 'N',
|
|
AAC => 'N',
|
|
AAA => 'K',
|
|
AAG => 'K',
|
|
|
|
GAU => 'D',
|
|
GAC => 'D',
|
|
GAA => 'E',
|
|
GAG => 'E',
|
|
|
|
UGU => 'C',
|
|
UGC => 'C',
|
|
UGA => '*',
|
|
UGG => 'W',
|
|
|
|
CGU => 'R',
|
|
CGC => 'R',
|
|
CGA => 'R',
|
|
CGG => 'R',
|
|
|
|
AGU => 'S',
|
|
AGC => 'S',
|
|
AGA => 'R',
|
|
AGG => 'R',
|
|
|
|
GGU => 'G',
|
|
GGC => 'G',
|
|
GGA => 'G',
|
|
GGG => 'G'
|
|
|
|
);
|
|
};
|
|
|
|
|
|
$support_Thymine_and_Uracil_subref = sub {
|
|
|
|
my @codons = keys %codon_table;
|
|
foreach my $codon (@codons) {
|
|
if ($codon =~ /U/) {
|
|
my $aa = $codon_table{$codon};
|
|
my $T_codon = $codon;
|
|
$T_codon =~ s/U/T/g;
|
|
$codon_table{$T_codon} = $aa;
|
|
}
|
|
}
|
|
};
|
|
|
|
|
|
# init codon table, using uracil codons
|
|
&$init_codon_table_subref();
|
|
|
|
# update codon table to also support thymine
|
|
&$support_Thymine_and_Uracil_subref();
|
|
}
|
|
|
|
|
|
|
|
|
|
####
|
|
sub run_test() {
|
|
my %expected_translation_tables = (
|
|
'Universal' => "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
|
|
'Mitochondrial-Vertebrates' => "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
|
|
'Mitochondrial-Yeast' => "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
|
|
'Mitochondrial-Protozoan' => "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
|
|
'Mitochondrial-Invertebrates' => "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG",
|
|
'Ciliate' => 'FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
|
|
'Mitochondrial-Echinoderm' => "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
|
|
'Euplotid' => "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
|
|
'Candida' => "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
|
|
'Mitochondrial-Ascidian' => "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG",
|
|
'Mitochondrial-Chlorophycean' => "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
|
|
'Mitochondrial-Trematode' => "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
|
|
'Mitochondrial-Scenedesmus_obliquus' => "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
|
|
'Mitochondrial-Thraustochytrium' => "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
|
|
'Mitochondrial-Pterobranchia' => "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG",
|
|
'SR1_Gracilibacteria' => "FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
|
|
'Pachysolen_tannophilus' => "FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
|
|
'Mesodinium' => "FFLLSSSSYYYYCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
|
|
'Peritrich' => "FFLLSSSSYYEECC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
|
|
);
|
|
|
|
my $exit_code = 0;
|
|
|
|
foreach my $genetic_code (sort keys %expected_translation_tables) {
|
|
my $expected_translation = $expected_translation_tables{$genetic_code};
|
|
|
|
my $reconstructed_translation_table = &show_translation_table($genetic_code);
|
|
my @pts = split(/\n/, $reconstructed_translation_table);
|
|
my $trans_table = shift @pts;
|
|
|
|
if ($trans_table eq $expected_translation) {
|
|
print "$genetic_code\tOK\n";
|
|
}
|
|
else {
|
|
print "\n$genetic_code\tERROR:\n"
|
|
. "$expected_translation (EXPECTED)\n"
|
|
. "$trans_table (Computed)\n\n";
|
|
|
|
$exit_code = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
exit($exit_code);
|
|
|
|
}
|
|
|
|
unless (caller) {
|
|
|
|
my $genetic_code = $ARGV[0];
|
|
|
|
if ($genetic_code eq "TEST") {
|
|
&run_test();
|
|
}
|
|
|
|
if ($genetic_code) {
|
|
print "\nTranslation table for $genetic_code:\n\n";
|
|
print &show_translation_table($genetic_code) . "\n\n";
|
|
}
|
|
else {
|
|
print "Supported genetic codes:\n\n" . join("\n", &get_genetic_codes()) . "\n\n"
|
|
. "Choose one to show translation table.\n\n";
|
|
}
|
|
|
|
exit(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
1; #end of module
|
|
|
|
|
|
|
|
|