biyelunwen/99.scripts/trinity_utils/PerlLib/Exons_to_geneobj.pm

212 lines
6.2 KiB
Perl

package main;
our $SEE;
package Exons_to_geneobj;
use strict;
use Longest_orf;
use Gene_obj;
## No reason to instantiate. Use methods, fully qualified.
## allow for partial ORFS (missing start or stop codon in longest ORF)
###################
## Public method ##
###################
sub create_gene_obj {
my ($exons_href, $sequence_ref, $partial_info_href) = @_;
unless (ref $sequence_ref) {
die "Error, need reference to sequence as input parameter.\n";
}
unless (ref $partial_info_href) {
$partial_info_href = {};
}
## exons_ref should be end5's keyed to end3's for all exons.
my ($gene_struct_mod, $cdna_seq) = &get_cdna_seq ($exons_href, $sequence_ref);
my $cdna_seq_length = length $cdna_seq;
my $long_orf_obj = new Longest_orf();
# establish long orf finding parameters.
$long_orf_obj->forward_strand_only();
if ($partial_info_href->{"5prime"}) {
print "Exons_to_geneobj: Allowing 5' partials\n" if $SEE;
$long_orf_obj->allow_5prime_partials();
}
$long_orf_obj->allow_3prime_partials(); ## Allow this by default.
$long_orf_obj->get_longest_orf($cdna_seq);
my ($end5, $end3) = $long_orf_obj->get_end5_end3();
print "CDS: $end5, $end3\n" if $SEE;
my $gene_obj = &create_gene ($gene_struct_mod, $end5, $end3);
## Make adjustments if partial
my @exons = $gene_obj->get_exons();
my $first_exon = $exons[0];
my $last_exon = $exons[$#exons];
my $need_refine_flag = 0;
if ($partial_info_href->{"5prime"}) {
$gene_obj->set_5prime_partial(1);
my $cds_exon = $first_exon->get_CDS_obj();
if (ref $cds_exon) {
$cds_exon->{phase} = 0;
my $diff_coords = abs ($cds_exon->{end5} - $first_exon->{end5});
if ($diff_coords > 0 && $diff_coords < 3) {
$cds_exon->{end5} = $first_exon->{end5};
$need_refine_flag = 1;
## update phase of first exon
if ($diff_coords == 1) {
$cds_exon->{phase} = 2;
} elsif ($diff_coords == 2) {
$cds_exon->{phase} = 1;
}
}
}
}
if ($partial_info_href->{"3prime"}) {
$gene_obj->set_3prime_partial(1);
my $cds_exon = $last_exon->get_CDS_obj();
if (ref $cds_exon) {
my $diff_coords = abs ($cds_exon->{end3} - $last_exon->{end3});
if ($diff_coords > 0 && $diff_coords < 3) {
$cds_exon->{end3} = $last_exon->{end3};
$need_refine_flag = 1;
}
}
}
## Set the phase attribute for each cds exon:
my @cds_exons;
foreach my $exon ($gene_obj->get_exons()) {
if (my $cds = $exon->get_CDS_obj()) {
if (ref $cds) {
push (@cds_exons, $cds);
}
}
}
if (@cds_exons) {
my $cds_length = 0;
my $first_cds = shift @cds_exons;
my $phase = $first_cds->{phase};
unless ($phase) {
## didn't set it for non 5' partials yet.
$phase = $first_cds->{phase} = 0;
}
$cds_length -= $phase;
$cds_length += $first_cds->length();
foreach my $following_cds (@cds_exons) {
$following_cds->{phase} = $cds_length % 3;
$cds_length += $following_cds->length();
}
}
if ($need_refine_flag) {
$gene_obj->refine_gene_object();
}
return ($gene_obj);
}
########################
## Private methods #####
########################
####
sub get_cdna_seq {
my ($gene_struct, $assembly_seq_ref) = @_;
my (@end5s) = sort {$a<=>$b} keys %$gene_struct;
my $strand = "?";
foreach my $end5 (@end5s) {
my $end3 = $gene_struct->{$end5};
if ($end5 == $end3) { next;}
$strand = ($end5 < $end3) ? '+':'-';
last;
}
if ($strand eq "?") {
print Dumper ($gene_struct);
die "ERROR: I can't determine what orientation the cDNA is in!\n";
}
print NOTES "strand: $strand\n";
my $cdna_seq;
my $gene_struct_mod = {strand=>$strand,
exons=>[]}; #ordered lend->rend coordinate listing.
foreach my $end5 (@end5s) {
#print $end5;
my $end3 = $gene_struct->{$end5};
my ($coord1, $coord2) = sort {$a<=>$b} ($end5, $end3);
my $exon_seq = substr ($$assembly_seq_ref, $coord1 - 1, ($coord2 - $coord1 + 1));
$cdna_seq .= $exon_seq;
push (@{$gene_struct_mod->{exons}}, [$coord1, $coord2]);
}
if ($strand eq '-') {
$cdna_seq = reverse_complement ($cdna_seq);
}
return ($gene_struct_mod, $cdna_seq);
}
####
sub create_gene {
my ($gene_struct_mod, $cds_pointer_lend, $cds_pointer_rend) = @_;
my $strand = $gene_struct_mod->{strand};
my @exons = @{$gene_struct_mod->{exons}};
if ($strand eq '-') {
@exons = reverse (@exons);
}
my $mRNA_pointer_lend = 1;
my $mRNA_pointer_rend = 0;
my $gene_obj = new Gene_obj();
foreach my $coordset_ref (@exons) {
my ($coord1, $coord2) = @$coordset_ref;
my ($end5, $end3) = ($strand eq '+') ? ($coord1, $coord2) : ($coord2, $coord1);
my $exon_obj = new mRNA_exon_obj($end5, $end3);
my $exon_length = ($coord2 - $coord1 + 1);
$mRNA_pointer_rend = $mRNA_pointer_lend + $exon_length - 1;
## see if cds is within current cDNA range.
if ( $cds_pointer_rend >= $mRNA_pointer_lend && $cds_pointer_lend <= $mRNA_pointer_rend) { #overlap
my $diff = $cds_pointer_lend - $mRNA_pointer_lend;
my $delta_lend = ($diff >0) ? $diff : 0;
$diff = $mRNA_pointer_rend - $cds_pointer_rend;
my $delta_rend = ($diff > 0) ? $diff : 0;
if ($strand eq '+') {
$exon_obj->add_CDS_exon_obj($end5 + $delta_lend, $end3 - $delta_rend);
} else {
$exon_obj->add_CDS_exon_obj($end5 - $delta_lend, $end3 + $delta_rend);
}
}
$gene_obj->add_mRNA_exon_obj($exon_obj);
$mRNA_pointer_lend = $mRNA_pointer_rend + 1;
}
$gene_obj->refine_gene_object();
#$gene_obj->{strand} = $strand;
print $gene_obj->toString() if $SEE;
return ($gene_obj);
}
sub reverse_complement {
my($s) = @_;
my ($rc);
$rc = reverse ($s);
$rc =~tr/ACGTacgtyrkmYRKM/TGCAtgcarymkRYMK/;
return($rc);
}
1; #EOM