1875 lines
49 KiB
Perl
1875 lines
49 KiB
Perl
#!/usr/local/bin/perl
|
|
|
|
##########################
|
|
### Class CDNA_alignment
|
|
##########################
|
|
package main;
|
|
our $SEE;
|
|
|
|
=head1 NAME
|
|
|
|
CDNA::CDNA_alignment
|
|
|
|
=cut
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
This module provides an object specification for storing and manipulating cDNA alignments. The alignment coordinates along the genomic sequence are given along with a genomic sequence. Splice sites are validated and the spliced orientation is determined.
|
|
|
|
=cut
|
|
|
|
|
|
|
|
package CDNA::CDNA_alignment;
|
|
use strict;
|
|
use Exons_to_geneobj;
|
|
use Gene_obj;
|
|
use CDNA::Alignment_segment;
|
|
use Carp qw (cluck confess croak);
|
|
use GFF_maker;
|
|
|
|
## Global Vars
|
|
our $ALLOW_ATAC_splice_pairs = 1; # ON by default; turn it off if you prefer.
|
|
|
|
##
|
|
|
|
|
|
=over 4
|
|
|
|
=item new()
|
|
|
|
B<Description:> Instantiates a new CDNA::CDNA_alignment object.
|
|
|
|
B<Parameters:> $cdna_length, $Alignment_obj_aref, $sequence_sref
|
|
|
|
B<$cdna_length> is the length of the complete cDNA sequence.
|
|
|
|
B<$Alignment_obj_aref> is a reference to a list of CDNA::Alignment_segment objects like so:
|
|
|
|
$Alignment_obj_aref = \@Alignment_obj_list
|
|
|
|
B<$sequence_sref> should be a reference to the string containing the genomic sequence like so:
|
|
|
|
$sequence = "gatc.....";
|
|
|
|
$sequence_sref = \$sequence;
|
|
|
|
B<Returns:> $obj_ref
|
|
|
|
returns a reference to a CDNA_alignment object. This object is validated requiring consensus splice sites, and the spliced orientation of the cDNA is determined based on the validating orientation.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub new {
|
|
my $packagename = shift;
|
|
my ($cdna_length, $Alignment_obj_aref, $sequence_ref) = @_;
|
|
unless (@$Alignment_obj_aref) {
|
|
die "No alignment segments available to create alignment object.\n";
|
|
}
|
|
|
|
my $self = {
|
|
acc => undef(), # cDNA accession.
|
|
title =>undef(), # com_name, title, header, whatever you want to call the cdna.
|
|
genomic_seq => $sequence_ref,
|
|
genome_acc => undef,
|
|
alignment_segs=>$Alignment_obj_aref,
|
|
orientation => undef(),
|
|
lend=>undef(),
|
|
rend=>undef(),
|
|
length=>0, # alignment span length.
|
|
num_aligned_nts => 0, #number of cDNA nucleotides found in alignment
|
|
cdna_length => $cdna_length, # the length of the cDNA sequence.
|
|
avg_per_id => 0, # the average percent identity of this alignment.
|
|
error_flag=>0, #default w/o errors. Used to indicate non-consensus splice sites.
|
|
spliced_orientation => '?', # [+-?] depending on validating consenus splice sites; relative to genomic sequence orientation.
|
|
num_segments=>0,
|
|
percent_cdna_aligned => 0, # provides the percentage of the cDNA length that is in the alignment.
|
|
is_fli => 0 #indicates whether the cDNA is a full-length insert (ie. expected to be complete).
|
|
};
|
|
bless ($self, $packagename);
|
|
|
|
## Convert alignment to object form:
|
|
$self->process_alignment();
|
|
|
|
if ($self->{num_segments} > 1 && (ref $sequence_ref)) {
|
|
$self->identify_splice_junctions($sequence_ref);
|
|
}
|
|
return ($self);
|
|
}
|
|
|
|
sub process_alignment {
|
|
my $self = shift;
|
|
$self->determine_alignment_attributes();
|
|
my @alignment_segments = $self->get_alignment_segments();
|
|
for (my $i = 0; $i <= $#alignment_segments; $i++) {
|
|
my $segment = $alignment_segments[$i];
|
|
if ($#alignment_segments == 0) { #single segment in alignment
|
|
$segment->set_type("single");
|
|
} elsif ($i == 0) {
|
|
$segment->set_type("first");
|
|
} elsif ($i == $#alignment_segments) {
|
|
$segment->set_type("last");
|
|
} else { #must be internal
|
|
$segment->set_type("internal");
|
|
}
|
|
}
|
|
$self->set_num_segments($#alignment_segments + 1);
|
|
$self->verify_contiguity();
|
|
|
|
}
|
|
|
|
sub identify_splice_junctions {
|
|
my $self = shift;
|
|
my $sequence_ref = shift;
|
|
my $orientation = $self->get_orientation();
|
|
|
|
my $opposite_orientation = ($orientation eq '+') ? '-' : '+';
|
|
|
|
## Since some cDNAs are supplied in the opposite orientation, the splice sites may be on the reverse strand.
|
|
## In this case, we must change the orientation
|
|
my $error_flag = 0;
|
|
my $error_text = "";
|
|
my $validating_orientation = $orientation; #initialize.
|
|
my %error_lengths; #used to find best orientation (least errors)
|
|
foreach my $orient ($orientation, $opposite_orientation) {
|
|
$error_text = $self->validate_splice_junctions($orient, $sequence_ref);
|
|
$error_lengths{$orient} = length $error_text;
|
|
if ($error_text) {
|
|
print "ERRORS in Splice sites given orientation $orient:\n$error_text\n" if $::SEE;
|
|
unless ($error_flag) { print "Trying other strand might help?\n" if $::SEE;}
|
|
$error_text = ""; #rest
|
|
$error_flag = 1;
|
|
} else {
|
|
$error_flag = 0;
|
|
$validating_orientation = $orient;
|
|
last;
|
|
}
|
|
}
|
|
if ($error_flag) {
|
|
print "Sorry, still contains problematic splice sites:\n$error_text\n" if $::SEE;
|
|
print "Setting error flag for this alignment\n" if $::SEE;
|
|
$self->set_error_flag("Splice site validations failed");
|
|
print "ERROR: SETTING ERROR_FLAG\n" if $::SEE;
|
|
## revalidate splice sites using orientation that generated the least errors:
|
|
my @orients = sort {$error_lengths{$a}<=>$error_lengths{$b}} ('+', '-');
|
|
my $best_orient = shift @orients;
|
|
$self->validate_splice_junctions($best_orient, $sequence_ref); #required for appropriate token printing.
|
|
} else {
|
|
$self->set_spliced_orientation($validating_orientation);
|
|
}
|
|
}
|
|
|
|
sub validate_splice_junctions {
|
|
my $self = shift;
|
|
my ($orient) = shift;
|
|
my $sequence_ref = shift;
|
|
my $errors = "";
|
|
my (@splice_boundary_pairs) = &get_consensus_splice_sites($orient);
|
|
my @segments = $self->get_alignment_segments();
|
|
|
|
my $num_segments = scalar (@segments);
|
|
if ($num_segments == 1) {
|
|
## no introns
|
|
die "Error, trying to validate splice junctions for single segment alignment! ";
|
|
}
|
|
|
|
## analyze introns
|
|
for (my $i = 1; $i <= $#segments; $i++) {
|
|
my ($prev_segment, $curr_segment) = ($segments[$i-1], $segments[$i]);
|
|
|
|
my ($prev_lend, $prev_rend) = $prev_segment->get_coords();
|
|
my ($curr_lend, $curr_rend) = $curr_segment->get_coords();
|
|
|
|
my $splice_chars_left = uc substr($$sequence_ref, $prev_rend, 2);
|
|
my $splice_chars_right = uc substr($$sequence_ref, $curr_lend -2 -1, 2);
|
|
|
|
$prev_segment->set_right_splice_site_chars($splice_chars_left);
|
|
$curr_segment->set_left_splice_site_chars($splice_chars_right);
|
|
|
|
## check boundaries:
|
|
my $splice_pair_OK = 0;
|
|
|
|
CONSENSUS_PAIR:
|
|
foreach my $consensus_pair (@splice_boundary_pairs) {
|
|
my ($left_chars, $right_chars) = @$consensus_pair;
|
|
|
|
if ($left_chars eq $splice_chars_left && $right_chars eq $splice_chars_right) {
|
|
# found consensus
|
|
## further validate any AT-AC for donor site extended consensus
|
|
my ($intron_lend, $intron_rend) = ($prev_rend + 1, $curr_lend - 1);
|
|
if ($ALLOW_ATAC_splice_pairs) {
|
|
if ( ( $orient eq '+' && $left_chars eq 'AT')
|
|
||
|
|
($orient eq '-' && $right_chars eq 'AT') ) {
|
|
unless (&_validates_AT_AC_donor_extended_consensus($orient, $intron_lend, $intron_rend, $sequence_ref)) {
|
|
next CONSENSUS_PAIR;
|
|
}
|
|
}
|
|
}
|
|
|
|
## got consensus splice pair
|
|
$splice_pair_OK = 1;
|
|
## left and right local are relative to intron
|
|
# in methods, they reference segment
|
|
$prev_segment->set_right_splice_junction(1);
|
|
$curr_segment->set_left_splice_junction(1);
|
|
last;
|
|
}
|
|
}
|
|
unless ($splice_pair_OK) {
|
|
|
|
$errors .= "nonconsensus splice pair [$splice_chars_left-$splice_chars_right]\n";
|
|
|
|
## validate boundaries separately.
|
|
## this is useful if only one site is nonconsensus
|
|
foreach my $consensus_pair (@splice_boundary_pairs) {
|
|
my ($left_chars, $right_chars) = @$consensus_pair;
|
|
if ($left_chars eq $splice_chars_left) {
|
|
$prev_segment->set_right_splice_junction(1);
|
|
}
|
|
if ($right_chars eq $splice_chars_right) {
|
|
$curr_segment->set_left_splice_junction(1);
|
|
}
|
|
}
|
|
|
|
## make nonconsensus lower case
|
|
unless ($prev_segment->has_right_splice_junction()) {
|
|
$prev_segment->set_right_splice_site_chars( lc $splice_chars_left);
|
|
}
|
|
unless ($curr_segment->has_left_splice_junction()) {
|
|
$curr_segment->set_left_splice_site_chars( lc $splice_chars_right);
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
return ($errors);
|
|
}
|
|
|
|
|
|
sub verify_contiguity {
|
|
my $self = shift;
|
|
my $num_segments = $self->get_num_segments();
|
|
my $orient = $self->get_orientation();
|
|
if ($num_segments > 1) {
|
|
my @alignment_segments = $self->get_alignment_segments();
|
|
for (my $i = 1; $i <= $#alignment_segments; $i++) {
|
|
my $prev_seg = $alignment_segments[$i-1];
|
|
my $curr_seg = $alignment_segments[$i];
|
|
my $error_flag = 0;
|
|
my $diff = $curr_seg->{mlend} - $prev_seg->{mrend};
|
|
if ($orient eq '+' && $diff != 1) {
|
|
$error_flag = 1;
|
|
}elsif ($orient eq '-' && $diff != -1) {
|
|
$error_flag = 1;
|
|
}
|
|
if ($error_flag) {
|
|
$self->set_error_flag("Incontiguous alignment");
|
|
return();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
sub get_consensus_splice_sites () {
|
|
|
|
my $orientation = shift;
|
|
|
|
my @pairs;
|
|
|
|
if ($orientation eq '+') {
|
|
|
|
## Forward pairs
|
|
# GT-AG
|
|
# GC-AG
|
|
# AT-AC
|
|
|
|
push (@pairs, ['GT', 'AG'], ['GC', 'AG']);
|
|
|
|
if ($ALLOW_ATAC_splice_pairs) {
|
|
push (@pairs, ['AT', 'AC']);
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
## Rev Comp of above:
|
|
# CT-AC
|
|
# CT-GC
|
|
# GT-AT
|
|
|
|
push (@pairs, ['CT', 'AC'], ['CT', 'GC']);
|
|
|
|
if ($ALLOW_ATAC_splice_pairs) {
|
|
push (@pairs, ['GT', 'AT']);
|
|
}
|
|
}
|
|
|
|
|
|
return (@pairs);
|
|
}
|
|
|
|
|
|
####
|
|
sub _validates_AT_AC_donor_extended_consensus {
|
|
my ($orient, $intron_lend, $intron_rend, $sequence_ref) = @_;
|
|
|
|
my $forward_consensus = 'ATATCC';
|
|
my $reverse_consensus = 'GGATAT';
|
|
|
|
if ($orient eq '+') {
|
|
my $long_donor_seq = uc substr($$sequence_ref, $intron_lend - 1, 6);
|
|
if ($long_donor_seq eq $forward_consensus) {
|
|
return (1);
|
|
}
|
|
}
|
|
elsif ($orient eq '-') {
|
|
my $long_donor_seq = uc substr($$sequence_ref, $intron_rend -6, 6);
|
|
if ($long_donor_seq eq $reverse_consensus) {
|
|
return (1);
|
|
}
|
|
}
|
|
|
|
## got here, didn't fit long consensus
|
|
return (0);
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
sub set_orientation {
|
|
my $self = shift;
|
|
my $orientation = shift;
|
|
$self->{orientation} = $orientation;
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item get_aligned_orientation()
|
|
|
|
B<Description:> Provides the orientation of the incoming cDNA alignment.
|
|
|
|
B<Parameters:> none.
|
|
|
|
B<Returns:> [+-]
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub get_aligned_orientation {
|
|
my $self = shift;
|
|
return ($self->{orientation});
|
|
}
|
|
|
|
sub get_orientation { # deprecated in favor of get_aligned_orientation()
|
|
my $self = shift;
|
|
return ($self->get_aligned_orientation());
|
|
}
|
|
|
|
|
|
sub set_title {
|
|
my $self = shift;
|
|
my $title = shift;
|
|
$self->{title} = $title;
|
|
}
|
|
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
=item get_title()
|
|
|
|
B<Description:> Provides the title for the cDNA, generally the header from a fasta file
|
|
|
|
B<Parameters:> none.
|
|
|
|
B<Returns:> string or undef
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
|
|
|
|
sub get_title {
|
|
my $self = shift;
|
|
return ($self->{title});
|
|
}
|
|
|
|
|
|
|
|
sub set_coords {
|
|
my $self = shift;
|
|
my ($lend, $rend) = @_;
|
|
$self->{lend} = $lend;
|
|
$self->{rend} = $rend;
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item get_coords()
|
|
|
|
B<Description:> Provides the coordinate span for the alignment along the genomic sequence. Orientation is not implied, coordinates always provided relevant to the forward orientation. Use the get_orientation method to determine the strand.
|
|
|
|
B<Parameters:> none.
|
|
|
|
B<Returns:> $lend, $rend
|
|
|
|
$lend, $rend are integer values indicating the beginning and end coordinates of the alignment on the genomic sequence. The genomic sequence begins at position 1.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
|
|
sub get_coords {
|
|
my $self = shift;
|
|
return ($self->{lend}, $self->{rend});
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
=item get_mcoords()
|
|
|
|
B<Description:> returns the cDNA sequence coordinates corresponding to the lend and rend returned by get_coords()
|
|
|
|
ie. my ($lend, $rend) = $alignment->get_coords(); // always in forward orientation (lend < rend)
|
|
|
|
my ($mlend, $mrend) = $alignment->get_mcoords();
|
|
|
|
$mlend corresponds to $lend
|
|
|
|
$mrend corresponds to $rend
|
|
|
|
|
|
B<Parameters:>none
|
|
|
|
B<Returns:> ($mlend, $mrend)
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub get_mcoords {
|
|
my $self = shift;
|
|
|
|
my @alignment_segments = $self->get_alignment_segments();
|
|
|
|
my $leftmost_seg = shift @alignment_segments;
|
|
my $rightmost_seg = pop @alignment_segments;
|
|
|
|
unless ($rightmost_seg) {
|
|
$rightmost_seg = $leftmost_seg; #must only be one in which case left = right
|
|
}
|
|
|
|
my ($mlend, $whatever1) = $leftmost_seg->get_mcoords();
|
|
my ($whatever2, $mrend) = $rightmost_seg->get_mcoords();
|
|
|
|
return ($mlend, $mrend);
|
|
}
|
|
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
=item get_intron_coords()
|
|
|
|
B<Description:> returns list of intron coordinates
|
|
|
|
B<Parameters:> none
|
|
|
|
B<Returns:> ( [intron_lend, intron_rend], [intron_lend, intron_rend], ...)
|
|
|
|
|
|
lend always less than rend
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub get_intron_coords {
|
|
my $self = shift;
|
|
|
|
my @segments = $self->get_alignment_segments();
|
|
|
|
my @seg_coords;
|
|
foreach my $segment (@segments) {
|
|
my ($exon_lend, $exon_rend) = sort {$a<=>$b} $segment->get_coords();
|
|
|
|
push (@seg_coords, [$exon_lend, $exon_rend]);
|
|
}
|
|
|
|
my @intron_coords;
|
|
|
|
if (scalar (@seg_coords) >= 2) {
|
|
## actually have introns
|
|
@seg_coords = sort {$a->[0]<=>$b->[0]} @seg_coords;
|
|
|
|
my $curr_seg = shift @seg_coords;
|
|
while (@seg_coords) {
|
|
my $next_seg = shift @seg_coords;
|
|
|
|
my ($curr_lend, $curr_rend) = @$curr_seg;
|
|
|
|
my ($next_lend, $next_rend) = @$next_seg;
|
|
|
|
my ($intron_lend, $intron_rend) = ($curr_rend + 1, $next_lend - 1);
|
|
|
|
push (@intron_coords, [$intron_lend, $intron_rend]);
|
|
|
|
$curr_seg = $next_seg;
|
|
}
|
|
}
|
|
|
|
|
|
return (@intron_coords);
|
|
}
|
|
|
|
|
|
sub determine_alignment_attributes {
|
|
my $self = shift;
|
|
my @alignment_segments = $self->get_alignment_segments();
|
|
my @coords;
|
|
my $orientation;
|
|
my $num_nts_matched = 0;
|
|
my $per_id_x_length = 0;
|
|
foreach my $segment (@alignment_segments) {
|
|
my ($lend, $rend) = $segment->get_coords();
|
|
my ($mlend, $mrend) = $segment->get_mcoords();
|
|
my $orient = $segment->get_orientation();
|
|
if (!$orientation && $orient =~ /[+-]/) {
|
|
$orientation = $orient;
|
|
}
|
|
my $seg_length;
|
|
if ($mrend && $mlend) {
|
|
$seg_length = abs($mrend - $mlend) + 1;
|
|
} else {
|
|
$seg_length = abs ($rend - $lend) + 1;
|
|
}
|
|
$num_nts_matched += $seg_length;
|
|
my $per_id = $segment->get_per_id();
|
|
$per_id_x_length += $per_id * $seg_length;
|
|
push (@coords, $lend, $rend);
|
|
}
|
|
$self->set_orientation($orientation);
|
|
foreach my $segment (@alignment_segments) {
|
|
$segment->set_orientation($orientation);
|
|
}
|
|
@coords = sort {$a<=>$b} @coords;
|
|
my $lend = shift @coords;
|
|
my $rend = pop @coords;
|
|
$self->set_coords($lend, $rend);
|
|
$self->{length} = abs ($rend - $lend) + 1;
|
|
$self->{num_aligned_nts} = $num_nts_matched;
|
|
$self->{avg_per_id} = $per_id_x_length / $num_nts_matched;
|
|
if ($self->{avg_per_id} > 100) { die "Error, can't have average per_id > 100%\n";}
|
|
if ($self->{cdna_length} > 0) {
|
|
$self->{percent_cdna_aligned} = $num_nts_matched / $self->{cdna_length} * 100;
|
|
}
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item get_alignment_segments()
|
|
|
|
B<Description:> Returns the alignment segments which comprise an alignment object, ordered by left coordinate position.
|
|
|
|
B<Parameters:> none
|
|
|
|
B<Returns:> @alignment_segments
|
|
|
|
@alignment_segments is a list of CDNA::Alignment_segment objects (see below).
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
sub get_alignment_segments() {
|
|
my $self = shift;
|
|
return (sort {$a->{lend}<=>$b->{lend}} @{$self->{alignment_segs}});
|
|
}
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
=item add_alignment_segment()
|
|
|
|
B<Description:> Adds a CDNA::Alignment_segment object to the list of segments of this CDNA_alignment object.
|
|
|
|
B<Parameters:> CDNA::Alignment_segment object.
|
|
|
|
B<Returns:> none.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub add_alignment_segment() {
|
|
my $self = shift;
|
|
my $segment = shift;
|
|
push (@{$self->{alignment_segs}}, $segment);
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item delete_all_segments()
|
|
|
|
B<Description:> Empties the current list of Alignment_segment objects.
|
|
|
|
B<Parameters:> None.
|
|
|
|
B<Returns:> None.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
|
|
|
|
sub delete_all_segments () {
|
|
my $self = shift;
|
|
@{$self->{alignment_segs}} = (); #empty the array
|
|
}
|
|
|
|
|
|
sub set_error_flag () {
|
|
my $self = shift;
|
|
my $error = shift;
|
|
if ($self->{error_flag}) {
|
|
$self->{error_flag} .= $error;
|
|
} else {
|
|
$self->{error_flag} = $error;
|
|
}
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item get_error_flag()
|
|
|
|
B<Description:> Provides the status of the aligments validation.
|
|
|
|
B<Parameters:> none.
|
|
|
|
B<Returns:> [error_text|0]
|
|
|
|
A text string containing the error is returned if an error exists. Otherwise, zero is returned indicating the lack of errors.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub get_error_flag () {
|
|
my $self = shift;
|
|
return ($self->{error_flag});
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item toString()
|
|
|
|
B<Description:> Returns an alignment as lines of text providing coordinate information.
|
|
|
|
B<Parameters:> none.
|
|
|
|
B<Returns:> alignment_string
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub toString() {
|
|
my $self = shift;
|
|
my $text = "\n\nAlignment: orientation: " . $self->{orientation} . "\n"
|
|
. "coords: " . $self->{lend} . "-" . $self->{rend} . "\n";
|
|
|
|
my @segments = $self->get_alignment_segments();
|
|
foreach my $segment (@segments) {
|
|
$text .= $segment->toString();
|
|
}
|
|
return ($text);
|
|
}
|
|
|
|
|
|
####
|
|
sub to_GFF3_format {
|
|
my $self = shift;
|
|
my %preferences = @_;
|
|
|
|
|
|
my $seq_id = $preferences{seq_id} || $self->{genome_acc} || confess "Need seq_id in preferences, or set genome_acc attribute of obj";
|
|
my $match_id = $preferences{match_id} or confess "Error, require match_id attribute";
|
|
my $source = $preferences{source} || "PASA";
|
|
|
|
my $orientation = $self->get_orientation();
|
|
|
|
|
|
|
|
my @alignment_segments = $self->get_alignment_segments();
|
|
my $acc = $self->get_acc() or confess "Error, accession for cdna_alignment is not available";
|
|
|
|
|
|
my $GFF3_text = "";
|
|
|
|
foreach my $alignment_segment (@alignment_segments) {
|
|
my ($genome_lend, $genome_rend) = $alignment_segment->get_coords();
|
|
my ($cdna_lend, $cdna_rend) = sort {$a<=>$b} $alignment_segment->get_mcoords();
|
|
|
|
|
|
my $gff_struct = { seq_id => $seq_id,
|
|
source => $source,
|
|
type => "cDNA_match",
|
|
lend => $genome_lend,
|
|
rend => $genome_rend,
|
|
strand => $orientation,
|
|
attributes => "ID=$match_id; Target=$acc $cdna_lend $cdna_rend +",
|
|
};
|
|
|
|
if (my $per_id = $alignment_segment->get_per_id()) {
|
|
$gff_struct->{score} = $per_id;
|
|
}
|
|
|
|
$GFF3_text .= &GFF_maker::get_GFF_line($gff_struct);
|
|
|
|
}
|
|
|
|
return ($GFF3_text);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
####
|
|
sub to_GTF_format {
|
|
my $self = shift;
|
|
my %preferences = @_;
|
|
|
|
|
|
my $seq_id = $preferences{seq_id} || $self->{genome_acc} || confess "Need seq_id in preferences, or set genome_acc attribute of obj";
|
|
|
|
my $gene_id = $preferences{gene_id} || confess "Need gene_id in preferences";
|
|
my $transcript_id = $preferences{transcript_id} || $self->get_acc();
|
|
|
|
|
|
my $source = $preferences{source} || "PASA";
|
|
|
|
my $orientation = $self->get_orientation();
|
|
|
|
|
|
my ($trans_lend, $trans_rend) = sort {$a<=>$b} $self->get_coords();
|
|
|
|
my @alignment_segments = $self->get_alignment_segments();
|
|
my $acc = $self->get_acc() or confess "Error, accession for cdna_alignment is not available";
|
|
|
|
|
|
my $gtf_text = join("\t", ( $seq_id,
|
|
$source,
|
|
"transcript",
|
|
$trans_lend,
|
|
$trans_rend,
|
|
".",
|
|
$orientation,
|
|
".",
|
|
"gene_id \"$gene_id\"; transcript_id \"$transcript_id\";")
|
|
) . "\n";
|
|
|
|
|
|
foreach my $alignment_segment (@alignment_segments) {
|
|
my ($genome_lend, $genome_rend) = $alignment_segment->get_coords();
|
|
my ($cdna_lend, $cdna_rend) = sort {$a<=>$b} $alignment_segment->get_mcoords();
|
|
|
|
|
|
my $per_id = $alignment_segment->get_per_id() || ".";
|
|
|
|
$gtf_text .= join("\t", ( $seq_id,
|
|
$source,
|
|
"exon",
|
|
$genome_lend,
|
|
$genome_rend,
|
|
$per_id,
|
|
$orientation,
|
|
".",
|
|
"gene_id \"$gene_id\"; transcript_id \"$transcript_id\";")
|
|
) . "\n";
|
|
|
|
|
|
|
|
}
|
|
|
|
return ($gtf_text);
|
|
|
|
}
|
|
|
|
|
|
sub provide_cdna_segment_coords () {
|
|
my $self = shift;
|
|
my @segments = $self->get_alignment_segments();
|
|
my $coord_mapper = $self->{genome_cdna_coord_mapper};
|
|
foreach my $segment (@segments) {
|
|
my ($lend, $rend) = $segment->get_coords();
|
|
my $mlend = $coord_mapper->{$lend};
|
|
my $mrend = $coord_mapper->{$rend};
|
|
$segment->set_mcoords($mlend, $mrend);
|
|
}
|
|
}
|
|
|
|
=over 4
|
|
|
|
=item remap_cdna_segment_coords()
|
|
|
|
B<Description:> Method is used on an assembled alignment to renumber the coordinates of the assembled cDNA product, starting at 1 and ending at the assembly length. Orientation is determined by the validated spliced orientation.
|
|
|
|
B<Parameters:> none.
|
|
|
|
B<Returns:> none.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
sub remap_cdna_segment_coords () { ## Used when cDNA mapped coordinates aren't known, or when an assembly of other alignments was generated.
|
|
## reassigns cdna coords to alignment coords starting at 1 and ending at determined length.
|
|
## spliced orientation determines how the coords will map
|
|
## if spliced orient is ambiguous, aligned orient is used.
|
|
|
|
my $self = shift;
|
|
my %coord_mapper;
|
|
|
|
|
|
my @alignment_segments = $self->get_alignment_segments(); #remember, already in increasing order.
|
|
|
|
my $spliced_orient = $self->get_spliced_orientation();
|
|
my $aligned_orient = $self->get_aligned_orientation();
|
|
|
|
my $transcript_orientation = ($spliced_orient =~ /[\+\-]/) ? $spliced_orient : $aligned_orient;
|
|
|
|
if ($transcript_orientation eq '-') {
|
|
@alignment_segments = reverse @alignment_segments;
|
|
}
|
|
|
|
my $curr_pos = 0;
|
|
foreach my $segment (@alignment_segments) {
|
|
my ($lend, $rend) = $segment->get_coords();
|
|
my $seglength = abs ($rend - $lend) + 1;
|
|
my $mlend = $curr_pos + 1;
|
|
my $mrend = $curr_pos + $seglength;
|
|
$curr_pos += $seglength;
|
|
if ($transcript_orientation eq '-') {
|
|
($mlend, $mrend) = ($mrend, $mlend);
|
|
}
|
|
|
|
$segment->set_orientation($transcript_orientation);
|
|
#print "setting $mlend, $mrend\n";
|
|
$coord_mapper{$lend} = $mlend;
|
|
$coord_mapper{$rend} = $mrend;
|
|
$segment->set_mcoords($mlend, $mrend);
|
|
}
|
|
|
|
## reset aligned_orient to transcript_orient if different
|
|
## this should only happen when aligned orient and spliced orient are opposite.
|
|
## in which case, the aligned orient is reset to the spliced orient.
|
|
|
|
if ($aligned_orient ne $transcript_orientation) {
|
|
$self->set_orientation($transcript_orientation);
|
|
}
|
|
|
|
}
|
|
|
|
=over 4
|
|
|
|
=item toToken()
|
|
|
|
B<Description:> similar to the toString() method, but returns a pretty line of text summarizing the alignment and splice site data.
|
|
|
|
B<Parameters:> none.
|
|
|
|
B<Returns:> text_line
|
|
|
|
Here is an example:
|
|
|
|
orient(-/-) align: 103762(2613)-104359(2016)E<gt>CT....ACE<lt>104452(2015)-105229(1238)E<gt>CT....ACE<lt>105315(1237)-105482(1070)E<gt>CT....ACE<lt>105582(1069)-105842(809)E<gt>CT....ACE<lt>105935(808)-105985(758)E<gt>CT....ACE<lt>106071(757)-106316(512)E<gt>CT....ACE<lt>106394(511)-106619(286)E<gt>CT....ACE<lt>106712(285)-106879(118)E<gt>CT....ACE<lt>107427(117)-107543(1)
|
|
|
|
or
|
|
|
|
orient(+/+) align: 83074(1)-83318(245)E<gt>GT....AGE<lt>83637(246)-83702(311)E<gt>GT....AGE<lt>83796(312)-83846(362)E<gt>GT....AGE<lt>83938(363)-84017(442)E<gt>GT....AGE<lt>84308(443)-84352(487)E<gt>GT....AGE<lt>84467(488)-84507(528)
|
|
|
|
|
|
The orient specification includes (cDNA sequence alignment orientation/ spliced orientation). These are different when the reverse-complement of the sequence is provided, ascertained by the aligned orientation with consensus splice sites.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
sub toToken () {
|
|
my $self = shift;
|
|
my $orientation = $self->get_orientation();
|
|
my $spliced_orientation = $self->get_spliced_orientation();
|
|
my @alignment_segments = $self->get_alignment_segments();
|
|
my $assembled_token = "orient(a$orientation/s$spliced_orientation) align: ";
|
|
for (my $i = 0; $i <= $#alignment_segments; $i++) {
|
|
my $segment = $alignment_segments[$i];
|
|
$assembled_token .= $segment->toToken();
|
|
unless ($i == $#alignment_segments) {
|
|
$assembled_token .= "....";
|
|
}
|
|
}
|
|
return ($assembled_token);
|
|
}
|
|
|
|
|
|
sub set_num_segments {
|
|
my $self = shift;
|
|
my $num_segments = shift;
|
|
$self->{num_segments} = $num_segments;
|
|
}
|
|
|
|
=over 4
|
|
|
|
=item get_num_segments()
|
|
|
|
B<Description:> method provides the number of segments composing an alignment.
|
|
|
|
B<Parameters:> none.
|
|
|
|
B<Returns:> int
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
sub get_num_segments {
|
|
my $self = shift;
|
|
return ($self->{num_segments});
|
|
}
|
|
|
|
sub set_spliced_orientation {
|
|
my $self = shift;
|
|
my $orientation = shift;
|
|
$self->{spliced_orientation} = $orientation;
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item get_spliced_orientation ()
|
|
|
|
B<Description:> provides the validating spliced orientation for an alignment. In some cases this will be different from the alignment orientation; for example, in cases where the cDNA sequence is provided in the reverse orientation.
|
|
|
|
B<Parameters:> none
|
|
|
|
B<Returns:> [+|-|undef()]
|
|
|
|
undef is returned if the alignment did not validate properly. See get_error_flag()
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub get_spliced_orientation {
|
|
my $self = shift;
|
|
return ($self->{spliced_orientation});
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item set_acc()
|
|
|
|
B<Description:> Method sets the accession field of the cDNA sequence.
|
|
|
|
B<Parameters:> string
|
|
|
|
Provide the accession for the cDNA corresponding to this alignment.
|
|
|
|
B<Returns:> none.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
sub set_acc () {
|
|
my $self = shift;
|
|
my $acc = shift;
|
|
$self->{acc} = $acc;
|
|
}
|
|
|
|
=over 4
|
|
|
|
=item get_acc()
|
|
|
|
B<Description:> Method provides the accession for the cDNA in the alignment.
|
|
|
|
B<Parameters:> none
|
|
|
|
B<Returns:> string
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub get_acc () {
|
|
my $self = shift;
|
|
return ($self->{acc});
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item set_fli_status()
|
|
|
|
B<Description:> sets the is_fli attribute of the cDNA alignment, indicative of a full-length insert clone (or complete cDNA sequence).
|
|
|
|
B<Parameters:> [1|0]
|
|
|
|
1 = true, 0 = false.
|
|
|
|
B<Returns:> [1|0]
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub set_fli_status {
|
|
my $self = shift;
|
|
my $is_fli_status = shift;
|
|
$self->{is_fli} = $is_fli_status;
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item is_fli()
|
|
|
|
B<Description:>Provides the full-length insert status of the cDNA.
|
|
|
|
B<Parameters:> none.
|
|
|
|
B<Returns:> [0|1]
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
sub is_fli {
|
|
my $self = shift;
|
|
return ($self->{is_fli});
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
=item toAlignIllustration()
|
|
|
|
B<Description:> Provides a single line of text which illustrates the gapped alignment. See the example below.
|
|
|
|
B<Parameters:> none.
|
|
|
|
B<Returns:> string
|
|
|
|
Here is an example of an illustrated alignment:
|
|
|
|
------> <---> <----- (-)asmbl_6711
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub toAlignIllustration () {
|
|
my ($self, $subtract, $rel_max, $max_line_chars) = @_;
|
|
my $spliced_orient = $self->get_spliced_orientation();
|
|
my $orient = $self->get_orientation();
|
|
my @segments = $self->get_alignment_segments();
|
|
my @chars = ();
|
|
my $converter = sub {my $coord = shift;
|
|
return ( int ( ($coord - $subtract)/$rel_max * $max_line_chars + 0.5));
|
|
};
|
|
foreach my $segment (@segments) {
|
|
my ($lend, $rend) = $segment->get_coords();
|
|
my $l_rel = &$converter($lend);
|
|
#print "lend: $lend -> l_rel: $l_rel\n" if $::SEE;
|
|
my $r_rel = &$converter($rend);
|
|
#print "rend: $rend -> r_rel: $r_rel\n" if $::SEE;
|
|
|
|
for (my $i = $l_rel; $i <= $r_rel; $i++) {
|
|
$chars[$i] = '-';
|
|
}
|
|
if ($segment->has_left_splice_junction()) {
|
|
$chars[$l_rel] = '<';
|
|
} elsif ( (! $segment->is_first()) && (! $segment->is_single_segment())) {
|
|
$chars[$l_rel] = '|';
|
|
}
|
|
|
|
if ($segment->has_right_splice_junction()) {
|
|
$chars[$r_rel] = '>';
|
|
} elsif ( (! $segment->is_last()) && (! $segment->is_single_segment())) {
|
|
$chars[$r_rel] = '|';
|
|
}
|
|
}
|
|
|
|
#fill rest of line with spaces.
|
|
for (my $i = 0; $i <= $#chars; $i++) {
|
|
unless ($chars[$i]) {
|
|
$chars[$i] = ' ';
|
|
}
|
|
}
|
|
my $outline = join ("", @chars);
|
|
my $acc = $self->get_acc();
|
|
$acc =~ tr/\t\n\000-\037\177-\377/\t\n/d; #remove any control characters from accession.
|
|
my $fli_status = ($self->is_fli()) ? " FL" : "";;
|
|
return ($outline . "\t(a$orient/s$spliced_orient)" . $acc . $fli_status);
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item get_gene_obj_via_alignment()
|
|
|
|
B<Description:> Creates a Gene_obj object based on an alignment using the Exons_to_geneobj.pm module.
|
|
|
|
B<Parameters:>
|
|
|
|
B<Returns:> Gene_obj
|
|
|
|
The object returned is of the type Gene_obj defined in Gene_obj.pm
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub get_gene_obj_via_alignment {
|
|
my $self = shift;
|
|
my $partial_info_href = shift; # { 5prime => 0|1, 3prime => 0|1 }, optional
|
|
|
|
my $orient = $self->get_spliced_orientation();
|
|
|
|
if ($orient eq '+' || '-') {
|
|
return($self->_get_gene_obj_via_alignment_by_orient($orient, $partial_info_href));
|
|
}
|
|
elsif ($orient eq '?') {
|
|
## find the orientation that provides the longest ORF
|
|
|
|
my $plus_orient_gene = $self->_get_gene_obj_via_alignment_by_orient('+', $partial_info_href);
|
|
my $minus_orient_gene = $self->_get_gene_obj_via_alignment_by_orient('-', $partial_info_href);
|
|
|
|
if ($plus_orient_gene->get_CDS_length() >= $minus_orient_gene->get_CDS_length()) {
|
|
return($plus_orient_gene);
|
|
}
|
|
else {
|
|
return($minus_orient_gene);
|
|
}
|
|
|
|
}
|
|
else {
|
|
confess "cannot process spliced orientation of $orient ";
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub _get_gene_obj_via_alignment_by_orient {
|
|
my ($self, $orient, $partial_info_href) = @_;
|
|
|
|
my @alignment_segments = $self->get_alignment_segments();
|
|
my %coords;
|
|
foreach my $segment (@alignment_segments) {
|
|
my ($end5, $end3) = $segment->get_coords();
|
|
|
|
if ($orient eq '-') {
|
|
($end5, $end3) = ($end3, $end5); #force coordinates to contain orientation info.
|
|
}
|
|
$coords{$end5} = $end3;
|
|
}
|
|
my $genomic_seq_ref = $self->{genomic_seq};
|
|
my $gene_obj;
|
|
if ($genomic_seq_ref) {
|
|
## find ORF
|
|
$gene_obj = Exons_to_geneobj::create_gene_obj(\%coords, $genomic_seq_ref, $partial_info_href);
|
|
} else {
|
|
## No ORF
|
|
$gene_obj = new Gene_obj;
|
|
$gene_obj->populate_gene_obj({}, \%coords);
|
|
}
|
|
|
|
return ($gene_obj);
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
=item force_spliced_validation()
|
|
|
|
B<Description:> Routine used for testing purposes. Any fake alignment can be created and set to validate to the corresponding orientation using this routine.
|
|
|
|
B<Parameters:> [+|-]
|
|
|
|
B<Returns:> none.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub force_spliced_validation {
|
|
my $self = shift;
|
|
my $orientation = shift;
|
|
|
|
unless ($orientation eq '+' || $orientation eq '-') {
|
|
croak ("cannot force spliced validation to $orientation.\n");
|
|
return;
|
|
}
|
|
|
|
my ($right_splice, $left_splice) = ($orientation eq '+') ? ('XX','YY') : ('YY','XX');
|
|
|
|
my @alignment_segments = $self->get_alignment_segments();
|
|
foreach my $alignment_segment (@alignment_segments) {
|
|
if ($alignment_segment->is_internal() || $alignment_segment->is_last()) {
|
|
$alignment_segment->set_left_splice_junction(1);
|
|
$alignment_segment->set_left_splice_site_chars($left_splice);
|
|
}
|
|
if ($alignment_segment->is_internal() || $alignment_segment->is_first()) {
|
|
$alignment_segment->set_right_splice_junction(1);
|
|
$alignment_segment->set_right_splice_site_chars($right_splice);
|
|
}
|
|
$alignment_segment->set_orientation($orientation);
|
|
}
|
|
|
|
$self->set_spliced_orientation($orientation);
|
|
}
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
=item clone()
|
|
|
|
B<Description:>Clones a CDNA_alignment object into a new CDNA_alignment object with same attributes. Performs a deep copy, so all alignment segments contained within the cloned CDNA_alignment object are also clones.
|
|
|
|
B<Parameters:> none.
|
|
|
|
B<Returns:> new CDNA_alignment
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
sub clone {
|
|
my $self = shift;
|
|
my $packagename = ref $self;
|
|
|
|
my $clone = {};
|
|
bless ($clone, $packagename);
|
|
foreach my $key (keys %$self) {
|
|
$clone->{$key} = $self->{$key};
|
|
}
|
|
$clone->{alignment_segs} = [];
|
|
foreach my $alignment_segment ($self->get_alignment_segments()) {
|
|
$clone->add_alignment_segment($alignment_segment->clone());
|
|
}
|
|
|
|
return ($clone);
|
|
}
|
|
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
=item get_genomic_seq_ref()
|
|
|
|
B<Description:> Returns a scalar refernence to the genomic sequence.
|
|
|
|
B<Parameters:> none.
|
|
|
|
B<Returns:> string_ref
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
|
|
sub get_genomic_seq_ref {
|
|
my $self = shift;
|
|
return ($self->{genomic_seq});
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item extractSplicedSequence()
|
|
|
|
B<Description:> Returns a string corresponding to the spliced cDNA sequence.
|
|
|
|
B<Parameters:> none.
|
|
|
|
B<Returns:> scalar
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
|
|
sub extractSplicedSequence {
|
|
my $self = shift;
|
|
my ($genomic_seq_ref) = @_;
|
|
my $genomic_seq = $genomic_seq_ref;
|
|
|
|
unless ($genomic_seq) {
|
|
$genomic_seq = $self->{genomic_seq};
|
|
unless (ref $genomic_seq) {
|
|
confess "Can't extract the spliced sequence when no genomic sequence reference is available.\n";
|
|
}
|
|
}
|
|
|
|
my @segments = $self->get_alignment_segments();
|
|
my $splicedSequence = "";
|
|
my $toggle = 0;
|
|
foreach my $segment (@segments) {
|
|
my ($lend, $rend) = $segment->get_coords();
|
|
my $length = abs ($rend - $lend) + 1;
|
|
my $exonseq = substr ($$genomic_seq, $lend - 1, $length);
|
|
|
|
# alternate case among segments to facilitate manual identification of junctions.
|
|
if ($toggle) {
|
|
$exonseq = lc $exonseq;
|
|
$toggle = 0;
|
|
}
|
|
else {
|
|
$exonseq = uc $exonseq;
|
|
$toggle = 1;
|
|
}
|
|
|
|
$splicedSequence .= $exonseq;
|
|
}
|
|
my $orient = $self->get_orientation();
|
|
if ($orient eq "-") {
|
|
#reverse complement the sequence:
|
|
$splicedSequence = reverse ($splicedSequence);
|
|
$splicedSequence =~tr/ACGTacgtyrkmYRKM/TGCAtgcarymkRYMK/;
|
|
}
|
|
return ($splicedSequence);
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
=over 4
|
|
|
|
=item get_cDNA_to_genomic_coordinates()
|
|
|
|
B<Description:> converts a cDNA sequence -relative coordinates to the corresponding genomic sequence coordinates.
|
|
|
|
B<Parameters:> @coordinates
|
|
|
|
list of integers
|
|
|
|
B<Returns:> @converted_coordinates
|
|
|
|
list of integers.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
sub get_cDNA_to_genomic_coordinates {
|
|
my $self = shift;
|
|
my @coordinates = @_;
|
|
|
|
my @ret_coordinates = ();
|
|
|
|
my @segments = $self->get_alignment_segments();
|
|
foreach my $cdna_coord (@coordinates) {
|
|
my $corresponding_segment;
|
|
foreach my $seg (@segments) {
|
|
my ($mlend, $mrend) = sort {$a<=>$b} $seg->get_mcoords();
|
|
if ($cdna_coord >= $mlend && $cdna_coord <= $mrend) {
|
|
$corresponding_segment = $seg;
|
|
last;
|
|
}
|
|
}
|
|
unless (ref $corresponding_segment) {
|
|
confess "Error, cDNA coordinate ($cdna_coord) not found in segment list: " . $self->toToken();
|
|
}
|
|
my ($lend, $rend) = $corresponding_segment->get_coords();
|
|
my ($mlend, $mrend) = $corresponding_segment->get_mcoords();
|
|
my $orient = $corresponding_segment->get_orientation();
|
|
|
|
my $genomic_coord = undef;
|
|
if ($orient eq '+') {
|
|
my $diff = $cdna_coord - $mlend;
|
|
$genomic_coord = $lend + $diff;
|
|
} elsif ($orient eq '-') {
|
|
## mlend > mrend
|
|
my $diff = $cdna_coord - $mrend;
|
|
$genomic_coord = $rend - $diff;
|
|
}
|
|
|
|
push (@ret_coordinates, $genomic_coord);
|
|
}
|
|
|
|
return (@ret_coordinates);
|
|
}
|
|
|
|
|
|
|
|
|
|
=item get_genomic_to_cDNA_coordinates()
|
|
|
|
B<Description:> converts coordinates in the genome to coordinates in the cDNA:
|
|
|
|
B<Parameters:> @coordinates
|
|
|
|
list of integers
|
|
|
|
B<Returns:> @converted_coordinates
|
|
|
|
list of integers. Undef is returned for each position that could not be converted to the genomic coordinate system because it was not found within the aligned region.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
####
|
|
sub get_genomic_to_cDNA_coordinates {
|
|
|
|
my $self = shift;
|
|
my @coordinates = @_;
|
|
|
|
my @ret_coordinates = ();
|
|
|
|
my @segments = $self->get_alignment_segments();
|
|
foreach my $genomic_coord (@coordinates) {
|
|
my $corresponding_segment;
|
|
foreach my $seg (@segments) {
|
|
my ($lend, $rend) = $seg->get_coords();
|
|
if ($genomic_coord >= $lend && $genomic_coord <= $rend) {
|
|
$corresponding_segment = $seg;
|
|
last;
|
|
}
|
|
}
|
|
unless (ref $corresponding_segment) {
|
|
confess "Error, genomic coordinate ($genomic_coord) not found in segment list:" . $self->toToken();
|
|
}
|
|
my ($lend, $rend) = $corresponding_segment->get_coords();
|
|
my ($mlend, $mrend) = $corresponding_segment->get_mcoords();
|
|
my $orient = $corresponding_segment->get_orientation();
|
|
my $diff = $rend - $genomic_coord;
|
|
my $cdna_coord = undef;
|
|
if ($orient eq '+') {
|
|
$cdna_coord = $mrend - $diff;
|
|
} elsif ($orient eq '-') {
|
|
$cdna_coord = $mrend + $diff;
|
|
}
|
|
|
|
push (@ret_coordinates, $cdna_coord);
|
|
}
|
|
|
|
return (@ret_coordinates);
|
|
}
|
|
|
|
|
|
|
|
####
|
|
sub overlaps_genome_span {
|
|
my ($self, $other_alignment) = @_;
|
|
|
|
my ($lend_A, $rend_A) = sort {$a<=>$b} $self->get_coords();
|
|
|
|
my ($lend_B, $rend_B) = sort {$a<=>$b} $other_alignment->get_coords();
|
|
|
|
if (&_overlap($lend_A, $rend_A, $lend_B, $rend_B)) {
|
|
return(1);
|
|
}
|
|
else {
|
|
return(0);
|
|
}
|
|
}
|
|
|
|
|
|
sub has_overlapping_segment {
|
|
my ($self, $other_alignment) = @_;
|
|
|
|
unless ($self->overlaps_genome_span($other_alignment)) {
|
|
return(0);
|
|
}
|
|
|
|
|
|
my @self_segments = $self->get_alignment_segments();
|
|
|
|
my @other_segments = $self->get_alignment_segments();
|
|
|
|
foreach my $segment_A (@self_segments) {
|
|
|
|
my ($lend_A, $rend_A) = sort {$a<=>$b} $segment_A->get_coords();
|
|
|
|
|
|
foreach my $segment_B (@other_segments) {
|
|
|
|
my ($lend_B, $rend_B) = sort {$a<=>$b} $segment_B->get_coords();
|
|
|
|
if (&_overlap($lend_A, $rend_A, $lend_B, $rend_B) ) {
|
|
|
|
return(1);
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return(0); # no overlapping segment
|
|
}
|
|
|
|
|
|
=over 4
|
|
|
|
=item is_compatible()
|
|
|
|
B<Description:> Returns true (1) if this alignment is found compatible with the other_alignment_obj
|
|
|
|
B<Parameters:> ($other_alignment_obj, $fuzz_dist)
|
|
|
|
Alignments A and B are of type CDNA::CDNA_alignment
|
|
|
|
B<Returns:> [1|0]
|
|
|
|
Compatibility between alignment objects requires:
|
|
-within their region of overlap, introns are identical
|
|
-if both have spliced orientations, they must be on the same strand
|
|
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
####
|
|
sub is_compatible {
|
|
my $self = shift;
|
|
my ($other_alignment, $fuzzlength) = @_;
|
|
if (!defined $fuzzlength) {
|
|
$fuzzlength = 0; ## no fuzzy termini allowed
|
|
}
|
|
|
|
## The compatibility test requires:
|
|
# -alignments must have the same spliced orientation if not ?
|
|
# -alignments must overlap
|
|
# -alignments must have identical introns in their region of overlap (taking into account the fuzz distance for terminal exons)
|
|
|
|
my ($a_lend, $a_rend) = $self->get_coords();
|
|
my $a_spliced_orient = $self->get_spliced_orientation();
|
|
my $a_num_segments = $self->get_num_segments();
|
|
|
|
my ($b_lend, $b_rend) = $other_alignment->get_coords();
|
|
my $b_spliced_orient = $other_alignment->get_spliced_orientation();
|
|
my $b_num_segments = $other_alignment->get_num_segments();
|
|
|
|
## overlap test:
|
|
unless (&_overlap($a_lend, $a_rend, $b_lend, $b_rend)) {
|
|
return (0); # not compatible
|
|
}
|
|
|
|
## transcribed orientation test:
|
|
if ($a_spliced_orient ne $b_spliced_orient && $a_spliced_orient ne '?' && $b_spliced_orient ne '?') {
|
|
# neither alignment is ambiguously oriented and they're transcribed on opposite strands:
|
|
return (0); # not compatible
|
|
}
|
|
|
|
## same introns test:
|
|
if ($a_num_segments > 1 || $b_num_segments > 1) {
|
|
my @a_introns = $self->get_intron_coords();
|
|
my @b_introns = $self->get_intron_coords();
|
|
|
|
my ($overlapping_lend, $overlapping_rend) = &_get_coords_of_overlap($a_lend, $a_rend, $b_lend, $b_rend);
|
|
print "Overlapping coords between alignments: $overlapping_lend to $overlapping_rend\n" if $SEE;
|
|
## make adjustments to required overlap coordinates considering fuzzlength:
|
|
if ($fuzzlength) {
|
|
## adjust left overlap boundary requirement
|
|
$overlapping_lend = &_adjust_left_overlap_boundary_via_fuzzlength($overlapping_lend, $self, $other_alignment, $fuzzlength);
|
|
$overlapping_rend = &_adjust_right_overlap_boundary_via_fuzzlength($overlapping_rend, $self, $other_alignment, $fuzzlength);
|
|
print "\tadjusted overlapping coords to: $overlapping_lend to $overlapping_rend\n" if $SEE;
|
|
}
|
|
|
|
## find introns within adjusted overlap range and ensure identity
|
|
my @a_intron_coords = $self->get_intron_coords();
|
|
my @b_intron_coords = $other_alignment->get_intron_coords();
|
|
|
|
my @a_introns_in_range = &_get_overlapping_capped_introns($overlapping_lend, $overlapping_rend, \@a_intron_coords);
|
|
my @b_introns_in_range = &_get_overlapping_capped_introns($overlapping_lend, $overlapping_rend, \@b_intron_coords);
|
|
|
|
if (@a_introns_in_range || @b_introns_in_range) {
|
|
## ensure identity:
|
|
my %all_introns;
|
|
my %a_introns;
|
|
foreach my $coordset (@a_introns_in_range) {
|
|
my $key = join (",", @$coordset);
|
|
$a_introns{$key} = 1;
|
|
$all_introns{$key} = 1;
|
|
}
|
|
my %b_introns;
|
|
foreach my $coordset (@b_introns_in_range) {
|
|
my $key = join (",", @$coordset);
|
|
$b_introns{$key} = 1;
|
|
$all_introns{$key} = 1;
|
|
}
|
|
foreach my $intron_key (keys %all_introns) {
|
|
unless ($a_introns{$intron_key} && $b_introns{$intron_key}) {
|
|
return (0); # not compatible, an intron difference in the overlapping region exists.
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
## if got this far, passed all compatibility tests.
|
|
|
|
return (1); # yes, compatible.
|
|
}
|
|
|
|
####
|
|
sub _get_contained_coords {
|
|
my ($lend, $rend, $coordsets_aref) = @_;
|
|
my @contained_coords;
|
|
foreach my $coordset (@$coordsets_aref) {
|
|
my ($coord_lend, $coord_rend) = sort {$a<=>$b} @$coordset;
|
|
if ($lend <= $coord_lend && $coord_rend <= $rend) {
|
|
## coordset contained
|
|
push (@contained_coords, $coordset);
|
|
}
|
|
}
|
|
return (@contained_coords);
|
|
}
|
|
|
|
####
|
|
# get introns that overlap lend and rend, and set termini of intron coords to these values if they extend beyond them.
|
|
sub _get_overlapping_capped_introns {
|
|
my ($lend, $rend, $coordsets_aref) = @_;
|
|
my @contained_coords;
|
|
foreach my $coordset (@$coordsets_aref) {
|
|
my ($coord_lend, $coord_rend) = sort {$a<=>$b} @$coordset;
|
|
if ($lend <= $coord_rend && $rend >= $coord_lend) {
|
|
## coordset contained
|
|
if ($coord_lend < $lend) {
|
|
$coord_lend = $lend;
|
|
}
|
|
if ($coord_rend > $rend) {
|
|
$coord_rend = $rend;
|
|
}
|
|
push (@contained_coords, [$coord_lend, $coord_rend]);
|
|
}
|
|
}
|
|
return (@contained_coords);
|
|
}
|
|
|
|
|
|
####
|
|
sub _adjust_left_overlap_boundary_via_fuzzlength {
|
|
my ($overlapping_lend, $alignment_a, $alignment_b, $fuzzlength) = @_;
|
|
|
|
## make adjustments to take into account fuzzlength and existing intron coordinates and adjacent segments
|
|
## we trust short aligment segments that precede an intron, even if they're shorter than the fuzzlength
|
|
|
|
my $a_overlapping_segment = $alignment_a->find_segment_containing_coord($overlapping_lend);
|
|
my $b_overlapping_segment = $alignment_b->find_segment_containing_coord($overlapping_lend);
|
|
my @bounds = ();
|
|
if ($a_overlapping_segment) {
|
|
my ($a_seg_lend, $a_seg_rend) = $a_overlapping_segment->get_coords();
|
|
push (@bounds, $a_seg_rend);
|
|
}
|
|
if ($b_overlapping_segment) {
|
|
my ($b_seg_lend, $b_seg_rend) = $b_overlapping_segment->get_coords();
|
|
push (@bounds, $b_seg_rend);
|
|
}
|
|
if (@bounds) {
|
|
my $max_bound = max_coord(@bounds);
|
|
my $delta = $max_bound - $overlapping_lend;
|
|
if ($delta < 0) {
|
|
confess "Error, delta left bound is less than zero";
|
|
}
|
|
my $fuzz_employed = min_coord($fuzzlength, $delta);
|
|
$overlapping_lend += $fuzz_employed;
|
|
}
|
|
return ($overlapping_lend);
|
|
}
|
|
|
|
####
|
|
sub _adjust_right_overlap_boundary_via_fuzzlength {
|
|
my ($overlapping_rend, $alignment_a, $alignment_b, $fuzzlength) = @_;
|
|
|
|
## make adjustments to take into account fuzzlength and existing intron coordinates and adjacent segments
|
|
## we trust short aligment segments that precede an intron, even if they're shorter than the fuzzlength
|
|
|
|
my $a_overlapping_segment = $alignment_a->find_segment_containing_coord($overlapping_rend);
|
|
my $b_overlapping_segment = $alignment_b->find_segment_containing_coord($overlapping_rend);
|
|
my @bounds = ();
|
|
if ($a_overlapping_segment) {
|
|
my ($a_seg_lend, $a_seg_rend) = $a_overlapping_segment->get_coords();
|
|
push (@bounds, $a_seg_lend);
|
|
}
|
|
if ($b_overlapping_segment) {
|
|
my ($b_seg_lend, $b_seg_rend) = $b_overlapping_segment->get_coords();
|
|
push (@bounds, $b_seg_lend);
|
|
}
|
|
if (@bounds) {
|
|
my $min_bound = min_coord(@bounds);
|
|
my $delta = $overlapping_rend - $min_bound;
|
|
if ($delta < 0) {
|
|
confess "Error, delta left bound is less than zero";
|
|
}
|
|
my $fuzz_employed = min_coord($fuzzlength, $delta);
|
|
$overlapping_rend -= $fuzz_employed;
|
|
}
|
|
return ($overlapping_rend);
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
####
|
|
sub find_segment_containing_coord {
|
|
my $self = shift;
|
|
my ($coord) = @_;
|
|
my @segments = $self->get_alignment_segments();
|
|
foreach my $segment (@segments) {
|
|
my ($lend, $rend) = $segment->get_coords();
|
|
if ($lend <= $coord && $coord <= $rend) {
|
|
return ($segment);
|
|
}
|
|
}
|
|
return (undef); # none found
|
|
}
|
|
|
|
|
|
sub _overlap {
|
|
my ($a1_lend, $a1_rend, $a2_lend, $a2_rend) = @_;
|
|
#print "Checking overlap @_\t";
|
|
if ($a2_rend >= $a1_lend && $a2_lend <= $a1_rend) { #overlap
|
|
#print "YES\n";
|
|
return (1);
|
|
} else {
|
|
#print "NO\n";
|
|
return (0);
|
|
}
|
|
}
|
|
|
|
=over 4
|
|
|
|
=item encapsulates()
|
|
|
|
B<Description:> Returns true (1) if this alignment encapsulates the span of the other alignment object
|
|
|
|
B<Parameters:> ($other_alignment_obj, $fuzz_dist)
|
|
|
|
Alignments A and B are of type CDNA::CDNA_alignment
|
|
|
|
B<Returns:> [1|0]
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
|
|
sub encapsulates {
|
|
my $self = shift;
|
|
my ($alignmentB, $fuzz_dist) = @_;
|
|
my $alignmentA = $self;
|
|
|
|
if (! defined $fuzz_dist) {
|
|
$fuzz_dist = 0;
|
|
}
|
|
|
|
my ($alend, $arend) = $alignmentA->get_coords();
|
|
my ($blend, $brend) = $alignmentB->get_coords();
|
|
|
|
if ($blend + $fuzz_dist >= $alend &&
|
|
$brend - $fuzz_dist <= $arend)
|
|
{
|
|
return (1);
|
|
} else {
|
|
return (0);
|
|
}
|
|
}
|
|
|
|
####
|
|
sub _get_coords_of_overlap {
|
|
my ($a_lend, $a_rend, $b_lend, $b_rend) = @_;
|
|
unless (&_overlap($a_lend, $a_rend, $b_lend, $b_rend)) {
|
|
confess "Error, trying to get coordinates of overlapping region for two features that do not overlap!";
|
|
}
|
|
my $overlapping_lend = &max_coord($a_lend, $b_lend);
|
|
my $overlapping_rend = &min_coord($a_rend, $b_rend);
|
|
|
|
return ($overlapping_lend, $overlapping_rend);
|
|
}
|
|
|
|
####
|
|
sub min_coord {
|
|
my @coords = @_;
|
|
@coords = sort {$a<=>$b} @coords;
|
|
my $min_coord = shift @coords;
|
|
return ($min_coord);
|
|
}
|
|
|
|
####
|
|
sub max_coord {
|
|
my @coords = @_;
|
|
@coords = sort {$a<=>$b} @coords;
|
|
my $max_coord = pop @coords;
|
|
return ($max_coord);
|
|
}
|
|
|
|
|
|
1; #EOM
|
|
|
|
|
|
|
|
|
|
|