biyelunwen/99.scripts/trinity_utils/PerlLib/CDNA/CDNA_alignment.pm

1875 lines
49 KiB
Perl

#!/usr/local/bin/perl
##########################
### Class CDNA_alignment
##########################
package main;
our $SEE;
=head1 NAME
CDNA::CDNA_alignment
=cut
=head1 DESCRIPTION
This module provides an object specification for storing and manipulating cDNA alignments. The alignment coordinates along the genomic sequence are given along with a genomic sequence. Splice sites are validated and the spliced orientation is determined.
=cut
package CDNA::CDNA_alignment;
use strict;
use Exons_to_geneobj;
use Gene_obj;
use CDNA::Alignment_segment;
use Carp qw (cluck confess croak);
use GFF_maker;
## Global Vars
our $ALLOW_ATAC_splice_pairs = 1; # ON by default; turn it off if you prefer.
##
=over 4
=item new()
B<Description:> Instantiates a new CDNA::CDNA_alignment object.
B<Parameters:> $cdna_length, $Alignment_obj_aref, $sequence_sref
B<$cdna_length> is the length of the complete cDNA sequence.
B<$Alignment_obj_aref> is a reference to a list of CDNA::Alignment_segment objects like so:
$Alignment_obj_aref = \@Alignment_obj_list
B<$sequence_sref> should be a reference to the string containing the genomic sequence like so:
$sequence = "gatc.....";
$sequence_sref = \$sequence;
B<Returns:> $obj_ref
returns a reference to a CDNA_alignment object. This object is validated requiring consensus splice sites, and the spliced orientation of the cDNA is determined based on the validating orientation.
=back
=cut
sub new {
my $packagename = shift;
my ($cdna_length, $Alignment_obj_aref, $sequence_ref) = @_;
unless (@$Alignment_obj_aref) {
die "No alignment segments available to create alignment object.\n";
}
my $self = {
acc => undef(), # cDNA accession.
title =>undef(), # com_name, title, header, whatever you want to call the cdna.
genomic_seq => $sequence_ref,
genome_acc => undef,
alignment_segs=>$Alignment_obj_aref,
orientation => undef(),
lend=>undef(),
rend=>undef(),
length=>0, # alignment span length.
num_aligned_nts => 0, #number of cDNA nucleotides found in alignment
cdna_length => $cdna_length, # the length of the cDNA sequence.
avg_per_id => 0, # the average percent identity of this alignment.
error_flag=>0, #default w/o errors. Used to indicate non-consensus splice sites.
spliced_orientation => '?', # [+-?] depending on validating consenus splice sites; relative to genomic sequence orientation.
num_segments=>0,
percent_cdna_aligned => 0, # provides the percentage of the cDNA length that is in the alignment.
is_fli => 0 #indicates whether the cDNA is a full-length insert (ie. expected to be complete).
};
bless ($self, $packagename);
## Convert alignment to object form:
$self->process_alignment();
if ($self->{num_segments} > 1 && (ref $sequence_ref)) {
$self->identify_splice_junctions($sequence_ref);
}
return ($self);
}
sub process_alignment {
my $self = shift;
$self->determine_alignment_attributes();
my @alignment_segments = $self->get_alignment_segments();
for (my $i = 0; $i <= $#alignment_segments; $i++) {
my $segment = $alignment_segments[$i];
if ($#alignment_segments == 0) { #single segment in alignment
$segment->set_type("single");
} elsif ($i == 0) {
$segment->set_type("first");
} elsif ($i == $#alignment_segments) {
$segment->set_type("last");
} else { #must be internal
$segment->set_type("internal");
}
}
$self->set_num_segments($#alignment_segments + 1);
$self->verify_contiguity();
}
sub identify_splice_junctions {
my $self = shift;
my $sequence_ref = shift;
my $orientation = $self->get_orientation();
my $opposite_orientation = ($orientation eq '+') ? '-' : '+';
## Since some cDNAs are supplied in the opposite orientation, the splice sites may be on the reverse strand.
## In this case, we must change the orientation
my $error_flag = 0;
my $error_text = "";
my $validating_orientation = $orientation; #initialize.
my %error_lengths; #used to find best orientation (least errors)
foreach my $orient ($orientation, $opposite_orientation) {
$error_text = $self->validate_splice_junctions($orient, $sequence_ref);
$error_lengths{$orient} = length $error_text;
if ($error_text) {
print "ERRORS in Splice sites given orientation $orient:\n$error_text\n" if $::SEE;
unless ($error_flag) { print "Trying other strand might help?\n" if $::SEE;}
$error_text = ""; #rest
$error_flag = 1;
} else {
$error_flag = 0;
$validating_orientation = $orient;
last;
}
}
if ($error_flag) {
print "Sorry, still contains problematic splice sites:\n$error_text\n" if $::SEE;
print "Setting error flag for this alignment\n" if $::SEE;
$self->set_error_flag("Splice site validations failed");
print "ERROR: SETTING ERROR_FLAG\n" if $::SEE;
## revalidate splice sites using orientation that generated the least errors:
my @orients = sort {$error_lengths{$a}<=>$error_lengths{$b}} ('+', '-');
my $best_orient = shift @orients;
$self->validate_splice_junctions($best_orient, $sequence_ref); #required for appropriate token printing.
} else {
$self->set_spliced_orientation($validating_orientation);
}
}
sub validate_splice_junctions {
my $self = shift;
my ($orient) = shift;
my $sequence_ref = shift;
my $errors = "";
my (@splice_boundary_pairs) = &get_consensus_splice_sites($orient);
my @segments = $self->get_alignment_segments();
my $num_segments = scalar (@segments);
if ($num_segments == 1) {
## no introns
die "Error, trying to validate splice junctions for single segment alignment! ";
}
## analyze introns
for (my $i = 1; $i <= $#segments; $i++) {
my ($prev_segment, $curr_segment) = ($segments[$i-1], $segments[$i]);
my ($prev_lend, $prev_rend) = $prev_segment->get_coords();
my ($curr_lend, $curr_rend) = $curr_segment->get_coords();
my $splice_chars_left = uc substr($$sequence_ref, $prev_rend, 2);
my $splice_chars_right = uc substr($$sequence_ref, $curr_lend -2 -1, 2);
$prev_segment->set_right_splice_site_chars($splice_chars_left);
$curr_segment->set_left_splice_site_chars($splice_chars_right);
## check boundaries:
my $splice_pair_OK = 0;
CONSENSUS_PAIR:
foreach my $consensus_pair (@splice_boundary_pairs) {
my ($left_chars, $right_chars) = @$consensus_pair;
if ($left_chars eq $splice_chars_left && $right_chars eq $splice_chars_right) {
# found consensus
## further validate any AT-AC for donor site extended consensus
my ($intron_lend, $intron_rend) = ($prev_rend + 1, $curr_lend - 1);
if ($ALLOW_ATAC_splice_pairs) {
if ( ( $orient eq '+' && $left_chars eq 'AT')
||
($orient eq '-' && $right_chars eq 'AT') ) {
unless (&_validates_AT_AC_donor_extended_consensus($orient, $intron_lend, $intron_rend, $sequence_ref)) {
next CONSENSUS_PAIR;
}
}
}
## got consensus splice pair
$splice_pair_OK = 1;
## left and right local are relative to intron
# in methods, they reference segment
$prev_segment->set_right_splice_junction(1);
$curr_segment->set_left_splice_junction(1);
last;
}
}
unless ($splice_pair_OK) {
$errors .= "nonconsensus splice pair [$splice_chars_left-$splice_chars_right]\n";
## validate boundaries separately.
## this is useful if only one site is nonconsensus
foreach my $consensus_pair (@splice_boundary_pairs) {
my ($left_chars, $right_chars) = @$consensus_pair;
if ($left_chars eq $splice_chars_left) {
$prev_segment->set_right_splice_junction(1);
}
if ($right_chars eq $splice_chars_right) {
$curr_segment->set_left_splice_junction(1);
}
}
## make nonconsensus lower case
unless ($prev_segment->has_right_splice_junction()) {
$prev_segment->set_right_splice_site_chars( lc $splice_chars_left);
}
unless ($curr_segment->has_left_splice_junction()) {
$curr_segment->set_left_splice_site_chars( lc $splice_chars_right);
}
}
}
return ($errors);
}
sub verify_contiguity {
my $self = shift;
my $num_segments = $self->get_num_segments();
my $orient = $self->get_orientation();
if ($num_segments > 1) {
my @alignment_segments = $self->get_alignment_segments();
for (my $i = 1; $i <= $#alignment_segments; $i++) {
my $prev_seg = $alignment_segments[$i-1];
my $curr_seg = $alignment_segments[$i];
my $error_flag = 0;
my $diff = $curr_seg->{mlend} - $prev_seg->{mrend};
if ($orient eq '+' && $diff != 1) {
$error_flag = 1;
}elsif ($orient eq '-' && $diff != -1) {
$error_flag = 1;
}
if ($error_flag) {
$self->set_error_flag("Incontiguous alignment");
return();
}
}
}
}
sub get_consensus_splice_sites () {
my $orientation = shift;
my @pairs;
if ($orientation eq '+') {
## Forward pairs
# GT-AG
# GC-AG
# AT-AC
push (@pairs, ['GT', 'AG'], ['GC', 'AG']);
if ($ALLOW_ATAC_splice_pairs) {
push (@pairs, ['AT', 'AC']);
}
}
else {
## Rev Comp of above:
# CT-AC
# CT-GC
# GT-AT
push (@pairs, ['CT', 'AC'], ['CT', 'GC']);
if ($ALLOW_ATAC_splice_pairs) {
push (@pairs, ['GT', 'AT']);
}
}
return (@pairs);
}
####
sub _validates_AT_AC_donor_extended_consensus {
my ($orient, $intron_lend, $intron_rend, $sequence_ref) = @_;
my $forward_consensus = 'ATATCC';
my $reverse_consensus = 'GGATAT';
if ($orient eq '+') {
my $long_donor_seq = uc substr($$sequence_ref, $intron_lend - 1, 6);
if ($long_donor_seq eq $forward_consensus) {
return (1);
}
}
elsif ($orient eq '-') {
my $long_donor_seq = uc substr($$sequence_ref, $intron_rend -6, 6);
if ($long_donor_seq eq $reverse_consensus) {
return (1);
}
}
## got here, didn't fit long consensus
return (0);
}
sub set_orientation {
my $self = shift;
my $orientation = shift;
$self->{orientation} = $orientation;
}
=over 4
=item get_aligned_orientation()
B<Description:> Provides the orientation of the incoming cDNA alignment.
B<Parameters:> none.
B<Returns:> [+-]
=back
=cut
sub get_aligned_orientation {
my $self = shift;
return ($self->{orientation});
}
sub get_orientation { # deprecated in favor of get_aligned_orientation()
my $self = shift;
return ($self->get_aligned_orientation());
}
sub set_title {
my $self = shift;
my $title = shift;
$self->{title} = $title;
}
=over 4
=item get_title()
B<Description:> Provides the title for the cDNA, generally the header from a fasta file
B<Parameters:> none.
B<Returns:> string or undef
=back
=cut
sub get_title {
my $self = shift;
return ($self->{title});
}
sub set_coords {
my $self = shift;
my ($lend, $rend) = @_;
$self->{lend} = $lend;
$self->{rend} = $rend;
}
=over 4
=item get_coords()
B<Description:> Provides the coordinate span for the alignment along the genomic sequence. Orientation is not implied, coordinates always provided relevant to the forward orientation. Use the get_orientation method to determine the strand.
B<Parameters:> none.
B<Returns:> $lend, $rend
$lend, $rend are integer values indicating the beginning and end coordinates of the alignment on the genomic sequence. The genomic sequence begins at position 1.
=back
=cut
sub get_coords {
my $self = shift;
return ($self->{lend}, $self->{rend});
}
=over 4
=item get_mcoords()
B<Description:> returns the cDNA sequence coordinates corresponding to the lend and rend returned by get_coords()
ie. my ($lend, $rend) = $alignment->get_coords(); // always in forward orientation (lend < rend)
my ($mlend, $mrend) = $alignment->get_mcoords();
$mlend corresponds to $lend
$mrend corresponds to $rend
B<Parameters:>none
B<Returns:> ($mlend, $mrend)
=back
=cut
sub get_mcoords {
my $self = shift;
my @alignment_segments = $self->get_alignment_segments();
my $leftmost_seg = shift @alignment_segments;
my $rightmost_seg = pop @alignment_segments;
unless ($rightmost_seg) {
$rightmost_seg = $leftmost_seg; #must only be one in which case left = right
}
my ($mlend, $whatever1) = $leftmost_seg->get_mcoords();
my ($whatever2, $mrend) = $rightmost_seg->get_mcoords();
return ($mlend, $mrend);
}
=over 4
=item get_intron_coords()
B<Description:> returns list of intron coordinates
B<Parameters:> none
B<Returns:> ( [intron_lend, intron_rend], [intron_lend, intron_rend], ...)
lend always less than rend
=back
=cut
sub get_intron_coords {
my $self = shift;
my @segments = $self->get_alignment_segments();
my @seg_coords;
foreach my $segment (@segments) {
my ($exon_lend, $exon_rend) = sort {$a<=>$b} $segment->get_coords();
push (@seg_coords, [$exon_lend, $exon_rend]);
}
my @intron_coords;
if (scalar (@seg_coords) >= 2) {
## actually have introns
@seg_coords = sort {$a->[0]<=>$b->[0]} @seg_coords;
my $curr_seg = shift @seg_coords;
while (@seg_coords) {
my $next_seg = shift @seg_coords;
my ($curr_lend, $curr_rend) = @$curr_seg;
my ($next_lend, $next_rend) = @$next_seg;
my ($intron_lend, $intron_rend) = ($curr_rend + 1, $next_lend - 1);
push (@intron_coords, [$intron_lend, $intron_rend]);
$curr_seg = $next_seg;
}
}
return (@intron_coords);
}
sub determine_alignment_attributes {
my $self = shift;
my @alignment_segments = $self->get_alignment_segments();
my @coords;
my $orientation;
my $num_nts_matched = 0;
my $per_id_x_length = 0;
foreach my $segment (@alignment_segments) {
my ($lend, $rend) = $segment->get_coords();
my ($mlend, $mrend) = $segment->get_mcoords();
my $orient = $segment->get_orientation();
if (!$orientation && $orient =~ /[+-]/) {
$orientation = $orient;
}
my $seg_length;
if ($mrend && $mlend) {
$seg_length = abs($mrend - $mlend) + 1;
} else {
$seg_length = abs ($rend - $lend) + 1;
}
$num_nts_matched += $seg_length;
my $per_id = $segment->get_per_id();
$per_id_x_length += $per_id * $seg_length;
push (@coords, $lend, $rend);
}
$self->set_orientation($orientation);
foreach my $segment (@alignment_segments) {
$segment->set_orientation($orientation);
}
@coords = sort {$a<=>$b} @coords;
my $lend = shift @coords;
my $rend = pop @coords;
$self->set_coords($lend, $rend);
$self->{length} = abs ($rend - $lend) + 1;
$self->{num_aligned_nts} = $num_nts_matched;
$self->{avg_per_id} = $per_id_x_length / $num_nts_matched;
if ($self->{avg_per_id} > 100) { die "Error, can't have average per_id > 100%\n";}
if ($self->{cdna_length} > 0) {
$self->{percent_cdna_aligned} = $num_nts_matched / $self->{cdna_length} * 100;
}
}
=over 4
=item get_alignment_segments()
B<Description:> Returns the alignment segments which comprise an alignment object, ordered by left coordinate position.
B<Parameters:> none
B<Returns:> @alignment_segments
@alignment_segments is a list of CDNA::Alignment_segment objects (see below).
=back
=cut
sub get_alignment_segments() {
my $self = shift;
return (sort {$a->{lend}<=>$b->{lend}} @{$self->{alignment_segs}});
}
=over 4
=item add_alignment_segment()
B<Description:> Adds a CDNA::Alignment_segment object to the list of segments of this CDNA_alignment object.
B<Parameters:> CDNA::Alignment_segment object.
B<Returns:> none.
=back
=cut
sub add_alignment_segment() {
my $self = shift;
my $segment = shift;
push (@{$self->{alignment_segs}}, $segment);
}
=over 4
=item delete_all_segments()
B<Description:> Empties the current list of Alignment_segment objects.
B<Parameters:> None.
B<Returns:> None.
=back
=cut
sub delete_all_segments () {
my $self = shift;
@{$self->{alignment_segs}} = (); #empty the array
}
sub set_error_flag () {
my $self = shift;
my $error = shift;
if ($self->{error_flag}) {
$self->{error_flag} .= $error;
} else {
$self->{error_flag} = $error;
}
}
=over 4
=item get_error_flag()
B<Description:> Provides the status of the aligments validation.
B<Parameters:> none.
B<Returns:> [error_text|0]
A text string containing the error is returned if an error exists. Otherwise, zero is returned indicating the lack of errors.
=back
=cut
sub get_error_flag () {
my $self = shift;
return ($self->{error_flag});
}
=over 4
=item toString()
B<Description:> Returns an alignment as lines of text providing coordinate information.
B<Parameters:> none.
B<Returns:> alignment_string
=back
=cut
sub toString() {
my $self = shift;
my $text = "\n\nAlignment: orientation: " . $self->{orientation} . "\n"
. "coords: " . $self->{lend} . "-" . $self->{rend} . "\n";
my @segments = $self->get_alignment_segments();
foreach my $segment (@segments) {
$text .= $segment->toString();
}
return ($text);
}
####
sub to_GFF3_format {
my $self = shift;
my %preferences = @_;
my $seq_id = $preferences{seq_id} || $self->{genome_acc} || confess "Need seq_id in preferences, or set genome_acc attribute of obj";
my $match_id = $preferences{match_id} or confess "Error, require match_id attribute";
my $source = $preferences{source} || "PASA";
my $orientation = $self->get_orientation();
my @alignment_segments = $self->get_alignment_segments();
my $acc = $self->get_acc() or confess "Error, accession for cdna_alignment is not available";
my $GFF3_text = "";
foreach my $alignment_segment (@alignment_segments) {
my ($genome_lend, $genome_rend) = $alignment_segment->get_coords();
my ($cdna_lend, $cdna_rend) = sort {$a<=>$b} $alignment_segment->get_mcoords();
my $gff_struct = { seq_id => $seq_id,
source => $source,
type => "cDNA_match",
lend => $genome_lend,
rend => $genome_rend,
strand => $orientation,
attributes => "ID=$match_id; Target=$acc $cdna_lend $cdna_rend +",
};
if (my $per_id = $alignment_segment->get_per_id()) {
$gff_struct->{score} = $per_id;
}
$GFF3_text .= &GFF_maker::get_GFF_line($gff_struct);
}
return ($GFF3_text);
}
####
sub to_GTF_format {
my $self = shift;
my %preferences = @_;
my $seq_id = $preferences{seq_id} || $self->{genome_acc} || confess "Need seq_id in preferences, or set genome_acc attribute of obj";
my $gene_id = $preferences{gene_id} || confess "Need gene_id in preferences";
my $transcript_id = $preferences{transcript_id} || $self->get_acc();
my $source = $preferences{source} || "PASA";
my $orientation = $self->get_orientation();
my ($trans_lend, $trans_rend) = sort {$a<=>$b} $self->get_coords();
my @alignment_segments = $self->get_alignment_segments();
my $acc = $self->get_acc() or confess "Error, accession for cdna_alignment is not available";
my $gtf_text = join("\t", ( $seq_id,
$source,
"transcript",
$trans_lend,
$trans_rend,
".",
$orientation,
".",
"gene_id \"$gene_id\"; transcript_id \"$transcript_id\";")
) . "\n";
foreach my $alignment_segment (@alignment_segments) {
my ($genome_lend, $genome_rend) = $alignment_segment->get_coords();
my ($cdna_lend, $cdna_rend) = sort {$a<=>$b} $alignment_segment->get_mcoords();
my $per_id = $alignment_segment->get_per_id() || ".";
$gtf_text .= join("\t", ( $seq_id,
$source,
"exon",
$genome_lend,
$genome_rend,
$per_id,
$orientation,
".",
"gene_id \"$gene_id\"; transcript_id \"$transcript_id\";")
) . "\n";
}
return ($gtf_text);
}
sub provide_cdna_segment_coords () {
my $self = shift;
my @segments = $self->get_alignment_segments();
my $coord_mapper = $self->{genome_cdna_coord_mapper};
foreach my $segment (@segments) {
my ($lend, $rend) = $segment->get_coords();
my $mlend = $coord_mapper->{$lend};
my $mrend = $coord_mapper->{$rend};
$segment->set_mcoords($mlend, $mrend);
}
}
=over 4
=item remap_cdna_segment_coords()
B<Description:> Method is used on an assembled alignment to renumber the coordinates of the assembled cDNA product, starting at 1 and ending at the assembly length. Orientation is determined by the validated spliced orientation.
B<Parameters:> none.
B<Returns:> none.
=back
=cut
sub remap_cdna_segment_coords () { ## Used when cDNA mapped coordinates aren't known, or when an assembly of other alignments was generated.
## reassigns cdna coords to alignment coords starting at 1 and ending at determined length.
## spliced orientation determines how the coords will map
## if spliced orient is ambiguous, aligned orient is used.
my $self = shift;
my %coord_mapper;
my @alignment_segments = $self->get_alignment_segments(); #remember, already in increasing order.
my $spliced_orient = $self->get_spliced_orientation();
my $aligned_orient = $self->get_aligned_orientation();
my $transcript_orientation = ($spliced_orient =~ /[\+\-]/) ? $spliced_orient : $aligned_orient;
if ($transcript_orientation eq '-') {
@alignment_segments = reverse @alignment_segments;
}
my $curr_pos = 0;
foreach my $segment (@alignment_segments) {
my ($lend, $rend) = $segment->get_coords();
my $seglength = abs ($rend - $lend) + 1;
my $mlend = $curr_pos + 1;
my $mrend = $curr_pos + $seglength;
$curr_pos += $seglength;
if ($transcript_orientation eq '-') {
($mlend, $mrend) = ($mrend, $mlend);
}
$segment->set_orientation($transcript_orientation);
#print "setting $mlend, $mrend\n";
$coord_mapper{$lend} = $mlend;
$coord_mapper{$rend} = $mrend;
$segment->set_mcoords($mlend, $mrend);
}
## reset aligned_orient to transcript_orient if different
## this should only happen when aligned orient and spliced orient are opposite.
## in which case, the aligned orient is reset to the spliced orient.
if ($aligned_orient ne $transcript_orientation) {
$self->set_orientation($transcript_orientation);
}
}
=over 4
=item toToken()
B<Description:> similar to the toString() method, but returns a pretty line of text summarizing the alignment and splice site data.
B<Parameters:> none.
B<Returns:> text_line
Here is an example:
orient(-/-) align: 103762(2613)-104359(2016)E<gt>CT....ACE<lt>104452(2015)-105229(1238)E<gt>CT....ACE<lt>105315(1237)-105482(1070)E<gt>CT....ACE<lt>105582(1069)-105842(809)E<gt>CT....ACE<lt>105935(808)-105985(758)E<gt>CT....ACE<lt>106071(757)-106316(512)E<gt>CT....ACE<lt>106394(511)-106619(286)E<gt>CT....ACE<lt>106712(285)-106879(118)E<gt>CT....ACE<lt>107427(117)-107543(1)
or
orient(+/+) align: 83074(1)-83318(245)E<gt>GT....AGE<lt>83637(246)-83702(311)E<gt>GT....AGE<lt>83796(312)-83846(362)E<gt>GT....AGE<lt>83938(363)-84017(442)E<gt>GT....AGE<lt>84308(443)-84352(487)E<gt>GT....AGE<lt>84467(488)-84507(528)
The orient specification includes (cDNA sequence alignment orientation/ spliced orientation). These are different when the reverse-complement of the sequence is provided, ascertained by the aligned orientation with consensus splice sites.
=back
=cut
sub toToken () {
my $self = shift;
my $orientation = $self->get_orientation();
my $spliced_orientation = $self->get_spliced_orientation();
my @alignment_segments = $self->get_alignment_segments();
my $assembled_token = "orient(a$orientation/s$spliced_orientation) align: ";
for (my $i = 0; $i <= $#alignment_segments; $i++) {
my $segment = $alignment_segments[$i];
$assembled_token .= $segment->toToken();
unless ($i == $#alignment_segments) {
$assembled_token .= "....";
}
}
return ($assembled_token);
}
sub set_num_segments {
my $self = shift;
my $num_segments = shift;
$self->{num_segments} = $num_segments;
}
=over 4
=item get_num_segments()
B<Description:> method provides the number of segments composing an alignment.
B<Parameters:> none.
B<Returns:> int
=back
=cut
sub get_num_segments {
my $self = shift;
return ($self->{num_segments});
}
sub set_spliced_orientation {
my $self = shift;
my $orientation = shift;
$self->{spliced_orientation} = $orientation;
}
=over 4
=item get_spliced_orientation ()
B<Description:> provides the validating spliced orientation for an alignment. In some cases this will be different from the alignment orientation; for example, in cases where the cDNA sequence is provided in the reverse orientation.
B<Parameters:> none
B<Returns:> [+|-|undef()]
undef is returned if the alignment did not validate properly. See get_error_flag()
=back
=cut
sub get_spliced_orientation {
my $self = shift;
return ($self->{spliced_orientation});
}
=over 4
=item set_acc()
B<Description:> Method sets the accession field of the cDNA sequence.
B<Parameters:> string
Provide the accession for the cDNA corresponding to this alignment.
B<Returns:> none.
=back
=cut
sub set_acc () {
my $self = shift;
my $acc = shift;
$self->{acc} = $acc;
}
=over 4
=item get_acc()
B<Description:> Method provides the accession for the cDNA in the alignment.
B<Parameters:> none
B<Returns:> string
=back
=cut
sub get_acc () {
my $self = shift;
return ($self->{acc});
}
=over 4
=item set_fli_status()
B<Description:> sets the is_fli attribute of the cDNA alignment, indicative of a full-length insert clone (or complete cDNA sequence).
B<Parameters:> [1|0]
1 = true, 0 = false.
B<Returns:> [1|0]
=back
=cut
sub set_fli_status {
my $self = shift;
my $is_fli_status = shift;
$self->{is_fli} = $is_fli_status;
}
=over 4
=item is_fli()
B<Description:>Provides the full-length insert status of the cDNA.
B<Parameters:> none.
B<Returns:> [0|1]
=back
=cut
sub is_fli {
my $self = shift;
return ($self->{is_fli});
}
=over 4
=item toAlignIllustration()
B<Description:> Provides a single line of text which illustrates the gapped alignment. See the example below.
B<Parameters:> none.
B<Returns:> string
Here is an example of an illustrated alignment:
------> <---> <----- (-)asmbl_6711
=back
=cut
sub toAlignIllustration () {
my ($self, $subtract, $rel_max, $max_line_chars) = @_;
my $spliced_orient = $self->get_spliced_orientation();
my $orient = $self->get_orientation();
my @segments = $self->get_alignment_segments();
my @chars = ();
my $converter = sub {my $coord = shift;
return ( int ( ($coord - $subtract)/$rel_max * $max_line_chars + 0.5));
};
foreach my $segment (@segments) {
my ($lend, $rend) = $segment->get_coords();
my $l_rel = &$converter($lend);
#print "lend: $lend -> l_rel: $l_rel\n" if $::SEE;
my $r_rel = &$converter($rend);
#print "rend: $rend -> r_rel: $r_rel\n" if $::SEE;
for (my $i = $l_rel; $i <= $r_rel; $i++) {
$chars[$i] = '-';
}
if ($segment->has_left_splice_junction()) {
$chars[$l_rel] = '<';
} elsif ( (! $segment->is_first()) && (! $segment->is_single_segment())) {
$chars[$l_rel] = '|';
}
if ($segment->has_right_splice_junction()) {
$chars[$r_rel] = '>';
} elsif ( (! $segment->is_last()) && (! $segment->is_single_segment())) {
$chars[$r_rel] = '|';
}
}
#fill rest of line with spaces.
for (my $i = 0; $i <= $#chars; $i++) {
unless ($chars[$i]) {
$chars[$i] = ' ';
}
}
my $outline = join ("", @chars);
my $acc = $self->get_acc();
$acc =~ tr/\t\n\000-\037\177-\377/\t\n/d; #remove any control characters from accession.
my $fli_status = ($self->is_fli()) ? " FL" : "";;
return ($outline . "\t(a$orient/s$spliced_orient)" . $acc . $fli_status);
}
=over 4
=item get_gene_obj_via_alignment()
B<Description:> Creates a Gene_obj object based on an alignment using the Exons_to_geneobj.pm module.
B<Parameters:>
B<Returns:> Gene_obj
The object returned is of the type Gene_obj defined in Gene_obj.pm
=back
=cut
sub get_gene_obj_via_alignment {
my $self = shift;
my $partial_info_href = shift; # { 5prime => 0|1, 3prime => 0|1 }, optional
my $orient = $self->get_spliced_orientation();
if ($orient eq '+' || '-') {
return($self->_get_gene_obj_via_alignment_by_orient($orient, $partial_info_href));
}
elsif ($orient eq '?') {
## find the orientation that provides the longest ORF
my $plus_orient_gene = $self->_get_gene_obj_via_alignment_by_orient('+', $partial_info_href);
my $minus_orient_gene = $self->_get_gene_obj_via_alignment_by_orient('-', $partial_info_href);
if ($plus_orient_gene->get_CDS_length() >= $minus_orient_gene->get_CDS_length()) {
return($plus_orient_gene);
}
else {
return($minus_orient_gene);
}
}
else {
confess "cannot process spliced orientation of $orient ";
}
}
sub _get_gene_obj_via_alignment_by_orient {
my ($self, $orient, $partial_info_href) = @_;
my @alignment_segments = $self->get_alignment_segments();
my %coords;
foreach my $segment (@alignment_segments) {
my ($end5, $end3) = $segment->get_coords();
if ($orient eq '-') {
($end5, $end3) = ($end3, $end5); #force coordinates to contain orientation info.
}
$coords{$end5} = $end3;
}
my $genomic_seq_ref = $self->{genomic_seq};
my $gene_obj;
if ($genomic_seq_ref) {
## find ORF
$gene_obj = Exons_to_geneobj::create_gene_obj(\%coords, $genomic_seq_ref, $partial_info_href);
} else {
## No ORF
$gene_obj = new Gene_obj;
$gene_obj->populate_gene_obj({}, \%coords);
}
return ($gene_obj);
}
=over 4
=item force_spliced_validation()
B<Description:> Routine used for testing purposes. Any fake alignment can be created and set to validate to the corresponding orientation using this routine.
B<Parameters:> [+|-]
B<Returns:> none.
=back
=cut
sub force_spliced_validation {
my $self = shift;
my $orientation = shift;
unless ($orientation eq '+' || $orientation eq '-') {
croak ("cannot force spliced validation to $orientation.\n");
return;
}
my ($right_splice, $left_splice) = ($orientation eq '+') ? ('XX','YY') : ('YY','XX');
my @alignment_segments = $self->get_alignment_segments();
foreach my $alignment_segment (@alignment_segments) {
if ($alignment_segment->is_internal() || $alignment_segment->is_last()) {
$alignment_segment->set_left_splice_junction(1);
$alignment_segment->set_left_splice_site_chars($left_splice);
}
if ($alignment_segment->is_internal() || $alignment_segment->is_first()) {
$alignment_segment->set_right_splice_junction(1);
$alignment_segment->set_right_splice_site_chars($right_splice);
}
$alignment_segment->set_orientation($orientation);
}
$self->set_spliced_orientation($orientation);
}
=over 4
=item clone()
B<Description:>Clones a CDNA_alignment object into a new CDNA_alignment object with same attributes. Performs a deep copy, so all alignment segments contained within the cloned CDNA_alignment object are also clones.
B<Parameters:> none.
B<Returns:> new CDNA_alignment
=back
=cut
sub clone {
my $self = shift;
my $packagename = ref $self;
my $clone = {};
bless ($clone, $packagename);
foreach my $key (keys %$self) {
$clone->{$key} = $self->{$key};
}
$clone->{alignment_segs} = [];
foreach my $alignment_segment ($self->get_alignment_segments()) {
$clone->add_alignment_segment($alignment_segment->clone());
}
return ($clone);
}
=over 4
=item get_genomic_seq_ref()
B<Description:> Returns a scalar refernence to the genomic sequence.
B<Parameters:> none.
B<Returns:> string_ref
=back
=cut
sub get_genomic_seq_ref {
my $self = shift;
return ($self->{genomic_seq});
}
=over 4
=item extractSplicedSequence()
B<Description:> Returns a string corresponding to the spliced cDNA sequence.
B<Parameters:> none.
B<Returns:> scalar
=back
=cut
sub extractSplicedSequence {
my $self = shift;
my ($genomic_seq_ref) = @_;
my $genomic_seq = $genomic_seq_ref;
unless ($genomic_seq) {
$genomic_seq = $self->{genomic_seq};
unless (ref $genomic_seq) {
confess "Can't extract the spliced sequence when no genomic sequence reference is available.\n";
}
}
my @segments = $self->get_alignment_segments();
my $splicedSequence = "";
my $toggle = 0;
foreach my $segment (@segments) {
my ($lend, $rend) = $segment->get_coords();
my $length = abs ($rend - $lend) + 1;
my $exonseq = substr ($$genomic_seq, $lend - 1, $length);
# alternate case among segments to facilitate manual identification of junctions.
if ($toggle) {
$exonseq = lc $exonseq;
$toggle = 0;
}
else {
$exonseq = uc $exonseq;
$toggle = 1;
}
$splicedSequence .= $exonseq;
}
my $orient = $self->get_orientation();
if ($orient eq "-") {
#reverse complement the sequence:
$splicedSequence = reverse ($splicedSequence);
$splicedSequence =~tr/ACGTacgtyrkmYRKM/TGCAtgcarymkRYMK/;
}
return ($splicedSequence);
}
=over 4
=item get_cDNA_to_genomic_coordinates()
B<Description:> converts a cDNA sequence -relative coordinates to the corresponding genomic sequence coordinates.
B<Parameters:> @coordinates
list of integers
B<Returns:> @converted_coordinates
list of integers.
=back
=cut
sub get_cDNA_to_genomic_coordinates {
my $self = shift;
my @coordinates = @_;
my @ret_coordinates = ();
my @segments = $self->get_alignment_segments();
foreach my $cdna_coord (@coordinates) {
my $corresponding_segment;
foreach my $seg (@segments) {
my ($mlend, $mrend) = sort {$a<=>$b} $seg->get_mcoords();
if ($cdna_coord >= $mlend && $cdna_coord <= $mrend) {
$corresponding_segment = $seg;
last;
}
}
unless (ref $corresponding_segment) {
confess "Error, cDNA coordinate ($cdna_coord) not found in segment list: " . $self->toToken();
}
my ($lend, $rend) = $corresponding_segment->get_coords();
my ($mlend, $mrend) = $corresponding_segment->get_mcoords();
my $orient = $corresponding_segment->get_orientation();
my $genomic_coord = undef;
if ($orient eq '+') {
my $diff = $cdna_coord - $mlend;
$genomic_coord = $lend + $diff;
} elsif ($orient eq '-') {
## mlend > mrend
my $diff = $cdna_coord - $mrend;
$genomic_coord = $rend - $diff;
}
push (@ret_coordinates, $genomic_coord);
}
return (@ret_coordinates);
}
=item get_genomic_to_cDNA_coordinates()
B<Description:> converts coordinates in the genome to coordinates in the cDNA:
B<Parameters:> @coordinates
list of integers
B<Returns:> @converted_coordinates
list of integers. Undef is returned for each position that could not be converted to the genomic coordinate system because it was not found within the aligned region.
=back
=cut
####
sub get_genomic_to_cDNA_coordinates {
my $self = shift;
my @coordinates = @_;
my @ret_coordinates = ();
my @segments = $self->get_alignment_segments();
foreach my $genomic_coord (@coordinates) {
my $corresponding_segment;
foreach my $seg (@segments) {
my ($lend, $rend) = $seg->get_coords();
if ($genomic_coord >= $lend && $genomic_coord <= $rend) {
$corresponding_segment = $seg;
last;
}
}
unless (ref $corresponding_segment) {
confess "Error, genomic coordinate ($genomic_coord) not found in segment list:" . $self->toToken();
}
my ($lend, $rend) = $corresponding_segment->get_coords();
my ($mlend, $mrend) = $corresponding_segment->get_mcoords();
my $orient = $corresponding_segment->get_orientation();
my $diff = $rend - $genomic_coord;
my $cdna_coord = undef;
if ($orient eq '+') {
$cdna_coord = $mrend - $diff;
} elsif ($orient eq '-') {
$cdna_coord = $mrend + $diff;
}
push (@ret_coordinates, $cdna_coord);
}
return (@ret_coordinates);
}
####
sub overlaps_genome_span {
my ($self, $other_alignment) = @_;
my ($lend_A, $rend_A) = sort {$a<=>$b} $self->get_coords();
my ($lend_B, $rend_B) = sort {$a<=>$b} $other_alignment->get_coords();
if (&_overlap($lend_A, $rend_A, $lend_B, $rend_B)) {
return(1);
}
else {
return(0);
}
}
sub has_overlapping_segment {
my ($self, $other_alignment) = @_;
unless ($self->overlaps_genome_span($other_alignment)) {
return(0);
}
my @self_segments = $self->get_alignment_segments();
my @other_segments = $self->get_alignment_segments();
foreach my $segment_A (@self_segments) {
my ($lend_A, $rend_A) = sort {$a<=>$b} $segment_A->get_coords();
foreach my $segment_B (@other_segments) {
my ($lend_B, $rend_B) = sort {$a<=>$b} $segment_B->get_coords();
if (&_overlap($lend_A, $rend_A, $lend_B, $rend_B) ) {
return(1);
}
}
}
return(0); # no overlapping segment
}
=over 4
=item is_compatible()
B<Description:> Returns true (1) if this alignment is found compatible with the other_alignment_obj
B<Parameters:> ($other_alignment_obj, $fuzz_dist)
Alignments A and B are of type CDNA::CDNA_alignment
B<Returns:> [1|0]
Compatibility between alignment objects requires:
-within their region of overlap, introns are identical
-if both have spliced orientations, they must be on the same strand
=back
=cut
####
sub is_compatible {
my $self = shift;
my ($other_alignment, $fuzzlength) = @_;
if (!defined $fuzzlength) {
$fuzzlength = 0; ## no fuzzy termini allowed
}
## The compatibility test requires:
# -alignments must have the same spliced orientation if not ?
# -alignments must overlap
# -alignments must have identical introns in their region of overlap (taking into account the fuzz distance for terminal exons)
my ($a_lend, $a_rend) = $self->get_coords();
my $a_spliced_orient = $self->get_spliced_orientation();
my $a_num_segments = $self->get_num_segments();
my ($b_lend, $b_rend) = $other_alignment->get_coords();
my $b_spliced_orient = $other_alignment->get_spliced_orientation();
my $b_num_segments = $other_alignment->get_num_segments();
## overlap test:
unless (&_overlap($a_lend, $a_rend, $b_lend, $b_rend)) {
return (0); # not compatible
}
## transcribed orientation test:
if ($a_spliced_orient ne $b_spliced_orient && $a_spliced_orient ne '?' && $b_spliced_orient ne '?') {
# neither alignment is ambiguously oriented and they're transcribed on opposite strands:
return (0); # not compatible
}
## same introns test:
if ($a_num_segments > 1 || $b_num_segments > 1) {
my @a_introns = $self->get_intron_coords();
my @b_introns = $self->get_intron_coords();
my ($overlapping_lend, $overlapping_rend) = &_get_coords_of_overlap($a_lend, $a_rend, $b_lend, $b_rend);
print "Overlapping coords between alignments: $overlapping_lend to $overlapping_rend\n" if $SEE;
## make adjustments to required overlap coordinates considering fuzzlength:
if ($fuzzlength) {
## adjust left overlap boundary requirement
$overlapping_lend = &_adjust_left_overlap_boundary_via_fuzzlength($overlapping_lend, $self, $other_alignment, $fuzzlength);
$overlapping_rend = &_adjust_right_overlap_boundary_via_fuzzlength($overlapping_rend, $self, $other_alignment, $fuzzlength);
print "\tadjusted overlapping coords to: $overlapping_lend to $overlapping_rend\n" if $SEE;
}
## find introns within adjusted overlap range and ensure identity
my @a_intron_coords = $self->get_intron_coords();
my @b_intron_coords = $other_alignment->get_intron_coords();
my @a_introns_in_range = &_get_overlapping_capped_introns($overlapping_lend, $overlapping_rend, \@a_intron_coords);
my @b_introns_in_range = &_get_overlapping_capped_introns($overlapping_lend, $overlapping_rend, \@b_intron_coords);
if (@a_introns_in_range || @b_introns_in_range) {
## ensure identity:
my %all_introns;
my %a_introns;
foreach my $coordset (@a_introns_in_range) {
my $key = join (",", @$coordset);
$a_introns{$key} = 1;
$all_introns{$key} = 1;
}
my %b_introns;
foreach my $coordset (@b_introns_in_range) {
my $key = join (",", @$coordset);
$b_introns{$key} = 1;
$all_introns{$key} = 1;
}
foreach my $intron_key (keys %all_introns) {
unless ($a_introns{$intron_key} && $b_introns{$intron_key}) {
return (0); # not compatible, an intron difference in the overlapping region exists.
}
}
}
}
## if got this far, passed all compatibility tests.
return (1); # yes, compatible.
}
####
sub _get_contained_coords {
my ($lend, $rend, $coordsets_aref) = @_;
my @contained_coords;
foreach my $coordset (@$coordsets_aref) {
my ($coord_lend, $coord_rend) = sort {$a<=>$b} @$coordset;
if ($lend <= $coord_lend && $coord_rend <= $rend) {
## coordset contained
push (@contained_coords, $coordset);
}
}
return (@contained_coords);
}
####
# get introns that overlap lend and rend, and set termini of intron coords to these values if they extend beyond them.
sub _get_overlapping_capped_introns {
my ($lend, $rend, $coordsets_aref) = @_;
my @contained_coords;
foreach my $coordset (@$coordsets_aref) {
my ($coord_lend, $coord_rend) = sort {$a<=>$b} @$coordset;
if ($lend <= $coord_rend && $rend >= $coord_lend) {
## coordset contained
if ($coord_lend < $lend) {
$coord_lend = $lend;
}
if ($coord_rend > $rend) {
$coord_rend = $rend;
}
push (@contained_coords, [$coord_lend, $coord_rend]);
}
}
return (@contained_coords);
}
####
sub _adjust_left_overlap_boundary_via_fuzzlength {
my ($overlapping_lend, $alignment_a, $alignment_b, $fuzzlength) = @_;
## make adjustments to take into account fuzzlength and existing intron coordinates and adjacent segments
## we trust short aligment segments that precede an intron, even if they're shorter than the fuzzlength
my $a_overlapping_segment = $alignment_a->find_segment_containing_coord($overlapping_lend);
my $b_overlapping_segment = $alignment_b->find_segment_containing_coord($overlapping_lend);
my @bounds = ();
if ($a_overlapping_segment) {
my ($a_seg_lend, $a_seg_rend) = $a_overlapping_segment->get_coords();
push (@bounds, $a_seg_rend);
}
if ($b_overlapping_segment) {
my ($b_seg_lend, $b_seg_rend) = $b_overlapping_segment->get_coords();
push (@bounds, $b_seg_rend);
}
if (@bounds) {
my $max_bound = max_coord(@bounds);
my $delta = $max_bound - $overlapping_lend;
if ($delta < 0) {
confess "Error, delta left bound is less than zero";
}
my $fuzz_employed = min_coord($fuzzlength, $delta);
$overlapping_lend += $fuzz_employed;
}
return ($overlapping_lend);
}
####
sub _adjust_right_overlap_boundary_via_fuzzlength {
my ($overlapping_rend, $alignment_a, $alignment_b, $fuzzlength) = @_;
## make adjustments to take into account fuzzlength and existing intron coordinates and adjacent segments
## we trust short aligment segments that precede an intron, even if they're shorter than the fuzzlength
my $a_overlapping_segment = $alignment_a->find_segment_containing_coord($overlapping_rend);
my $b_overlapping_segment = $alignment_b->find_segment_containing_coord($overlapping_rend);
my @bounds = ();
if ($a_overlapping_segment) {
my ($a_seg_lend, $a_seg_rend) = $a_overlapping_segment->get_coords();
push (@bounds, $a_seg_lend);
}
if ($b_overlapping_segment) {
my ($b_seg_lend, $b_seg_rend) = $b_overlapping_segment->get_coords();
push (@bounds, $b_seg_lend);
}
if (@bounds) {
my $min_bound = min_coord(@bounds);
my $delta = $overlapping_rend - $min_bound;
if ($delta < 0) {
confess "Error, delta left bound is less than zero";
}
my $fuzz_employed = min_coord($fuzzlength, $delta);
$overlapping_rend -= $fuzz_employed;
}
return ($overlapping_rend);
}
####
sub find_segment_containing_coord {
my $self = shift;
my ($coord) = @_;
my @segments = $self->get_alignment_segments();
foreach my $segment (@segments) {
my ($lend, $rend) = $segment->get_coords();
if ($lend <= $coord && $coord <= $rend) {
return ($segment);
}
}
return (undef); # none found
}
sub _overlap {
my ($a1_lend, $a1_rend, $a2_lend, $a2_rend) = @_;
#print "Checking overlap @_\t";
if ($a2_rend >= $a1_lend && $a2_lend <= $a1_rend) { #overlap
#print "YES\n";
return (1);
} else {
#print "NO\n";
return (0);
}
}
=over 4
=item encapsulates()
B<Description:> Returns true (1) if this alignment encapsulates the span of the other alignment object
B<Parameters:> ($other_alignment_obj, $fuzz_dist)
Alignments A and B are of type CDNA::CDNA_alignment
B<Returns:> [1|0]
=back
=cut
sub encapsulates {
my $self = shift;
my ($alignmentB, $fuzz_dist) = @_;
my $alignmentA = $self;
if (! defined $fuzz_dist) {
$fuzz_dist = 0;
}
my ($alend, $arend) = $alignmentA->get_coords();
my ($blend, $brend) = $alignmentB->get_coords();
if ($blend + $fuzz_dist >= $alend &&
$brend - $fuzz_dist <= $arend)
{
return (1);
} else {
return (0);
}
}
####
sub _get_coords_of_overlap {
my ($a_lend, $a_rend, $b_lend, $b_rend) = @_;
unless (&_overlap($a_lend, $a_rend, $b_lend, $b_rend)) {
confess "Error, trying to get coordinates of overlapping region for two features that do not overlap!";
}
my $overlapping_lend = &max_coord($a_lend, $b_lend);
my $overlapping_rend = &min_coord($a_rend, $b_rend);
return ($overlapping_lend, $overlapping_rend);
}
####
sub min_coord {
my @coords = @_;
@coords = sort {$a<=>$b} @coords;
my $min_coord = shift @coords;
return ($min_coord);
}
####
sub max_coord {
my @coords = @_;
@coords = sort {$a<=>$b} @coords;
my $max_coord = pop @coords;
return ($max_coord);
}
1; #EOM