biyelunwen/99.scripts/trinity_utils/PerlLib/KmerGraphLib/AlignGraph.pm

150 lines
3.4 KiB
Perl
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package main;
our $SEE;
package AlignGraph;
use strict;
use warnings;
use Carp;
use AlignNode;
use base qw (ReadCoverageGraph);
no warnings qw (recursion);
sub new {
my $packagename = shift;
my $self = $packagename->SUPER::new();
## Nodes exist in order of end5 -> end3 at all times. Strand is included in the node name just for safety reasons.
bless ($self, $packagename);
return($self);
}
sub add_alignment {
my $self = shift;
my ($read_acc, $scaffold, $strand, $genome_coords_aref) = @_;
my @align_positions;
foreach my $coordset (sort {$a->[0]<=>$b->[0]} @$genome_coords_aref) {
my ($lend, $rend) = sort {$a<=>$b} @$coordset;
for (my $i = $lend; $i <= $rend; $i++) {
push (@align_positions, $i);
}
}
$self->_add_ordered_positions_to_graph(\@align_positions, $strand, $read_acc);
return;
}
sub _add_ordered_positions_to_graph {
my $self = shift;
my ($ordered_positions_aref, $strand, $read_acc) = @_;
unless (ref $ordered_positions_aref eq 'ARRAY') {
confess "Error, require ordered position list";
}
unless ($strand =~ /^[\+\-]$/) {
confess "strand must be: + or - ";
}
my @align_positions = @$ordered_positions_aref;
if ($strand eq '-') {
@align_positions = reverse @align_positions;
}
my $prev_align_pos = shift @align_positions;
while (@align_positions) {
my $next_align_pos = shift @align_positions;
# print "\tadding $next_align_pos\n";
my $prev_align_node = $self->get_or_create_node("$prev_align_pos,$strand", $read_acc);
my $next_align_node = $self->get_or_create_node("$next_align_pos,$strand", $read_acc);
$self->link_adjacent_nodes($prev_align_node, $next_align_node);
$prev_align_pos = $next_align_pos;
}
return;
}
sub get_all_nodes {
my $self = shift;
return( sort {$a->{_value} cmp $b->{_value}} $self->SUPER::get_all_nodes());
}
1; #EOM
=CIGAR_format
from: http://bioperl.org/pipermail/bioperl-l/2003-March/011591.html
cigar line format (where CIGAR stands for Concise
Idiosyncratic Gapped Alignment Report).
In the cigar line format alignments are sotred as follows:
M: Match
D: Deletino
I: Insertion
An example of an alignment for a hypthetical protein match is shown
below:
Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
PG P G GP R PLGP
Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
protein_align_feature table as the following cigar line:
7M4D12M2I2MD7M
From SAM documentation:
Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one.
Subsequences at the ends may be clipped off. We introduce operation ʻSʼ to describe (softly) clipped alignment. Here is
an example. Suppose the clipped alignment is:
REF: AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
READ: gggGTGTAACC-GACTAGgggg
where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for
this alignment is: 3S8M1D6M4S.
Spliced alignment. In cDNA-to-genome alignment, we may want to distinguish introns from deletions in exons. We
introduce operation ʻNʼ to represent long skip on the reference sequence. Suppose the spliced alignment is:
REF: AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
READ: GTGTAACCC................................TCAGAATA
where ʻ...ʼ on the read sequence indicates the intron. The CIGAR for this alignment is: 9M32N8M.
=cut