1135 lines
23 KiB
Perl
1135 lines
23 KiB
Perl
package main;
|
|
our $SEE;
|
|
|
|
package KmerGraph;
|
|
|
|
use strict;
|
|
use warnings;
|
|
use Carp;
|
|
use KmerNode;
|
|
use ReadTracker;
|
|
|
|
use base qw (ReadCoverageGraph);
|
|
|
|
no warnings qw (recursion);
|
|
|
|
|
|
my $MIN_DISPLAY_SEQ_TAG_LENGTH = 10; # lower than this, and shows up in the dot file as a label.
|
|
|
|
sub new {
|
|
my $packagename = shift;
|
|
my $KmerLength = shift;
|
|
|
|
unless ($KmerLength > 1) { croak "Error, need KmerLength > 1"; }
|
|
|
|
my $self = {
|
|
_nodes => {}, #kmerSeq => nodeAddress until after compaction... becomes nodeaddress => $nodeaddress
|
|
KmerLength => $KmerLength,
|
|
compacted => 0,
|
|
color_to_accs => {},
|
|
|
|
};
|
|
|
|
bless ($self, $packagename);
|
|
|
|
return($self);
|
|
}
|
|
|
|
|
|
sub add_sequence_to_graph {
|
|
my $self = shift;
|
|
my ($acc, $sequence, $weight, $color) = @_; ## Sequence type can be undef, ReferenceTranscript, or GreedyPath
|
|
|
|
unless (defined($weight) && $weight =~ /^\d+/) {
|
|
confess "Error, params: (seq, weight, color), and weight must be numeric.";
|
|
}
|
|
|
|
if ($self->{compacted}) {
|
|
confess "Cannot add more sequences to the graph after it has been compacted";
|
|
}
|
|
|
|
push (@{$self->{color_to_accs}->{$color}}, $acc); # add this accession to color indicator. # prefer 1-1 relationship here.
|
|
|
|
$sequence = uc $sequence;
|
|
|
|
my $KmerLength = $self->{KmerLength};
|
|
|
|
if (length($sequence) < $KmerLength+1) {
|
|
warn "sequence $sequence is less than KmerLength $KmerLength+1; skipping it\n";
|
|
return;
|
|
}
|
|
|
|
my $prevKmer = substr($sequence, 0, $KmerLength);
|
|
my $prevKmerNode = $self->get_or_create_node($prevKmer);
|
|
$prevKmerNode->{_count}++;
|
|
$prevKmerNode->track_reads($acc);
|
|
|
|
if ($color) {
|
|
$prevKmerNode->add_color($color);
|
|
}
|
|
|
|
for (my $i = 1; $i <= length($sequence)-$KmerLength; $i++) {
|
|
my $nextKmer = substr($sequence, $i, $KmerLength);
|
|
my $nextKmerNode = $self->get_or_create_node($nextKmer);
|
|
$nextKmerNode->track_reads($acc);
|
|
|
|
if ($color) {
|
|
$nextKmerNode->add_color($color);
|
|
}
|
|
|
|
$nextKmerNode->{_count}++;
|
|
|
|
my $prev_edge_weight = $self->get_edge_count($prevKmerNode, $nextKmerNode);
|
|
|
|
## only link together kmers that contain only recognizable nucleotides
|
|
$self->link_adjacent_nodes($prevKmerNode, $nextKmerNode, $weight);
|
|
|
|
my $edge_weight = $self->get_edge_count($prevKmerNode, $nextKmerNode);
|
|
if ($edge_weight < $weight) {
|
|
confess "Error, added weight $weight to edge $prevKmerNode -> $nextKmerNode and didn't stick";
|
|
}
|
|
|
|
if ($prev_edge_weight > $edge_weight) {
|
|
confess "Error, after adding new edge, prev weight of $prev_edge_weight became: $edge_weight ";
|
|
}
|
|
#print "$prevKmerNode -> $nextKmerNode : $prev_edge_weight -> $edge_weight\n";
|
|
|
|
$prevKmerNode = $nextKmerNode;
|
|
$prevKmer = $nextKmer;
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
sub get_or_create_node {
|
|
my $self = shift;
|
|
my ($node_name, $read_accession) = @_;
|
|
|
|
if ($self->node_exists($node_name)) {
|
|
my $node = $self->get_node($node_name);
|
|
return($node);
|
|
|
|
}
|
|
else {
|
|
# instantiate it, add it to the graph
|
|
return($self->create_node($node_name));
|
|
}
|
|
}
|
|
|
|
|
|
sub create_node {
|
|
my $self = shift;
|
|
|
|
my ($node_name) = @_;
|
|
|
|
if ($self->node_exists($node_name)) {
|
|
confess "Error, node $node_name already exists in the graph";
|
|
}
|
|
|
|
my $node = new KmerNode($node_name);
|
|
|
|
$self->{_nodes}->{$node_name} = $node;
|
|
|
|
return($node);
|
|
}
|
|
|
|
|
|
|
|
|
|
####
|
|
sub compact_graph {
|
|
my $self = shift;
|
|
|
|
$self->validate_graph();
|
|
|
|
my $kmer_length = $self->{KmerLength};
|
|
|
|
unless ($self->{compacted}) {
|
|
$self->_prep_graph_for_compaction();
|
|
|
|
$self->{compacted} = 1;
|
|
}
|
|
|
|
my $COMPACTED = 0; # flag
|
|
|
|
|
|
## identify all nodes that are branched on the left and not branched on the right
|
|
## then join together the unbranched nodes starting from these and walking right.
|
|
|
|
my @nodes = $self->get_all_nodes();
|
|
|
|
## merge unbranched nodes in runs.
|
|
|
|
|
|
|
|
|
|
####
|
|
#### \
|
|
#### (-)---
|
|
#### /
|
|
|
|
# or
|
|
|
|
# (-)---
|
|
|
|
|
|
# or
|
|
|
|
#### (-)------------
|
|
#### /
|
|
#### -
|
|
#### \
|
|
#### (-) ------------
|
|
|
|
|
|
# or, single line with incompatible decorations:
|
|
|
|
# aaaaaaaaaa b bbbbbbbbbb
|
|
# ----------(-)----------
|
|
|
|
|
|
my @init_nodes;
|
|
foreach my $node (@nodes) {
|
|
my @prev_nodes = $node->get_all_prev_nodes();
|
|
|
|
my @next_nodes = $node->get_all_next_nodes();
|
|
|
|
if
|
|
|
|
( (scalar(@next_nodes) == 1)
|
|
&&
|
|
|
|
(
|
|
|
|
|
|
|
|
|
|
(
|
|
(scalar(@prev_nodes) != 1) # branched before or no node before.
|
|
||
|
|
(scalar (@prev_nodes) == 1 && scalar($prev_nodes[0]->get_all_next_nodes() > 1) ) # previous node branches off.
|
|
|
|
)
|
|
|
|
||
|
|
|
|
## decoration switch
|
|
(scalar(@prev_nodes) == 1 && (! &_compatible_decorations($node, $prev_nodes[0])) && (&_compatible_decorations($node, $next_nodes[0])) )
|
|
|
|
)
|
|
|
|
)
|
|
{
|
|
# got one
|
|
push (@init_nodes, $node);
|
|
}
|
|
|
|
}
|
|
|
|
print "Got " . scalar(@init_nodes) . " init nodes.\n" if $main::SEE;
|
|
|
|
#foreach my $node (@init_nodes) {
|
|
# print "Init node $node:\n" . $node->toString() . "\n";# if $main::SEE;
|
|
#}
|
|
|
|
|
|
## Collapse neighboring nodes before branching
|
|
foreach my $node (@init_nodes) {
|
|
|
|
#print "Init node $node:\n" . $node->toString() . "\n";# if $main::SEE;
|
|
|
|
my @unbranched_path;
|
|
|
|
my ($next_node) = $node->get_all_next_nodes();
|
|
|
|
unless ($next_node) {
|
|
print "warning, init node now lacks next nodes... skipping.\n"; # potential bug? or circular dealt with already?
|
|
next;
|
|
}
|
|
|
|
my %seen = ($node => 1);
|
|
|
|
while (1) {
|
|
# avoid circularity
|
|
if ($seen{$next_node}) {
|
|
print "Seen $next_node already... avoiding circularity.\n" if $main::SEE;
|
|
last;
|
|
}
|
|
$seen{$next_node} = 1;
|
|
|
|
unless (&_compatible_decorations($node, $next_node)) {
|
|
print "Incompatible decorations.\n" if $main::SEE;
|
|
last;
|
|
}
|
|
|
|
# check for left branching on next node
|
|
my $num_prev_nodes = scalar ($next_node->get_all_prev_nodes());
|
|
if (scalar $num_prev_nodes > 1) {
|
|
print "Got $num_prev_nodes prev nodes for $next_node; terminating extension.\n" if $main::SEE;
|
|
last;
|
|
}
|
|
|
|
push (@unbranched_path, $next_node);
|
|
my @next_nodes = $next_node->get_all_next_nodes();
|
|
|
|
if (scalar @next_nodes != 1) {
|
|
last;
|
|
}
|
|
$next_node = shift @next_nodes;
|
|
|
|
}
|
|
|
|
if ($main::SEE) {
|
|
print "Single unbranched path:\n";
|
|
$self->print_path($node, @unbranched_path);
|
|
}
|
|
|
|
foreach my $next_node (@unbranched_path) {
|
|
# merge pairs
|
|
$self->_append_node($node, $next_node);
|
|
$COMPACTED = 1;
|
|
|
|
}
|
|
|
|
if ($main::SEE) {
|
|
print "Collapsed:\n";
|
|
$self->print_path($node);
|
|
print "\n\n\n";
|
|
}
|
|
|
|
}
|
|
|
|
|
|
$self->validate_graph();
|
|
|
|
|
|
return ($COMPACTED);
|
|
}
|
|
|
|
####
|
|
sub _append_node {
|
|
my $self = shift;
|
|
my ($node, $next_node) = @_;
|
|
|
|
## In compacted mode already.
|
|
|
|
if (! _compatible_decorations($node, $next_node)) {
|
|
confess "Error, trying to join nodes with incompatible decorations.";
|
|
}
|
|
|
|
my $KmerLength = $self->{KmerLength};
|
|
|
|
|
|
# add counts
|
|
$node->set_count($node->get_count() + $next_node->get_count());
|
|
|
|
my $orig_node_seq = $node->get_sequence();
|
|
my $orig_next_node_seq = $next_node->get_sequence();
|
|
|
|
my $new_node_sequence = $orig_node_seq . substr($orig_next_node_seq, $KmerLength-1); # exclude K-1 prefix.
|
|
|
|
# add terminal base
|
|
$node->set_sequence($new_node_sequence);
|
|
|
|
# add read evidence
|
|
$node->{_ReadTracker}->append_to_ReadTracker($next_node->{_ReadTracker});
|
|
|
|
# sever connection between node and next
|
|
$node->delete_next_node($next_node);
|
|
$next_node->delete_prev_node($node);
|
|
|
|
# sever connection between next and next-next nodes, and reconnect next-next's with node.
|
|
|
|
my @next_node_next_nodes = $next_node->get_all_next_nodes();
|
|
|
|
my %next_next_node_edge_count;
|
|
|
|
## update edge counts.
|
|
foreach my $next_next_node (@next_node_next_nodes) {
|
|
my $edge_count = $self->get_edge_count($next_node, $next_next_node);
|
|
|
|
$next_next_node_edge_count{$next_next_node} = $edge_count;
|
|
}
|
|
|
|
$self->delete_node_from_graph($next_node);
|
|
|
|
foreach my $next_next_node (@next_node_next_nodes) {
|
|
$self->link_adjacent_nodes($node, $next_next_node, $next_next_node_edge_count{$next_next_node});
|
|
|
|
}
|
|
|
|
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
####
|
|
sub toGraphViz {
|
|
my $self = shift;
|
|
|
|
my (%settings) = @_;
|
|
|
|
my @nodes = $self->get_all_nodes();
|
|
|
|
my $text = "digraph G {\n";
|
|
|
|
$text .=
|
|
#"node [width=0.1,height=0.1,fontsize=10,shape=point];\n"
|
|
"node [width=0.1,height=0.1,fontsize=10];\n"
|
|
. "edge [fontsize=12];\n"
|
|
. "margin=1.0;\n"
|
|
. "rankdir=LR;\n"
|
|
. "labeljust=l;\n";
|
|
|
|
|
|
foreach my $node (sort {$a->get_ID() cmp $b->get_ID()} @nodes) {
|
|
|
|
my @prev_nodes = $node->get_all_prev_nodes();
|
|
my @next_nodes = $node->get_all_next_nodes();
|
|
|
|
my $sequence = $node->get_sequence();
|
|
|
|
if (my $min_len = $settings{no_short_singletons}) {
|
|
|
|
if ( scalar(@prev_nodes) == 0
|
|
&&
|
|
scalar(@next_nodes) == 0
|
|
&&
|
|
length($sequence) < $min_len) {
|
|
|
|
next;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
my $seqLen = length($sequence);
|
|
|
|
my $count = $node->get_count();
|
|
|
|
my $len = $seqLen;
|
|
|
|
if (scalar(@prev_nodes)==0) { # no left connecting node:
|
|
$len = $seqLen - ($self->{KmerLength} - 1);
|
|
}
|
|
|
|
my $avg_count = $count; #int($count/$len + 0.5);
|
|
|
|
my $id = hex($node->get_ID());
|
|
|
|
my $atts = "";
|
|
|
|
if (length($sequence) < $MIN_DISPLAY_SEQ_TAG_LENGTH) {
|
|
$atts = "-$sequence";
|
|
}
|
|
else {
|
|
$atts = "-" . substr($sequence, 0, 3) . "..." . substr($sequence, -3);
|
|
}
|
|
|
|
my $depth ="";
|
|
if (defined $node->{depth}) {
|
|
$depth = $node->{depth};
|
|
}
|
|
|
|
#$text .= "\t$id \[label=\"$id-L$seqLen-C$avg_count$atts\"$color];\n";
|
|
$text .= "\t$id \[label=\"D:$depth-L$seqLen-C$avg_count$atts\"];\n";
|
|
|
|
|
|
foreach my $next_node (@next_nodes) {
|
|
|
|
my $next_id = hex($next_node->get_ID());
|
|
|
|
my @colors_in_common = &get_colors_in_common($node, $next_node);
|
|
|
|
my $edge_weight = $self->get_edge_count($node, $next_node);
|
|
|
|
if (@colors_in_common) {
|
|
foreach my $color (@colors_in_common) {
|
|
|
|
my $label = "$edge_weight";
|
|
unless ($color eq 'black') { # reserved for general data
|
|
my @accs = @{$self->{color_to_accs}->{$color}};
|
|
$label = join(",", @accs);
|
|
}
|
|
#$text .= "\t$id->$next_id [label=$edge_weight, color=\"$color\"];\n";
|
|
$text .= "\t$id->$next_id [label=\"$label\", color=\"$color\"];\n";
|
|
|
|
}
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
$text .= "}\n";
|
|
|
|
return($text);
|
|
}
|
|
|
|
|
|
sub get_colors_in_common {
|
|
my ($node, $next_node) = @_;
|
|
|
|
my @colorsA = $node->get_colors();
|
|
my @colorsB = $next_node->get_colors();
|
|
|
|
my %counts;
|
|
|
|
foreach my $color (@colorsA, @colorsB) {
|
|
$counts{$color}++;
|
|
}
|
|
my @colors = grep { $counts{$_} > 1 } keys %counts;
|
|
|
|
return(@colors);
|
|
}
|
|
|
|
|
|
|
|
####
|
|
sub _prep_graph_for_compaction {
|
|
my $self = shift;
|
|
|
|
my @nodes = $self->get_all_nodes();
|
|
|
|
my %new_node_set; ## change lookup so it's not based on sequence anymore.
|
|
|
|
foreach my $node (@nodes) {
|
|
$new_node_set{"$node"} = $node;
|
|
}
|
|
|
|
$self->{_nodes} = \%new_node_set;
|
|
}
|
|
|
|
|
|
####
|
|
sub delete_node_from_graph {
|
|
my $self = shift;
|
|
my ($node) = @_;
|
|
|
|
$self->prune_nodes_from_graph($node);
|
|
|
|
if ($self->{compacted}) {
|
|
|
|
delete $self->{_nodes}->{$node};
|
|
|
|
}
|
|
else {
|
|
my $kmer = $node->get_sequence();
|
|
delete $self->{_nodes}->{"$kmer"};
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
####
|
|
sub purge_nodes_below_count {
|
|
my $self = shift;
|
|
|
|
my ($min_count) = @_;
|
|
|
|
my @nodes = $self->get_all_nodes();
|
|
|
|
my $num_deleted = 0;
|
|
|
|
foreach my $node (@nodes) {
|
|
|
|
if ($node->get_count() < $min_count) {
|
|
|
|
my @colors = $node->get_colors();
|
|
unless (scalar(@colors) == 1 && $colors[0] eq "black") {
|
|
next;
|
|
## skipping base data (reads) below coverage limit
|
|
}
|
|
|
|
$self->delete_node_from_graph($node);
|
|
|
|
$num_deleted++;
|
|
}
|
|
}
|
|
|
|
print STDERR " -purged nodes below count($min_count), deleted $num_deleted nodes.\n";
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
####
|
|
sub validate_graph {
|
|
my $self = shift;
|
|
|
|
print STDERR "Validating graph.\n";
|
|
|
|
my @nodes = $self->get_all_nodes();
|
|
|
|
my %node_ids_in_graph;
|
|
|
|
foreach my $node (@nodes) {
|
|
|
|
my $node_id = $node->get_ID();
|
|
$node_ids_in_graph{$node_id} = $node;
|
|
}
|
|
|
|
foreach my $node (@nodes) {
|
|
|
|
my $id = $node->get_ID();
|
|
|
|
my @prev_nodes = $node->get_all_prev_nodes();
|
|
foreach my $prev_node (@prev_nodes) {
|
|
my $prev_id = $prev_node->get_ID();
|
|
|
|
unless (exists ($node_ids_in_graph{$prev_id})) {
|
|
print STDERR "Error, $prev_id exists as prev_node to $id, but not in graph\n";
|
|
print STDERR $prev_node->get_sequence() . "\n" . $prev_node->toString();
|
|
confess;
|
|
}
|
|
|
|
unless ($prev_node->has_next_node($node)) {
|
|
confess "Error, prev_node: $prev_id lacks current next node $id as link.\n";
|
|
}
|
|
|
|
|
|
}
|
|
|
|
my @next_nodes = $node->get_all_next_nodes();
|
|
foreach my $next_node (@next_nodes) {
|
|
my $next_id = $next_node->get_ID();
|
|
|
|
unless (exists ($node_ids_in_graph{$next_id})) {
|
|
print STDERR "Error, $next_id exists as next_node to $id, but not in graph\n";
|
|
print STDERR $next_node->get_sequence() . "\n" . $next_node->toString();
|
|
confess;
|
|
}
|
|
|
|
unless ($next_node->has_prev_node($node)) {
|
|
confess "Error, next node: $next_id lacks current node $id as prev link.\n";
|
|
}
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
####
|
|
sub print_path {
|
|
my $self = shift;
|
|
my @nodes = @_;
|
|
|
|
my $counter = 0;
|
|
my $prev_node;
|
|
|
|
foreach my $node (@nodes) {
|
|
$counter++;
|
|
printf("%4s", $counter);
|
|
|
|
my $edge_count = "";
|
|
if ($prev_node) {
|
|
$edge_count = " edge_weight: " . $self->get_edge_count($prev_node, $node);
|
|
}
|
|
|
|
print " " . $node->get_value() . " C:" . $node->get_count() . " " . join(",", $node->get_colors()) . "$edge_count\n";
|
|
|
|
$prev_node = $node;
|
|
|
|
}
|
|
print "\n";
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
sub _compatible_decorations {
|
|
my ($nodeA, $nodeB) = @_;
|
|
|
|
|
|
my @colorsA = $nodeA->get_colors();
|
|
my @colorsB = $nodeB->get_colors();
|
|
|
|
unless (@colorsA || @colorsB) {
|
|
# no decorations.
|
|
return(1);
|
|
}
|
|
|
|
my %counts;
|
|
foreach my $color (@colorsA, @colorsB) {
|
|
$counts{$color}++;
|
|
}
|
|
|
|
my @colors_unique = grep { $counts{$_} == 1 } keys %counts;
|
|
|
|
if (@colors_unique) {
|
|
#print STDERR "unique colors: @colors_unique\n";
|
|
return(0);
|
|
}
|
|
|
|
else {
|
|
return(1);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
####
|
|
sub score_nodes {
|
|
my $self = shift;
|
|
|
|
$self->assign_depth_to_nodes();
|
|
|
|
## score = sum prev counts for those reads that are consistent, excluding those that are from bifurcating reads.
|
|
|
|
|
|
my @nodes = sort {$a->{depth}<=>$b->{depth}} $self->get_all_nodes();
|
|
|
|
## assign base scores:
|
|
foreach my $node (@nodes) {
|
|
|
|
## define base scores: contributions by reads that are not in prev or next nodes.
|
|
my %neighboring_reads = $self->gather_read_indices($node->get_all_prev_nodes(), $node->get_all_next_nodes());
|
|
|
|
my $base_score = 0;
|
|
|
|
my $read_tracker = $node->get_ReadTracker();
|
|
my @reads = $read_tracker->get_tracked_read_indices();
|
|
foreach my $read (@reads) {
|
|
if ($node->{depth} == 0 || ! $neighboring_reads{$read}) {
|
|
my $count = $read_tracker->get_read_base_count($read);
|
|
|
|
$base_score += $count;
|
|
#print "$node has read [$read] with count [$count]\n";
|
|
|
|
}
|
|
}
|
|
|
|
$node->{_base_score} = $base_score;
|
|
$node->{_sum_score} = $base_score; # init
|
|
#print "$node : base score = $base_score\n";
|
|
}
|
|
|
|
|
|
## assign sum scores
|
|
foreach my $node (@nodes) {
|
|
my %indices_exclude;
|
|
|
|
my $read_tracker = $node->get_ReadTracker();
|
|
|
|
my $highest_prev_score = $node->{_sum_score};
|
|
my $highest_prev_node = undef;
|
|
|
|
foreach my $prev_node ($node->get_all_prev_nodes()) {
|
|
## want reads that are in current node and prev node, but not other next nodes of prev node (exclusive to current node).
|
|
my $sum_score = $prev_node->{_sum_score};
|
|
|
|
my @other_prev_next_nodes = grep { $_ ne $node } $prev_node->get_all_next_nodes();
|
|
|
|
my %reads_ignore = $self->gather_read_indices(@other_prev_next_nodes);
|
|
|
|
foreach my $read ($read_tracker->get_tracked_read_indices()) {
|
|
|
|
if (! $reads_ignore{$read}) {
|
|
|
|
$sum_score += $read_tracker->get_read_base_count($read);
|
|
}
|
|
}
|
|
|
|
push (@{$prev_node->{_forward_scores}}, { next_node => $node,
|
|
score => $sum_score,
|
|
} );
|
|
|
|
|
|
if ($sum_score > $highest_prev_score) {
|
|
$highest_prev_score = $sum_score;
|
|
$highest_prev_node = $prev_node;
|
|
}
|
|
}
|
|
|
|
$node->{_sum_score} = $highest_prev_score;
|
|
$node->{_best_prev} = $highest_prev_node;
|
|
}
|
|
|
|
|
|
## find highest sum score
|
|
my $highest_sum_score = 0;
|
|
my $best_node = undef;
|
|
|
|
foreach my $node (@nodes) {
|
|
#print "Node: $node, has sum score: " . $node->{_sum_score} . "\n";
|
|
if ($node->{_sum_score} > $highest_sum_score) {
|
|
$highest_sum_score = $node->{_sum_score};
|
|
$best_node = $node;
|
|
}
|
|
}
|
|
|
|
|
|
my @path_nodes;
|
|
my $prev_node = $best_node;
|
|
while ($prev_node) {
|
|
#print "Backtracked: $prev_node " . $prev_node->{_sum_score} . "\n";
|
|
#print $prev_node->toString();
|
|
push (@path_nodes, $prev_node);
|
|
$prev_node = $prev_node->{_best_prev};
|
|
|
|
}
|
|
|
|
@path_nodes = reverse @path_nodes;
|
|
|
|
return (@path_nodes);
|
|
|
|
|
|
}
|
|
|
|
|
|
####
|
|
sub extract_sequence_from_path {
|
|
my $self = shift;
|
|
my (@ordered_nodes) = @_; # ordered left to right
|
|
|
|
my @seqs;
|
|
|
|
foreach my $node (@ordered_nodes) {
|
|
|
|
$node->{_visited} = 1;
|
|
|
|
push (@seqs, $node->get_sequence());
|
|
|
|
}
|
|
|
|
|
|
my $kmer_length = $self->{KmerLength};
|
|
|
|
my $final_seq = shift @seqs; ## first one gets full kmer treatment.
|
|
|
|
foreach my $seq (@seqs) {
|
|
$seq = substr($seq, $kmer_length-1);
|
|
|
|
$final_seq .= $seq;
|
|
}
|
|
|
|
|
|
return($final_seq);
|
|
}
|
|
|
|
|
|
|
|
####
|
|
sub gather_read_indices {
|
|
my $self = shift;
|
|
my (@nodes) = @_;
|
|
|
|
my %indices;
|
|
|
|
foreach my $node (@nodes) {
|
|
|
|
my $read_tracker = $node->get_ReadTracker();
|
|
|
|
my @read_indices = $read_tracker->get_tracked_read_indices();
|
|
|
|
foreach my $read (@read_indices) {
|
|
$indices{$read}=1;
|
|
}
|
|
}
|
|
|
|
return(%indices);
|
|
}
|
|
|
|
|
|
####
|
|
sub assign_depth_to_nodes {
|
|
my $self = shift;
|
|
print STDERR "Assigning depth to nodes\n";
|
|
|
|
my @root_nodes = $self->get_root_nodes();
|
|
|
|
## base cases.
|
|
foreach my $root (@root_nodes) {
|
|
$root->{depth} = 0;
|
|
}
|
|
|
|
foreach my $node ($self->get_all_nodes()) {
|
|
if (! defined ($node->{depth})) {
|
|
$self->assign_depth($node, {});
|
|
}
|
|
}
|
|
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
####
|
|
sub assign_depth {
|
|
my $self = shift;
|
|
my $node = shift;
|
|
my $seen_href = shift;
|
|
|
|
my @ancestors = $node->get_all_prev_nodes();
|
|
|
|
my $max_depth = 0;
|
|
|
|
foreach my $ancestor (@ancestors) {
|
|
|
|
my $depth;
|
|
if (defined($ancestor->{depth})) {
|
|
$depth = 1 + $ancestor->{depth};
|
|
}
|
|
else {
|
|
|
|
if ($seen_href->{$ancestor}) {
|
|
# print "Breaking cycle: $ancestor\n" . join("\n", keys %$seen_href) . "\n";
|
|
next;
|
|
}
|
|
$seen_href->{$ancestor} = 1;
|
|
|
|
$depth = 1 + $self->assign_depth($ancestor, $seen_href);
|
|
}
|
|
if ($depth > $max_depth) {
|
|
$max_depth = $depth;
|
|
}
|
|
}
|
|
|
|
# print "$node\tdepth: $max_depth\n";
|
|
$node->{depth} = $max_depth;
|
|
return($max_depth);
|
|
}
|
|
|
|
|
|
####
|
|
sub all_nodes_visited {
|
|
my $self = shift;
|
|
|
|
my @nodes = $self->get_all_nodes();
|
|
|
|
foreach my $node (@nodes) {
|
|
|
|
if (! $node->{_visited}) {
|
|
return(0);
|
|
}
|
|
}
|
|
|
|
return(1); # all visited.
|
|
}
|
|
|
|
####
|
|
sub extract_path_from_unvisited_node {
|
|
my $self = shift;
|
|
|
|
my @nodes = $self->get_all_nodes();
|
|
|
|
my $highest_scoring_unvisited_node = undef;
|
|
my $highest_score = -1;
|
|
|
|
foreach my $node (@nodes) {
|
|
if (! $node->{_visited}) {
|
|
if ($node->{_base_score} > $highest_score) {
|
|
$highest_score = $node->{_base_score};
|
|
$highest_scoring_unvisited_node = $node;
|
|
}
|
|
}
|
|
}
|
|
|
|
unless ($highest_scoring_unvisited_node) {
|
|
confess "Error, no unvisited node found";
|
|
}
|
|
|
|
my @best_path;
|
|
|
|
my %seen; # don't follow cycles. FIXME: cycles shouldn't be part of the scoring, though... bug somewhere upstream?
|
|
|
|
my $prev_node = $highest_scoring_unvisited_node;
|
|
while ($prev_node && ! $prev_node->{_visited} && ! $seen{$prev_node}) {
|
|
print "walking prev_node: $prev_node\n" if $main::SEE;
|
|
$seen{$prev_node} = 1;
|
|
unshift(@best_path, $prev_node);
|
|
$prev_node = $prev_node->{_best_prev};
|
|
}
|
|
|
|
## now track forward.
|
|
my $next_node = $highest_scoring_unvisited_node;
|
|
delete $seen{$next_node}; # already used above.
|
|
while ($next_node && ! $next_node->{_visited} && ! $seen{$next_node}) {
|
|
print "walking next_node: $next_node\n" if $main::SEE;
|
|
$seen{$next_node} = 1;
|
|
my @structs = @{$next_node->{_forward_scores}};
|
|
if (@structs) {
|
|
@structs = sort {$a->{score}<=>$b->{score}} @structs;
|
|
my $best_struct = pop @structs;
|
|
push (@best_path, $next_node);
|
|
$next_node = $best_struct->{next_node};
|
|
}
|
|
else {
|
|
$next_node = undef;
|
|
}
|
|
}
|
|
|
|
return(@best_path);
|
|
}
|
|
|
|
|
|
|
|
####
|
|
sub prune_low_weight_edges {
|
|
my $self = shift;
|
|
my %params = @_;
|
|
|
|
my $edge_weight_threshold = $params{edge_weight_threshold};
|
|
|
|
|
|
my @nodes = $self->get_all_nodes();
|
|
|
|
my @edges_to_prune;
|
|
|
|
foreach my $node (@nodes) {
|
|
my @prev_nodes = $node->get_all_prev_nodes();
|
|
|
|
foreach my $prev_node (@prev_nodes) {
|
|
|
|
my $edge_ratio = $self->compute_edge_support_ratio($prev_node, $node);
|
|
|
|
#print "EdgeRatio: $edge_ratio\n";
|
|
|
|
if ($edge_ratio < $edge_weight_threshold) {
|
|
push (@edges_to_prune, [$prev_node, $node]);
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
if (@edges_to_prune) {
|
|
|
|
foreach my $edges_to_prune (@edges_to_prune) {
|
|
|
|
my ($prev_node, $node) = @$edges_to_prune;
|
|
$self->prune_edge($prev_node, $node);
|
|
}
|
|
|
|
return(scalar(@edges_to_prune));
|
|
}
|
|
else {
|
|
return(0);
|
|
}
|
|
|
|
}
|
|
|
|
####
|
|
sub prune_singletons {
|
|
my $self = shift;
|
|
|
|
my ($min_seq_length) = @_;
|
|
|
|
my @nodes = $self->get_all_nodes();
|
|
|
|
foreach my $node (@nodes) {
|
|
|
|
if ( (! $node->get_all_prev_nodes())
|
|
&&
|
|
(! $node->get_all_next_nodes()) ) {
|
|
|
|
if (length ($node->get_sequence()) < $min_seq_length) {
|
|
|
|
$self->delete_node_from_graph($node);
|
|
}
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
####
|
|
sub compute_edge_support_ratio {
|
|
my $self = shift;
|
|
my ($prev_node, $node) = @_;
|
|
|
|
## ratio = reads in common / (all reads out of prev UNION all reads into node)
|
|
|
|
my @reads_exiting_prev_node = $prev_node->get_reads_exiting_node();
|
|
my %exiting_reads = map { + $_ => 1 } @reads_exiting_prev_node;
|
|
|
|
my @reads_entering_node = $node->get_reads_entering_node();
|
|
my %entering_reads = map { + $_ => 1 } @reads_entering_node;
|
|
|
|
my %all_reads = map { + $_ => 1 } (@reads_exiting_prev_node, @reads_entering_node);
|
|
|
|
my @all = keys %all_reads;
|
|
my @shared;
|
|
|
|
foreach my $read (@all) {
|
|
if ($entering_reads{$read} && $exiting_reads{$read}) {
|
|
push (@shared, $read);
|
|
}
|
|
}
|
|
|
|
my $ratio = scalar(@shared) / scalar(@all);
|
|
|
|
return($ratio);
|
|
}
|
|
|
|
|
|
|
|
####
|
|
sub prune_dangling_nodes {
|
|
my $self = shift;
|
|
my %params = @_;
|
|
|
|
my $min_leaf_node_length = $params{min_leaf_node_length};
|
|
my $min_leaf_node_avg_cov = $params{min_leaf_node_avg_cov};
|
|
|
|
unless (defined ($min_leaf_node_length)
|
|
&&
|
|
defined ($min_leaf_node_avg_cov) ) {
|
|
|
|
confess "Error, must define values for both: min_leaf_node_length && min_leaf_node_avg_cov";
|
|
}
|
|
|
|
my @nodes = $self->get_all_nodes();
|
|
|
|
my @nodes_to_prune;
|
|
|
|
my $kmer_length = $self->{KmerLength};
|
|
|
|
foreach my $node (@nodes) {
|
|
|
|
## check to see if it's a dangling node.
|
|
if ( (! $node->get_all_prev_nodes())
|
|
||
|
|
(! $node->get_all_next_nodes())
|
|
) {
|
|
|
|
my $length = length($node->get_sequence());
|
|
|
|
my $cov = $node->get_count();
|
|
|
|
my $avg_cov = $cov / ($length - ($kmer_length - 1));
|
|
|
|
if ($avg_cov < $min_leaf_node_avg_cov && $length < $min_leaf_node_length) {
|
|
push (@nodes_to_prune, $node);
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
if (@nodes_to_prune) {
|
|
|
|
foreach my $node (@nodes_to_prune) {
|
|
|
|
$self->prune_nodes_from_graph($node);
|
|
}
|
|
|
|
return(scalar(@nodes_to_prune)); # nodes pruned.
|
|
}
|
|
|
|
else {
|
|
return(0); # no nodes pruned.
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
1; #EOM
|
|
|