384 lines
8.7 KiB
Perl
384 lines
8.7 KiB
Perl
#!/usr/bin/env perl
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
use FindBin;
|
|
use lib ("$FindBin::RealBin/../../PerlLib", "$FindBin::RealBin/../../PerlLib/KmerGraphLib");
|
|
|
|
use Fasta_reader;
|
|
|
|
use ColorGradient;
|
|
|
|
use KmerNode;
|
|
use KmerGraph;
|
|
use Carp;
|
|
|
|
use Getopt::Long qw(:config no_ignore_case bundling);
|
|
|
|
no warnings 'recursion';
|
|
|
|
# Help text. It is emitted through die() when -h is given, when option
# parsing fails, or when the required option combination is missing.
my $usage = <<_EOUSAGE_;

#####################################################################################
#
# Required:
#
# --reads <string> fasta files containing reads (comma-separated list of files: fileA.reads,fileB.reads,...
# basic black coloring in graph.)
# and/or
#
# --misc_seqs <string> fasta files containing reads (comma-separated list of files: fileA.reads,fileB.reads,...)
# each sequence is colored differently and identified in the graph @ edges
# and:
# --assemble
# and/or
# --graph [filename] dot output file for viewing structure in GraphViz
#
# Optional:
# -K Kmer length (default: 24)
# -R maximum recursive search depth from terminus for extension (default: 5) # note can exponentially increases runtime
#
#
# -L minimum assembled sequence length to report (default: 100)
# -E minimum Kmer seed entropy (default: 1.5; note maximum entropy = 2 for nucleotide sequences)
# --min_coverage minimum kmer coverage to report kmer from reads in graph.
#
#
# -v verbose
#
# --no_compact do NOT compact the graph
#
# --min_edge_threshold default(0.1)
# --min_leaf_node_length default(76)
# --min_leaf_node_avg_cov default(1.2)
#
# -N max assemblies to report (default: infinity)
#
#####################################################################################

_EOUSAGE_

    ;


# ---- option defaults (overridden by GetOptions below) ----

my $help_flag;

my $fasta_file;
my $KmerLength = 24;            # -K

my $max_recurse_depth = 5;      # -R

my $min_length = 100;           # -L
my $min_cov = 1;                # --min_coverage; purging only happens when > 1

our $VERBOSE = 0;               # -v; package global (NOTE(review): presumably read by the library modules - confirm)

my $MIN_KMER_ENTROPY = 1.5;     # -E
my $graph_file;                 # --graph
my $assemble_flag;              # --assemble
my $cds_file;                   # --CDS   (accepted, not listed in the usage text)
my $iworm_file;                 # --iworm (accepted, not listed in the usage text)

my $MIN_EDGE_THRESHOLD = 0.1;       # --min_edge_threshold
my $MIN_LEAF_NODE_LENGTH = 76;      # --min_leaf_node_length
my $MIN_LEAF_NODE_AVG_COV = 1.2;    # --min_leaf_node_avg_cov

my $NO_COMPACT = 0;             # --no_compact
my $max_asmbls_report = -1;     # -N; values <= 0 mean "no limit"

my $reads_files;                # --reads, comma-separated list
my $misc_seqs_files;            # --misc_seqs, comma-separated list

# GetOptions returns false when it meets an unknown option or a value of the
# wrong type (it has already warned on STDERR by then).  Stop with the usage
# text instead of silently running with defaults.
GetOptions(
    'h' => \$help_flag,

    ## required
    'misc_seqs=s' => \$misc_seqs_files,
    'reads=s'     => \$reads_files,

    'K=i' => \$KmerLength,
    'R=i' => \$max_recurse_depth,

    ## options
    'L=i'            => \$min_length,
    'E=f'            => \$MIN_KMER_ENTROPY,
    "min_coverage=i" => \$min_cov,
    "graph=s"        => \$graph_file,
    "assemble"       => \$assemble_flag,
    "iworm=s"        => \$iworm_file,

    "CDS=s" => \$cds_file,

    "min_edge_threshold=f"    => \$MIN_EDGE_THRESHOLD,
    "min_leaf_node_length=i"  => \$MIN_LEAF_NODE_LENGTH,
    "min_leaf_node_avg_cov=f" => \$MIN_LEAF_NODE_AVG_COV,

    "no_compact" => \$NO_COMPACT,
    'N=i'        => \$max_asmbls_report,

    # hidden option for debugging
    'v' => \$VERBOSE,

) or die $usage;

if ($help_flag) {
    die $usage;
}

# Need at least one sequence source AND at least one requested output.
unless ( ($reads_files || $misc_seqs_files) && ($graph_file || $assemble_flag)) {
    die $usage;
}


# Package global mirroring the verbosity setting.
our $SEE = $VERBOSE;
|
|
|
|
|
|
# Script entry point.
#
#   1. Build a KmerGraph from the --reads files (colored 'black') and/or the
#      --misc_seqs files (one color per sequence).
#   2. Optionally purge low-coverage nodes (--min_coverage > 1).
#   3. Unless --no_compact, repeatedly compact the graph and prune dangling
#      nodes / low-weight edges.
#   4. With --assemble, print assembled sequences to STDOUT in FASTA format.
#   5. With --graph, write the graph in GraphViz dot format.
#
# Progress messages go to STDERR.  Always ends with exit(0) unless an error
# is raised first.
main: {

    my $KmerGraph = KmerGraph->new($KmerLength);

    ## Reads: every sequence is added, whatever its length.
    if ($reads_files) {
        my $seq_counter = 0;

        foreach my $file (split(/,/, $reads_files)) {
            my $fasta_reader = Fasta_reader->new($file);

            while (my $seq_obj = $fasta_reader->next()) {
                my $acc      = $seq_obj->get_accession();
                my $sequence = $seq_obj->get_sequence();

                $seq_counter++;

                print STDERR "\r-parsing seq($seq_counter) ";

                $KmerGraph->add_sequence_to_graph($acc, $sequence, 1, 'black');
            }
        }
    }

    ## Misc sequences: only those at least $min_length long are traced.
    if ($misc_seqs_files) {

        my @seqs;

        foreach my $file (split(/,/, $misc_seqs_files)) {
            my $fr = Fasta_reader->new($file);

            while (my $seq_obj = $fr->next()) {
                my $seq = $seq_obj->get_sequence();
                my $acc = $seq_obj->get_accession();

                if (length($seq) >= $min_length) {
                    push (@seqs, [$acc, $seq]);
                }
            }
        }

        # One color per sequence; a lone sequence is simply drawn green.
        my @colors;
        if (scalar @seqs > 1) {
            @colors = ColorGradient::get_RGB_gradient(scalar @seqs);
            @colors = ColorGradient::convert_RGB_hex(@colors);
        }
        else {
            @colors = ('green');
        }

        while (@seqs) {
            my ($acc, $seq) = @{ shift @seqs };
            my $color = shift @colors;

            print STDERR "Adding additional seq trace: $acc, $color\n";
            $KmerGraph->add_sequence_to_graph($acc, $seq, 0, $color);
        }
    }

    if ($min_cov > 1) {
        print STDERR "\n-removing kmer nodes below coverage: $min_cov\n";

        $KmerGraph->purge_nodes_below_count($min_cov);
    }

    print STDERR "\nDone adding sequences\n";

    ## Compaction: keep cycling while the last round both compacted the
    ## graph and pruned at least one node or edge.
    unless ($NO_COMPACT) {

        my $round = 0;

        my ($compacted_graph_flag, $number_dangling_nodes_pruned, $number_pruned_edges);

        do {

            $compacted_graph_flag = $KmerGraph->compact_graph();

            $number_dangling_nodes_pruned = $KmerGraph->prune_dangling_nodes(
                min_leaf_node_length  => $MIN_LEAF_NODE_LENGTH,
                min_leaf_node_avg_cov => $MIN_LEAF_NODE_AVG_COV);

            $number_pruned_edges = $KmerGraph->prune_low_weight_edges(edge_weight_threshold => $MIN_EDGE_THRESHOLD);

            $round++;

            print STDERR "Cycling through graph compaction, round: $round\n";
            print STDERR "\tnodes_pruned: $number_dangling_nodes_pruned\n"
                . "\tedges_pruned: $number_pruned_edges\n";

        } while ($compacted_graph_flag && ($number_dangling_nodes_pruned || $number_pruned_edges));

    }

    print STDERR "done.\n";

    ## Assembly: first the path returned by score_nodes(), then further paths
    ## seeded from unvisited nodes.
    if ($assemble_flag) {

        my @path = $KmerGraph->score_nodes();

        my $assembled_seq = $KmerGraph->extract_sequence_from_path(@path);

        unless ($assembled_seq) {
            confess "Error, no assembled sequence form path: @path";
        }

        my $score = sum_node_counts(@path);

        # Number of assemblies actually printed (those passing the -L filter).
        my $counter = 0;
        if (length($assembled_seq) >= $min_length) {
            $counter++;

            my $length_normalized_score = sprintf("%.2f", $score / length($assembled_seq));

            print ">assembly_$counter;$score;$length_normalized_score len: " . length($assembled_seq) . "\n$assembled_seq\n";
        }

        while (! $KmerGraph->all_nodes_visited()) {

            # -N caps the number of reported assemblies; <= 0 means no cap.
            if ($max_asmbls_report > 0 && $counter >= $max_asmbls_report) {
                last;
            }

            # The first path (@path above) is what gets passed on every
            # iteration; the result is kept in its own variable so that this
            # is explicit rather than hidden behind a shadowing declaration.
            my @next_path = $KmerGraph->extract_path_from_unvisited_node(@path);

            my $next_seq = $KmerGraph->extract_sequence_from_path(@next_path); # marks as visited.

            unless ($next_seq) {
                confess "Error, no sequence extracted from path: @next_path";
            }

            if (length($next_seq) >= $min_length) {
                $counter++;

                my $next_score = sum_node_counts(@next_path);
                my $length_normalized_score = sprintf("%.2f", $next_score / length($next_seq));

                print ">assembly_$counter;$next_score;$length_normalized_score len: " . length($next_seq) . "\n$next_seq\n";
            }
        }
    }

    ## Write graph file in dot format for GraphViz
    if ($graph_file) {
        # Three-arg open: the file name can never be interpreted as a mode.
        open(my $ofh, '>', $graph_file)
            or die "Error, cannot write graph file to $graph_file: $!";
        print {$ofh} $KmerGraph->toGraphViz( no_short_singletons => 100);
        # Buffered write errors only surface at close time.
        close($ofh)
            or die "Error, failed closing graph file $graph_file: $!";
    }

    exit(0);
}
|
|
|
|
|
|
####
|
|
# Adds up get_count() over every node object passed in.
#
# Params : list of node objects, each responding to ->get_count()
# Returns: the sum of the counts; 0 for an empty list
sub sum_node_counts {
    my $total = 0;

    for my $kmer_node (@_) {
        $total += $kmer_node->get_count();
    }

    return $total;
}
|
|
|
|
|
|
####
|
|
# Shannon entropy (in bits) of the character composition of a string.
#
# Params : $string - text whose individual characters are tallied
# Returns: sum over distinct characters of p * log2(1/p), where p is the
#          character's frequency in the string; 0 for an empty string
sub compute_entropy {
    my ($string) = @_;

    # Occurrences of each distinct character.
    my %tally;
    for my $symbol (split(//, $string)) {
        $tally{$symbol}++;
    }

    my $total_len = length($string);
    my $bits      = 0;

    for my $symbol (keys %tally) {
        my $freq = $tally{$symbol} / $total_len;
        $bits += $freq * log(1/$freq)/log(2);
    }

    return $bits;
}
|
|
|
|
|
|
####
|
|
# Returns the sequence with the highest score in a FASTA file, where
#     score = (last ';'-separated field of the accession) * sequence length.
#
# Params : $iworm_file - FASTA file name handed to Fasta_reader
#          (NOTE(review): the last accession field is presumably a coverage
#          value - confirm against the producer of this file)
# Returns: the best-scoring sequence string, or undef if the file yields no
#          entries.  Among equal scores the entry read last is returned.
#
# Only the running best entry is retained, so memory use does not grow with
# the number of sequences and no sort is needed.
sub get_highest_scoring_iworm_assembly {
    my ($iworm_file) = @_;

    my $fasta_reader = Fasta_reader->new($iworm_file);

    my ($best_score, $best_seq);

    while (my $seq_obj = $fasta_reader->next()) {
        my $seq = $seq_obj->get_sequence();
        my $acc = $seq_obj->get_accession();

        # The value used for scoring is the final ';'-delimited accession field.
        my @parts = split(/;/, $acc);
        my $cov = pop @parts;

        my $score = $cov * length($seq);

        if (! defined($best_score) || $score >= $best_score) {
            ($best_score, $best_seq) = ($score, $seq);
        }
    }

    return($best_seq);
}
|