190 lines
4.8 KiB
Perl
190 lines
4.8 KiB
Perl
#!/usr/bin/env perl
|
|
|
|
use strict;
|
|
use warnings;
|
|
use FindBin;
|
|
use lib "$FindBin::RealBin/../../PerlLib";
|
|
use Fasta_reader;
|
|
use List::Util qw(min max);
|
|
use Overlap_piler;
|
|
use Data::Dumper;
|
|
|
|
my $usage = "\n\n\tusage: $0 blast.outfmt6 query_fasta target_fasta\n\n\n";
|
|
|
|
my $blast_file = $ARGV[0] or die $usage;
|
|
my $query_fasta = $ARGV[1] or die $usage;
|
|
my $target_fasta = $ARGV[2] or die $usage;
|
|
|
|
|
|
my %query_seq_lens = &get_seq_lengths($query_fasta);
|
|
|
|
my %target_seq_lens;
|
|
if ($query_fasta eq $target_fasta) {
|
|
%target_seq_lens = %query_seq_lens;
|
|
}
|
|
else {
|
|
%target_seq_lens = &get_seq_lengths($target_fasta);
|
|
}
|
|
|
|
|
|
|
|
my $MAX_MISSING = 10;
|
|
my $COUNT_MISSING = 0;
|
|
|
|
main: {
|
|
|
|
# header
|
|
print join("\t", "#query_acc", "target_acc", "avg_per_id", "min_Evalue",
|
|
"query_match_range", "target_match_range", "pct_query_len", "pct_target_len", "max_pct_len") . "\n";
|
|
|
|
|
|
my @hits;
|
|
my $prev_query_target_pair = "";
|
|
|
|
open (my $fh, $blast_file) or die "Error, cannot open file $blast_file";
|
|
while (<$fh>) {
|
|
chomp;
|
|
my @x = split(/\t/);
|
|
my $query_acc = $x[0];
|
|
my $target_acc = $x[1];
|
|
|
|
my $query_target_pair = join("$;", $query_acc, $target_acc);
|
|
if ($query_target_pair ne $prev_query_target_pair) {
|
|
&process_hits(@hits) if @hits;
|
|
@hits = ();
|
|
}
|
|
push (@hits, [@x]);
|
|
$prev_query_target_pair = $query_target_pair;
|
|
}
|
|
|
|
# get last one
|
|
&process_hits(@hits);
|
|
|
|
exit(0);
|
|
}
|
|
|
|
####
|
|
sub process_hits {
|
|
my @hits = @_;
|
|
|
|
#print Dumper(\@hits);
|
|
|
|
my @query_coords;
|
|
my @target_coords;
|
|
|
|
my $query_acc = "";
|
|
my $target_acc = "";
|
|
|
|
my $sum_pct_id_len = 0;
|
|
my $sum_len = 0;
|
|
|
|
my $min_evalue = 1;
|
|
|
|
foreach my $hit (@hits) {
|
|
my @fields = @$hit;
|
|
|
|
unless ($query_acc) {
|
|
$query_acc = $fields[0];
|
|
$target_acc = $fields[1];
|
|
}
|
|
|
|
my ($query_lend, $query_rend) = sort {$a<=>$b} ($fields[6], $fields[7]);
|
|
my ($target_lend, $target_rend) = sort {$a<=>$b} ($fields[8], $fields[9]);
|
|
|
|
my $per_id = $fields[2];
|
|
my $query_seg_len = $query_rend - $query_lend + 1;
|
|
$sum_pct_id_len += $query_seg_len * $per_id;
|
|
$sum_len += $query_seg_len;
|
|
|
|
my $evalue = $fields[10];
|
|
if ($evalue < $min_evalue) {
|
|
$min_evalue = $evalue;
|
|
}
|
|
|
|
push (@query_coords, [$query_lend, $query_rend]);
|
|
push (@target_coords, [$target_lend, $target_rend]);
|
|
|
|
}
|
|
|
|
my $avg_per_id = sprintf("%.2f", $sum_pct_id_len / $sum_len);
|
|
|
|
|
|
my $query_match_len = 0;
|
|
my @query_match_regions;
|
|
{
|
|
|
|
my @query_piles = &Overlap_piler::simple_coordsets_collapser(@query_coords);
|
|
foreach my $pile (@query_piles) {
|
|
my ($pile_lend, $pile_rend) = @$pile;
|
|
$query_match_len += $pile_rend - $pile_lend + 1;
|
|
push (@query_match_regions, "$pile_lend-$pile_rend");
|
|
}
|
|
}
|
|
|
|
my $target_match_len = 0;
|
|
my @target_match_regions;
|
|
{
|
|
my @target_piles = &Overlap_piler::simple_coordsets_collapser(@target_coords);
|
|
foreach my $pile (@target_piles) {
|
|
my ($pile_lend, $pile_rend) = @$pile;
|
|
$target_match_len += $pile_rend - $pile_lend + 1;
|
|
push (@target_match_regions, "$pile_lend-$pile_rend");
|
|
}
|
|
}
|
|
|
|
my $query_len = $query_seq_lens{$query_acc};
|
|
my $target_len = $target_seq_lens{$target_acc};
|
|
|
|
if ( ! defined ($query_len)) {
|
|
print STDERR "Error, missing length for query: [$query_acc]\n";
|
|
$COUNT_MISSING++;
|
|
if ($COUNT_MISSING > $MAX_MISSING) {
|
|
die "Error, too many missing seq length entries encountered\n";
|
|
}
|
|
return;
|
|
}
|
|
|
|
if (! defined($target_len)) {
|
|
print STDERR "Error, missing length for db hit: [$target_acc]\n";
|
|
$COUNT_MISSING++;
|
|
if ($COUNT_MISSING > $MAX_MISSING) {
|
|
die "Error, too many missing seq length entries encountered\n";
|
|
}
|
|
return;
|
|
}
|
|
|
|
|
|
my $pct_query_len = sprintf("%.2f", $query_match_len / $query_len * 100);
|
|
my $pct_target_len = sprintf("%.2f", $target_match_len / $target_len * 100);
|
|
|
|
|
|
print join("\t", $query_acc, $target_acc, $avg_per_id, $min_evalue,
|
|
join(",", @query_match_regions), join(",", @target_match_regions),
|
|
$pct_query_len, $pct_target_len, max($pct_query_len, $pct_target_len) ) . "\n";
|
|
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
####
|
|
sub get_seq_lengths {
|
|
my ($fasta_file) = @_;
|
|
|
|
my %seq_lens;
|
|
|
|
my $fasta_reader = new Fasta_reader($fasta_file);
|
|
while (my $seq_obj = $fasta_reader->next()) {
|
|
|
|
my $acc = $seq_obj->get_accession();
|
|
my $seq_len = length($seq_obj->get_sequence());
|
|
|
|
# print STDERR "$acc => $seq_len\n";
|
|
|
|
$seq_lens{$acc} = $seq_len;
|
|
}
|
|
|
|
return(%seq_lens);
|
|
}
|
|
|