#!/usr/bin/env perl use strict; use warnings; use FindBin; use lib "$FindBin::RealBin/../../PerlLib"; use Fasta_reader; use List::Util qw(min max); use Overlap_piler; use Data::Dumper; my $usage = "\n\n\tusage: $0 blast.outfmt6 query_fasta target_fasta\n\n\n"; my $blast_file = $ARGV[0] or die $usage; my $query_fasta = $ARGV[1] or die $usage; my $target_fasta = $ARGV[2] or die $usage; my %query_seq_lens = &get_seq_lengths($query_fasta); my %target_seq_lens; if ($query_fasta eq $target_fasta) { %target_seq_lens = %query_seq_lens; } else { %target_seq_lens = &get_seq_lengths($target_fasta); } my $MAX_MISSING = 10; my $COUNT_MISSING = 0; main: { # header print join("\t", "#query_acc", "target_acc", "avg_per_id", "min_Evalue", "query_match_range", "target_match_range", "pct_query_len", "pct_target_len", "max_pct_len") . "\n"; my @hits; my $prev_query_target_pair = ""; open (my $fh, $blast_file) or die "Error, cannot open file $blast_file"; while (<$fh>) { chomp; my @x = split(/\t/); my $query_acc = $x[0]; my $target_acc = $x[1]; my $query_target_pair = join("$;", $query_acc, $target_acc); if ($query_target_pair ne $prev_query_target_pair) { &process_hits(@hits) if @hits; @hits = (); } push (@hits, [@x]); $prev_query_target_pair = $query_target_pair; } # get last one &process_hits(@hits); exit(0); } #### sub process_hits { my @hits = @_; #print Dumper(\@hits); my @query_coords; my @target_coords; my $query_acc = ""; my $target_acc = ""; my $sum_pct_id_len = 0; my $sum_len = 0; my $min_evalue = 1; foreach my $hit (@hits) { my @fields = @$hit; unless ($query_acc) { $query_acc = $fields[0]; $target_acc = $fields[1]; } my ($query_lend, $query_rend) = sort {$a<=>$b} ($fields[6], $fields[7]); my ($target_lend, $target_rend) = sort {$a<=>$b} ($fields[8], $fields[9]); my $per_id = $fields[2]; my $query_seg_len = $query_rend - $query_lend + 1; $sum_pct_id_len += $query_seg_len * $per_id; $sum_len += $query_seg_len; my $evalue = $fields[10]; if ($evalue < $min_evalue) { $min_evalue = $evalue; } push (@query_coords, [$query_lend, $query_rend]); push (@target_coords, [$target_lend, $target_rend]); } my $avg_per_id = sprintf("%.2f", $sum_pct_id_len / $sum_len); my $query_match_len = 0; my @query_match_regions; { my @query_piles = &Overlap_piler::simple_coordsets_collapser(@query_coords); foreach my $pile (@query_piles) { my ($pile_lend, $pile_rend) = @$pile; $query_match_len += $pile_rend - $pile_lend + 1; push (@query_match_regions, "$pile_lend-$pile_rend"); } } my $target_match_len = 0; my @target_match_regions; { my @target_piles = &Overlap_piler::simple_coordsets_collapser(@target_coords); foreach my $pile (@target_piles) { my ($pile_lend, $pile_rend) = @$pile; $target_match_len += $pile_rend - $pile_lend + 1; push (@target_match_regions, "$pile_lend-$pile_rend"); } } my $query_len = $query_seq_lens{$query_acc}; my $target_len = $target_seq_lens{$target_acc}; if ( ! defined ($query_len)) { print STDERR "Error, missing length for query: [$query_acc]\n"; $COUNT_MISSING++; if ($COUNT_MISSING > $MAX_MISSING) { die "Error, too many missing seq length entries encountered\n"; } return; } if (! defined($target_len)) { print STDERR "Error, missing length for db hit: [$target_acc]\n"; $COUNT_MISSING++; if ($COUNT_MISSING > $MAX_MISSING) { die "Error, too many missing seq length entries encountered\n"; } return; } my $pct_query_len = sprintf("%.2f", $query_match_len / $query_len * 100); my $pct_target_len = sprintf("%.2f", $target_match_len / $target_len * 100); print join("\t", $query_acc, $target_acc, $avg_per_id, $min_evalue, join(",", @query_match_regions), join(",", @target_match_regions), $pct_query_len, $pct_target_len, max($pct_query_len, $pct_target_len) ) . "\n"; return; } #### sub get_seq_lengths { my ($fasta_file) = @_; my %seq_lens; my $fasta_reader = new Fasta_reader($fasta_file); while (my $seq_obj = $fasta_reader->next()) { my $acc = $seq_obj->get_accession(); my $seq_len = length($seq_obj->get_sequence()); # print STDERR "$acc => $seq_len\n"; $seq_lens{$acc} = $seq_len; } return(%seq_lens); }