biyelunwen/99.scripts/trinity_utils/util/analyze_blastPlus_topHit_co...

251 lines
7.8 KiB
Perl

#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
use lib ("$FindBin::RealBin/../PerlLib");
use Fasta_reader;
use Data::Dumper;
=ExampleCommands
# make blastable
makeblastdb \
-in refTranscripts.fasta\
-out refTranscripts -dbtype nucl
# run blast+
blastn -query Trinity.fasta -db refTranscripts -out blastn.fmt6.txt \
-evalue 1e-20 -dust no -task megablast -num_threads 2 -max_target_seqs 1 -outfmt 6
# analyze results
analyze_blastPlus_topHit_coverage.pl blastn.fmt6.txt refTranscripts.fasta Trinity.fasta
=cut
;
my $usage = "usage: $0 blast+.outfmt6.txt query.fasta search_db.fasta [output_prefix=NameOfBlastFileHere] [verbose=0]\n\n";
my $blast_out = $ARGV[0] or die $usage;
my $fasta_file_A = $ARGV[1] or die $usage;
my $fasta_file_B = $ARGV[2] or die $usage; # the fasta files don't have to be in any special order.
my $output_prefix = $ARGV[3] || "$blast_out";
my $verbose = $ARGV[4] || 0;
main: {
my $counter = 0;
my %query_to_top_hit; # only storing the hit with the greatest blast score.
# outfmt6:
# qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
print STDERR "-parsing blast output: $blast_out\n" if $verbose;
open (my $fh, $blast_out) or die "Error, cannot open file $blast_out";
while (<$fh>) {
chomp;
my $line = $_;
my @x = split(/\t/);
my $query_id = $x[0];
my $db_id = $x[1];
my $percent_id = $x[2];
my $query_start = $x[6];
my $query_end = $x[7];
my $db_start = $x[8];
my $db_end = $x[9];
my $evalue = $x[10];
my $bitscore = $x[11];
if ( (! exists $query_to_top_hit{$query_id}) || ($bitscore > $query_to_top_hit{$query_id}->{bitscore}) ) {
$query_to_top_hit{$query_id} = { query_id => $query_id,
db_id => $db_id,
percent_id => $percent_id,
query_start => $query_start,
query_end => $query_end,
db_start => $db_start,
db_end => $db_end,
evalue => $evalue,
bitscore => $bitscore,
query_match_len => abs($query_end - $query_start) + 1,
db_match_len => abs($db_end - $db_start) + 1,
line => $line,
};
}
$counter++;
if ($counter % 100 == 0) {
print STDERR "\r[$counter] " if $verbose;
}
}
close $fh;
$counter = 0;
print STDERR "\n" if $verbose;
## identify those entries we need sequence length info for.
my %seq_lengths;
my %seq_headers;
{
foreach my $entry (values %query_to_top_hit) {
my $query_id = $entry->{query_id};
my $db_id = $entry->{db_id};
$seq_lengths{$query_id} = undef;
$seq_lengths{$db_id} = undef;
}
## get sequence length info
foreach my $fasta_file ($fasta_file_A, $fasta_file_B) {
print STDERR "-parsing seq length info from file: $fasta_file\n" if $verbose;
my $fasta_reader = new Fasta_reader($fasta_file);
while (my $seq_obj = $fasta_reader->next()) {
$counter++;
if ($counter % 100 == 0) {
print STDERR "\r[$counter] " if $verbose;
}
my $acc = $seq_obj->get_accession();
if (exists $seq_lengths{$acc}) {
my $sequence = $seq_obj->get_sequence();
$seq_lengths{$acc} = length($sequence);
my $header = $seq_obj->get_header();
# remove the accession
my @header_pieces = split(/\s+/, $header);
shift @header_pieces;
$header = join(" ", @header_pieces);
$seq_headers{$acc} = $header;
}
}
$counter = 0;
print STDERR "\n" if $verbose;
}
}
## analyze the results.
## make this hit-centric, only retain the longest-coverage query hit for each database sequence.
print STDERR "-analyzing hits.\n" if $verbose;
my %db_id_to_greatest_pct_cov; # ties broken by bitscore
{
open (my $ofh, ">$output_prefix.w_pct_hit_length") or die $!;
print $ofh join("\t", "#qseqid", "sseqid", "pident", "length", "mismatch",
"gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore",
"db_hit_len", "pct_hit_len_aligned", "hit_descr") . "\n";
foreach my $entry (values %query_to_top_hit) {
my $db_id = $entry->{db_id};
my $db_match_len = $entry->{db_match_len};
my $db_seq_len = $seq_lengths{$db_id} or die "Error, no length found for $db_id, with hit: " . Dumper($entry);
my $percent_length_matched = sprintf("%.2f", $db_match_len / $db_seq_len * 100);
my $line = $entry->{line};
my $header = $seq_headers{$db_id};
print $ofh join("\t", $line, $db_match_len, $percent_length_matched, $header) . "\n";
$entry->{db_hit_pct_cov} = $percent_length_matched;
if ( ! exists $db_id_to_greatest_pct_cov{$db_id} ) {
$db_id_to_greatest_pct_cov{$db_id} = $entry;
}
else {
my $prev_entry = $db_id_to_greatest_pct_cov{$db_id};
if ($percent_length_matched > $prev_entry->{db_hit_pct_cov}
||
($percent_length_matched == $prev_entry->{db_hit_pct_cov}
&&
$entry->{bitscore} > $prev_entry->{bitscore})
) {
$db_id_to_greatest_pct_cov{$db_id} = $entry;
}
}
$counter++;
if ($counter % 100 == 0) {
print STDERR "\r[$counter] " if $verbose;
}
}
close $ofh;
}
$counter = 0;
print STDERR "\n" if $verbose;
## histogram summary
my @bins = qw(10 20 30 40 50 60 70 80 90 100);
my %bin_counts;
open (my $ofh, ">$output_prefix.hist") or die "Error, cannot write to $output_prefix.hist";
open (my $list_ofh, ">$output_prefix.hist.list") or die $!;
{
foreach my $entry (values %db_id_to_greatest_pct_cov) {
my $pct_cov = $entry->{db_hit_pct_cov};
my $prev_bin = 0;
foreach my $bin (@bins) {
if ($pct_cov > $prev_bin && $pct_cov <= $bin) {
$bin_counts{$bin}++;
print $list_ofh join("\t", "Bin_$bin", $entry->{line}) . "\n";
}
$prev_bin = $bin;
}
}
}
close $list_ofh;
## Report counts per bin
print "#hit_pct_cov_bin\tcount_in_bin\t>bin_below\n";
print $ofh "#hit_pct_cov_bin\tcount_in_bin\t>bin_below\n";
my $cumul = 0;
foreach my $bin (reverse(@bins)) {
my $count = $bin_counts{$bin} || 0;
$cumul += $count;
print join("\t", $bin, $count, $cumul) . "\n";
print $ofh join("\t", $bin, $count, $cumul) . "\n";
}
close $ofh;
exit(0);
}