251 lines
7.8 KiB
Perl
251 lines
7.8 KiB
Perl
#!/usr/bin/env perl
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
use FindBin;
|
|
use lib ("$FindBin::RealBin/../PerlLib");
|
|
use Fasta_reader;
|
|
use Data::Dumper;
|
|
|
|
=ExampleCommands
|
|
|
|
# make blastable
|
|
makeblastdb \
|
|
-in refTranscripts.fasta\
|
|
-out refTranscripts -dbtype nucl
|
|
|
|
# run blast+
|
|
blastn -query Trinity.fasta -db refTranscripts -out blastn.fmt6.txt \
|
|
-evalue 1e-20 -dust no -task megablast -num_threads 2 -max_target_seqs 1 -outfmt 6
|
|
|
|
# analyze results
|
|
analyze_blastPlus_topHit_coverage.pl blastn.fmt6.txt refTranscripts.fasta Trinity.fasta
|
|
|
|
=cut
|
|
|
|
;
|
|
|
|
|
|
my $usage = "usage: $0 blast+.outfmt6.txt query.fasta search_db.fasta [output_prefix=NameOfBlastFileHere] [verbose=0]\n\n";
|
|
|
|
my $blast_out = $ARGV[0] or die $usage;
|
|
my $fasta_file_A = $ARGV[1] or die $usage;
|
|
my $fasta_file_B = $ARGV[2] or die $usage; # the fasta files don't have to be in any special order.
|
|
my $output_prefix = $ARGV[3] || "$blast_out";
|
|
my $verbose = $ARGV[4] || 0;
|
|
|
|
main: {
|
|
|
|
|
|
my $counter = 0;
|
|
|
|
my %query_to_top_hit; # only storing the hit with the greatest blast score.
|
|
|
|
# outfmt6:
|
|
# qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
|
|
|
|
print STDERR "-parsing blast output: $blast_out\n" if $verbose;
|
|
open (my $fh, $blast_out) or die "Error, cannot open file $blast_out";
|
|
while (<$fh>) {
|
|
chomp;
|
|
my $line = $_;
|
|
my @x = split(/\t/);
|
|
my $query_id = $x[0];
|
|
my $db_id = $x[1];
|
|
my $percent_id = $x[2];
|
|
my $query_start = $x[6];
|
|
my $query_end = $x[7];
|
|
my $db_start = $x[8];
|
|
my $db_end = $x[9];
|
|
|
|
my $evalue = $x[10];
|
|
my $bitscore = $x[11];
|
|
|
|
if ( (! exists $query_to_top_hit{$query_id}) || ($bitscore > $query_to_top_hit{$query_id}->{bitscore}) ) {
|
|
|
|
$query_to_top_hit{$query_id} = { query_id => $query_id,
|
|
db_id => $db_id,
|
|
percent_id => $percent_id,
|
|
query_start => $query_start,
|
|
query_end => $query_end,
|
|
db_start => $db_start,
|
|
db_end => $db_end,
|
|
evalue => $evalue,
|
|
bitscore => $bitscore,
|
|
|
|
query_match_len => abs($query_end - $query_start) + 1,
|
|
db_match_len => abs($db_end - $db_start) + 1,
|
|
|
|
line => $line,
|
|
};
|
|
|
|
}
|
|
|
|
$counter++;
|
|
if ($counter % 100 == 0) {
|
|
print STDERR "\r[$counter] " if $verbose;
|
|
}
|
|
|
|
|
|
}
|
|
close $fh;
|
|
$counter = 0;
|
|
print STDERR "\n" if $verbose;
|
|
|
|
## identify those entries we need sequence length info for.
|
|
my %seq_lengths;
|
|
my %seq_headers;
|
|
{
|
|
foreach my $entry (values %query_to_top_hit) {
|
|
my $query_id = $entry->{query_id};
|
|
my $db_id = $entry->{db_id};
|
|
|
|
$seq_lengths{$query_id} = undef;
|
|
$seq_lengths{$db_id} = undef;
|
|
}
|
|
|
|
## get sequence length info
|
|
foreach my $fasta_file ($fasta_file_A, $fasta_file_B) {
|
|
|
|
print STDERR "-parsing seq length info from file: $fasta_file\n" if $verbose;
|
|
|
|
my $fasta_reader = new Fasta_reader($fasta_file);
|
|
|
|
while (my $seq_obj = $fasta_reader->next()) {
|
|
|
|
$counter++;
|
|
if ($counter % 100 == 0) {
|
|
print STDERR "\r[$counter] " if $verbose;
|
|
}
|
|
|
|
|
|
my $acc = $seq_obj->get_accession();
|
|
if (exists $seq_lengths{$acc}) {
|
|
|
|
my $sequence = $seq_obj->get_sequence();
|
|
$seq_lengths{$acc} = length($sequence);
|
|
|
|
my $header = $seq_obj->get_header();
|
|
# remove the accession
|
|
my @header_pieces = split(/\s+/, $header);
|
|
shift @header_pieces;
|
|
$header = join(" ", @header_pieces);
|
|
$seq_headers{$acc} = $header;
|
|
}
|
|
}
|
|
|
|
$counter = 0;
|
|
print STDERR "\n" if $verbose;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
## analyze the results.
|
|
## make this hit-centric, only retain the longest-coverage query hit for each database sequence.
|
|
|
|
print STDERR "-analyzing hits.\n" if $verbose;
|
|
my %db_id_to_greatest_pct_cov; # ties broken by bitscore
|
|
{
|
|
|
|
|
|
open (my $ofh, ">$output_prefix.w_pct_hit_length") or die $!;
|
|
|
|
print $ofh join("\t", "#qseqid", "sseqid", "pident", "length", "mismatch",
|
|
"gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore",
|
|
"db_hit_len", "pct_hit_len_aligned", "hit_descr") . "\n";
|
|
|
|
foreach my $entry (values %query_to_top_hit) {
|
|
|
|
my $db_id = $entry->{db_id};
|
|
my $db_match_len = $entry->{db_match_len};
|
|
my $db_seq_len = $seq_lengths{$db_id} or die "Error, no length found for $db_id, with hit: " . Dumper($entry);
|
|
|
|
my $percent_length_matched = sprintf("%.2f", $db_match_len / $db_seq_len * 100);
|
|
|
|
my $line = $entry->{line};
|
|
my $header = $seq_headers{$db_id};
|
|
|
|
print $ofh join("\t", $line, $db_match_len, $percent_length_matched, $header) . "\n";
|
|
|
|
$entry->{db_hit_pct_cov} = $percent_length_matched;
|
|
|
|
|
|
if ( ! exists $db_id_to_greatest_pct_cov{$db_id} ) {
|
|
$db_id_to_greatest_pct_cov{$db_id} = $entry;
|
|
}
|
|
else {
|
|
my $prev_entry = $db_id_to_greatest_pct_cov{$db_id};
|
|
if ($percent_length_matched > $prev_entry->{db_hit_pct_cov}
|
|
||
|
|
($percent_length_matched == $prev_entry->{db_hit_pct_cov}
|
|
&&
|
|
$entry->{bitscore} > $prev_entry->{bitscore})
|
|
) {
|
|
$db_id_to_greatest_pct_cov{$db_id} = $entry;
|
|
}
|
|
}
|
|
|
|
$counter++;
|
|
if ($counter % 100 == 0) {
|
|
print STDERR "\r[$counter] " if $verbose;
|
|
}
|
|
|
|
}
|
|
close $ofh;
|
|
|
|
}
|
|
|
|
$counter = 0;
|
|
print STDERR "\n" if $verbose;
|
|
|
|
## histogram summary
|
|
|
|
my @bins = qw(10 20 30 40 50 60 70 80 90 100);
|
|
my %bin_counts;
|
|
|
|
open (my $ofh, ">$output_prefix.hist") or die "Error, cannot write to $output_prefix.hist";
|
|
open (my $list_ofh, ">$output_prefix.hist.list") or die $!;
|
|
{
|
|
|
|
|
|
foreach my $entry (values %db_id_to_greatest_pct_cov) {
|
|
|
|
my $pct_cov = $entry->{db_hit_pct_cov};
|
|
|
|
my $prev_bin = 0;
|
|
foreach my $bin (@bins) {
|
|
if ($pct_cov > $prev_bin && $pct_cov <= $bin) {
|
|
$bin_counts{$bin}++;
|
|
print $list_ofh join("\t", "Bin_$bin", $entry->{line}) . "\n";
|
|
}
|
|
$prev_bin = $bin;
|
|
}
|
|
|
|
|
|
}
|
|
}
|
|
close $list_ofh;
|
|
|
|
## Report counts per bin
|
|
print "#hit_pct_cov_bin\tcount_in_bin\t>bin_below\n";
|
|
print $ofh "#hit_pct_cov_bin\tcount_in_bin\t>bin_below\n";
|
|
|
|
my $cumul = 0;
|
|
foreach my $bin (reverse(@bins)) {
|
|
my $count = $bin_counts{$bin} || 0;
|
|
$cumul += $count;
|
|
print join("\t", $bin, $count, $cumul) . "\n";
|
|
print $ofh join("\t", $bin, $count, $cumul) . "\n";
|
|
|
|
}
|
|
close $ofh;
|
|
|
|
|
|
exit(0);
|
|
|
|
|
|
}
|