biyelunwen/99.scripts/trinity_utils/util/misc/SAM_show_alignment.pl

239 lines
7.4 KiB
Perl

#!/usr/bin/env perl
use strict;
use warnings;
use lib ($ENV{EUK_MODULES});
use Fasta_reader;
use SAM_reader;
use SAM_entry;
use Data::Dumper;
my $FASTA_LENGTH = 60;
my $usage = "usage: $0 alignments.sam target.fasta [JUST_ALIGN_STATS=0]\n\n";
my $sam_alignments = $ARGV[0] or die $usage;
my $target_fasta = $ARGV[1] or die $usage;
my $JUST_STATS = $ARGV[2] || 0;
main: {
my $fasta_reader = new Fasta_reader($target_fasta);
my %fasta_seqs = $fasta_reader->retrieve_all_seqs_hash();
my $sam_reader = new SAM_reader($sam_alignments);
my $curr_scaff_acc = "";
my $curr_scaff_seq = "";
while (my $sam_entry = $sam_reader->get_next() ) {
my $read_name = $sam_entry->reconstruct_full_read_name();
my $read_seq = uc $sam_entry->get_sequence();
my $scaff_name = $sam_entry->get_scaffold_name();
if ($scaff_name eq "*") { next; } # not aligned.
if ($scaff_name ne $curr_scaff_acc) {
$curr_scaff_acc = $scaff_name;
$curr_scaff_seq = uc $fasta_seqs{$scaff_name};
}
my ($genome_coords_aref, $read_coords_aref) = $sam_entry->get_alignment_coords();
unless (scalar(@$genome_coords_aref) >= 1) { next; }
unless ($JUST_STATS) {
## coordinate dump.
print "// $scaff_name (top) vs. $read_name (bottom):\n\n";
print "$scaff_name length: " . length($curr_scaff_seq) . "\n";
print "$read_name length: " . length($read_seq) . "\n";
print "$scaff_name\t$read_name\n";
for (my $i = 0; $i <= $#$genome_coords_aref; $i++) {
my ($genome_lend, $genome_rend) = @{$genome_coords_aref->[$i]};
my ($read_lend, $read_rend) = @{$read_coords_aref->[$i]};
print "$genome_lend-$genome_rend\t$read_lend-$read_rend\n";
}
print "\n";
}
eval {
&draw_alignment($scaff_name, $read_name, $genome_coords_aref, $read_coords_aref, \$curr_scaff_seq, \$read_seq);
};
if ($@) {
print STDERR "$@\n";
}
}
exit(0);
}
####
sub draw_alignment {
my ($scaff_name, $read_name, $genome_coords_aref, $read_coords_aref, $curr_scaff_seq_sref, $read_seq_sref) = @_;
my $scaff_align_string = "";
my $read_align_string = "";
my $prev_scaff_coord = 0;
my $prev_read_coord = 0;
my $num_indels = scalar(@$genome_coords_aref) - 1;
my $num_aligned_bases = 0;
while (@$genome_coords_aref) {
my $genome_coordset = shift @$genome_coords_aref;
my $read_coordset = shift @$read_coords_aref;
my ($genome_lend, $genome_rend) = @$genome_coordset;
my ($read_lend, $read_rend) = @$read_coordset;
## sometimes bwasw is off at the end of the contig
if ($genome_rend > length($$curr_scaff_seq_sref)) {
my $delta = $genome_rend - length($$curr_scaff_seq_sref);
$genome_rend -= $delta;
$read_rend -= $delta;
# print STDERR "*adjusting by $delta\n";
}
$num_aligned_bases += $read_rend - $read_lend + 1;
if ($prev_read_coord > 0 && $read_lend - $prev_read_coord > 1) {
## add gaps to read string
for (my $i = $prev_read_coord + 1; $i < $read_lend; $i++) {
$read_align_string .= substr($$read_seq_sref, $i-1, 1);
$scaff_align_string .= "-";;
}
}
if ($prev_scaff_coord > 0 && $genome_lend - $prev_scaff_coord > 1) {
# add gaps to genome string
for (my $i = $prev_scaff_coord + 1; $i < $genome_lend; $i++) {
$scaff_align_string .= substr($$curr_scaff_seq_sref, $i-1, 1);
$read_align_string .= "-";
}
}
$prev_read_coord = $read_rend;
$prev_scaff_coord = $genome_rend;
my $read_seq_region_len = $read_rend - $read_lend + 1;
my $genome_seq_region_len = $genome_rend - $genome_lend + 1;
my $read_seq_region = substr($$read_seq_sref, $read_lend - 1, $read_rend - $read_lend + 1);
my $genome_seq_region = substr($$curr_scaff_seq_sref, $genome_lend - 1, $genome_rend - $genome_lend + 1);
if (length($read_seq_region) != length($genome_seq_region)) {
die "Error, lengths of regions are different: ($read_seq_region_len vs. $genome_seq_region_len) extracted:\n"
. "read_seq_region:\t" . $read_seq_region . "\n"
. "genome_seq_regn:\t" . $genome_seq_region . "\n";
}
$scaff_align_string .= $genome_seq_region;
$read_align_string .= $read_seq_region;
}
my $alignment_length = length($scaff_align_string);
my $num_gaps = 0;
while ($scaff_align_string =~ /-/g) { $num_gaps++; }
while ($read_align_string =~ /-/g) { $num_gaps++; }
my $percent_gap = $num_gaps / $alignment_length * 100;
eval {
my ($num_mismatches) = &print_pretty($scaff_align_string, $read_align_string);
print join("\t", "#", "scaff_name", "read_name", "read_length", "aligned_bases", "matches", "mismatches", "indel_bkpts", "sum_indel_lens", "pct_mismatches", "pct_indel_bkpts", "pct_indel_lens") . "\n";
print join("\t", "#", $scaff_name, $read_name, length($$read_seq_sref),
$num_aligned_bases, $num_aligned_bases - $num_mismatches, $num_mismatches, $num_indels, $num_gaps,
sprintf("%.2f", $num_mismatches/$num_aligned_bases*100),
sprintf("%.2f", $num_indels/$num_aligned_bases*100),
sprintf("%.2f", $percent_gap),
) . "\n\n\n";
};
if ($@) {
print STDERR "$@\n";
die;
}
return;
}
####
sub print_pretty {
my ($scaff_align_string, $read_align_string) = @_;
if (length($scaff_align_string) != length($read_align_string) ) {
die "Error, alignment lengths differ.:\n"
. "Scaff_align_string: " . length($scaff_align_string) . "\t$scaff_align_string\n"
. "Read_align_string: " . length($read_align_string) . "\t$read_align_string\n";
}
my @scaff_align_chars = split(//, $scaff_align_string);
my @read_align_chars = split(//, $read_align_string);
my $top_text = "";
my $match_text = "";
my $bottom_text = "";
my $num_mismatches = 0;
for (my $i = 0; $i <= $#scaff_align_chars; $i++) {
if ($i != 0 && $i % 60 == 0) {
print join("\n", $top_text, $match_text, $bottom_text) . "\n\n" unless $JUST_STATS;
$top_text = ""; $match_text = ""; $bottom_text = "";
}
my $match_char = ".";
if ($scaff_align_chars[$i] ne "-" && $read_align_chars[$i] ne "-") {
$match_char = ($scaff_align_chars[$i] eq $read_align_chars[$i]) ? "|" : "*";
if ($match_char eq "*") {
$num_mismatches++;
}
}
$top_text .= $scaff_align_chars[$i];
$bottom_text .= $read_align_chars[$i];
$match_text .= $match_char;
}
if ($top_text) {
print join("\n", $top_text, $match_text, $bottom_text) . "\n\n" unless $JUST_STATS;
}
return ($num_mismatches);
}