131 lines
3.5 KiB
Perl
131 lines
3.5 KiB
Perl
#!/usr/bin/env perl
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
use FindBin;
|
|
use lib ("$FindBin::RealBin/../../PerlLib");
|
|
|
|
use SAM_reader;
|
|
use SAM_entry;
|
|
use Fasta_reader;
|
|
|
|
my $usage = "usage: $0 coord_sorted.sam genome.fa [SS_lib_type]\n\n";
|
|
|
|
my $sam_file = $ARGV[0] or die $usage;
|
|
my $genome_fa = $ARGV[1] or die $usage;
|
|
my $SS_lib_type = $ARGV[2];
|
|
|
|
main: {
|
|
|
|
my $sam_reader = new SAM_reader($sam_file);
|
|
|
|
my $fasta_reader = new Fasta_reader($genome_fa);
|
|
my %genome_seqs = $fasta_reader->retrieve_all_seqs_hash();
|
|
|
|
|
|
my $genome_acc = "";
|
|
my $genome_seq = "";
|
|
|
|
|
|
|
|
while ($sam_reader->has_next()) {
|
|
|
|
my $num_introns = 0;
|
|
|
|
my $sam_entry = $sam_reader->get_next();
|
|
|
|
my $read_name = $sam_entry->reconstruct_full_read_name();
|
|
|
|
my $cigar = $sam_entry->get_cigar_alignment();
|
|
my $scaffold = $sam_entry->get_scaffold_name();
|
|
|
|
if ($scaffold eq "*") {
|
|
# unaligned read
|
|
next;
|
|
}
|
|
|
|
|
|
#print "$scaffold\t$cigar\n";
|
|
|
|
unless ($genome_acc eq $scaffold) {
|
|
$genome_acc = $scaffold;
|
|
$genome_seq = $genome_seq = $genome_seqs{$genome_acc} or die "Error, no genome seq for acc: $genome_acc";
|
|
}
|
|
|
|
my ($genome_coords_aref, $query_coords_aref) = $sam_entry->get_alignment_coords();
|
|
|
|
my $strand = $sam_entry->get_query_strand();
|
|
|
|
my $transcribed_strand = "?";
|
|
if ($SS_lib_type) {
|
|
$transcribed_strand = $sam_entry->get_query_transcribed_strand($SS_lib_type);
|
|
}
|
|
|
|
my $report_txt = "";
|
|
|
|
if (scalar @$genome_coords_aref > 1) {
|
|
|
|
my @genome_coords = @$genome_coords_aref;
|
|
## get the introns:
|
|
|
|
for (my $i = 1; $i <= $#genome_coords; $i++) {
|
|
my $prev_coordset = $genome_coords[$i-1];
|
|
my $curr_coordset = $genome_coords[$i];
|
|
|
|
my ($a_lend, $a_rend) = @$prev_coordset;
|
|
my ($b_lend, $b_rend) = @$curr_coordset;
|
|
|
|
my $intron_lend = $a_rend + 1;
|
|
my $intron_rend = $b_lend - 1;
|
|
|
|
my $intron_length = $b_lend - $a_rend - 1;
|
|
if ($intron_length < 0) {
|
|
die "Error, intron length invalid";
|
|
}
|
|
|
|
if ($intron_length >= 20) {
|
|
my $intron_seq = substr($genome_seq, $a_rend +1 -1, $intron_length);
|
|
|
|
my $left_intron = substr($intron_seq, 0, 2);
|
|
my $right_intron = substr($intron_seq, -2);
|
|
$num_introns++;
|
|
|
|
my $intron_dinucs = "$left_intron..$right_intron";
|
|
if ($transcribed_strand eq "?") {
|
|
|
|
## make an educated guess
|
|
if ($intron_dinucs eq "GT..AG"
|
|
||
|
|
$intron_dinucs eq "GC..AG"
|
|
||
|
|
$intron_dinucs eq "AT..AC") {
|
|
$transcribed_strand = "+";
|
|
}
|
|
elsif ($intron_dinucs eq "CT..AC"
|
|
||
|
|
$intron_dinucs eq "CT..GC"
|
|
||
|
|
$intron_dinucs eq "GT..AT") {
|
|
$transcribed_strand = '-';
|
|
}
|
|
}
|
|
|
|
|
|
$report_txt .= join("\t", $read_name,
|
|
"$scaffold",
|
|
"$intron_lend-$intron_rend",
|
|
"$transcribed_strand",
|
|
"$intron_dinucs") . "\n";
|
|
}
|
|
}
|
|
if ($num_introns >= 1) {
|
|
print "$report_txt\n"; # spacer between SAM entries
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
exit(0);
|
|
}
|