630 lines
11 KiB
Perl
630 lines
11 KiB
Perl
package SAM_entry;
|
|
|
|
use strict;
|
|
use warnings;
|
|
use Carp;
|
|
|
|
|
|
|
|
sub new {
|
|
my $packagename = shift;
|
|
my ($line) = @_;
|
|
|
|
unless (defined $line) {
|
|
confess "Error, need sam text line as parameter";
|
|
}
|
|
|
|
chomp $line;
|
|
|
|
my @fields = split(/\t/, $line);
|
|
|
|
my $self = {
|
|
_line => $line,
|
|
_fields => [@fields],
|
|
};
|
|
|
|
bless ($self, $packagename);
|
|
|
|
return($self);
|
|
}
|
|
|
|
|
|
####
|
|
sub get_original_line {
|
|
my $self = shift;
|
|
return($self->{_line});
|
|
}
|
|
|
|
####
|
|
sub get_fields {
|
|
my $self = shift;
|
|
return (@{$self->{_fields}});
|
|
}
|
|
|
|
|
|
####
|
|
sub get_read_name {
|
|
my $self = shift;
|
|
return ($self->{_fields}->[0]);
|
|
}
|
|
|
|
####
|
|
sub reconstruct_full_read_name {
|
|
my $self = shift;
|
|
|
|
my $read_name = $self->get_core_read_name();
|
|
|
|
if ($self->is_first_in_pair()) {
|
|
$read_name .= "/1";
|
|
}
|
|
elsif ($self->is_second_in_pair()) {
|
|
$read_name .= "/2";
|
|
}
|
|
|
|
return($read_name);
|
|
}
|
|
|
|
|
|
####
|
|
sub get_core_read_name {
|
|
my $self = shift;
|
|
my $read_name = $self->get_read_name();
|
|
my $core_read_name = $read_name;
|
|
|
|
$core_read_name =~ s|/\d$||;
|
|
|
|
return($core_read_name);
|
|
}
|
|
|
|
|
|
|
|
####
|
|
sub get_scaffold_name {
|
|
my $self = shift;
|
|
return($self->{_fields}->[2]);
|
|
}
|
|
|
|
####
|
|
sub get_aligned_position {
|
|
my $self = shift;
|
|
return($self->{_fields}->[3]);
|
|
}
|
|
|
|
sub get_scaffold_position { # preferred
|
|
my $self = shift;
|
|
return($self->get_aligned_position());
|
|
}
|
|
|
|
|
|
sub get_scaffold_start_position {
|
|
my $self = shift;
|
|
my ($lend, $rend) = $self->get_genome_span();
|
|
|
|
my $strand = $self->get_query_strand();
|
|
if ($strand eq '+') {
|
|
return($lend);
|
|
}
|
|
else {
|
|
return($rend);
|
|
}
|
|
}
|
|
|
|
|
|
####
|
|
sub get_read_group {
|
|
my $self = shift;
|
|
my $line = $self->get_original_line();
|
|
|
|
if ($line =~ /RG:Z:(\S+)/) {
|
|
return($1);
|
|
}
|
|
else {
|
|
return undef;
|
|
}
|
|
}
|
|
|
|
|
|
####
|
|
sub get_cigar_alignment {
|
|
my $self = shift;
|
|
return($self->{_fields}->[5]);
|
|
}
|
|
|
|
###
|
|
sub get_genome_span {
|
|
my $self = shift;
|
|
my ($genome_aref, $read_aref) = $self->get_alignment_coords();
|
|
|
|
my @coords;
|
|
foreach my $genome_coordset (@$genome_aref) {
|
|
push (@coords, @$genome_coordset);
|
|
}
|
|
|
|
@coords = sort {$a<=>$b} @coords;
|
|
|
|
my $min_coord = shift @coords;
|
|
my $max_coord = pop @coords;
|
|
|
|
return($min_coord, $max_coord);
|
|
}
|
|
|
|
####
|
|
sub get_read_span {
|
|
my $self = shift;
|
|
|
|
my ($genome_aref, $read_aref) = $self->get_alignment_coords();
|
|
|
|
my @coords;
|
|
foreach my $read_coordset (@$read_aref) {
|
|
push (@coords, @$read_coordset);
|
|
}
|
|
|
|
@coords = sort {$a<=>$b} @coords;
|
|
|
|
my $min_coord = shift @coords;
|
|
my $max_coord = pop @coords;
|
|
|
|
return($min_coord, $max_coord);
|
|
|
|
}
|
|
|
|
####
|
|
sub get_alignment_length {
|
|
my $self = shift;
|
|
|
|
my ($genome_coords_aref, $read_coords_aref) = $self->get_alignment_coords();
|
|
|
|
my $sum_len = 0;
|
|
|
|
my @genome_coords = @$genome_coords_aref;
|
|
foreach my $coords (@genome_coords) {
|
|
my ($genome_lend, $genome_rend) = @$coords;
|
|
|
|
$sum_len += abs($genome_rend - $genome_lend) + 1;
|
|
}
|
|
|
|
return($sum_len);
|
|
}
|
|
|
|
|
|
####
|
|
sub get_alignment_coords {
|
|
my $self = shift;
|
|
|
|
my $genome_lend = $self->get_aligned_position();
|
|
|
|
my $alignment = $self->get_cigar_alignment();
|
|
|
|
my $query_lend = 0;
|
|
|
|
my @genome_coords;
|
|
my @query_coords;
|
|
|
|
|
|
my $sum_hardmasked_query = 0;
|
|
|
|
$genome_lend--; # move pointer just before first position.
|
|
|
|
while ($alignment =~ /(\d+)([A-Z])/g) {
|
|
my $len = $1;
|
|
my $code = $2;
|
|
|
|
unless ($code =~ /^[MSDNIH]$/) {
|
|
confess "Error, cannot parse cigar code [$code] " . $self->toString();
|
|
}
|
|
|
|
# print "parsed $len,$code\n";
|
|
|
|
if ($code eq 'M') { # aligned bases match or mismatch
|
|
|
|
my $genome_rend = $genome_lend + $len;
|
|
my $query_rend = $query_lend + $len;
|
|
|
|
push (@genome_coords, [$genome_lend+1, $genome_rend]);
|
|
push (@query_coords, [$query_lend+1, $query_rend]);
|
|
|
|
# reset coord pointers
|
|
$genome_lend = $genome_rend;
|
|
$query_lend = $query_rend;
|
|
|
|
}
|
|
elsif ($code eq 'D' || $code eq 'N') { # insertion in the genome or gap in query (intron, perhaps)
|
|
$genome_lend += $len;
|
|
|
|
}
|
|
|
|
elsif ($code eq 'I' # gap in genome or insertion in query
|
|
||
|
|
$code eq 'S' || $code eq 'H') # masked region of query
|
|
{
|
|
$query_lend += $len;
|
|
|
|
if ($code eq "H") {
|
|
$sum_hardmasked_query += $len;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
## see if reverse strand alignment - if so, must revcomp the read matching coordinates.
|
|
if ($self->get_query_strand() eq '-') {
|
|
|
|
my $read_len = length($self->get_sequence());
|
|
unless ($read_len) {
|
|
confess "Error, no read length obtained from entry: " . $self->get_original_line();
|
|
}
|
|
$read_len += $sum_hardmasked_query;
|
|
|
|
my @revcomp_coords;
|
|
foreach my $coordset (@query_coords) {
|
|
my ($lend, $rend) = @$coordset;
|
|
|
|
my $new_lend = $read_len - $lend + 1;
|
|
my $new_rend = $read_len - $rend + 1;
|
|
|
|
push (@revcomp_coords, [$new_lend, $new_rend]);
|
|
}
|
|
|
|
@query_coords = @revcomp_coords;
|
|
|
|
}
|
|
|
|
|
|
|
|
return(\@genome_coords, \@query_coords);
|
|
}
|
|
|
|
|
|
####
|
|
sub get_mate_scaffold_name {
|
|
my $self = shift;
|
|
|
|
return($self->{_fields}->[6]);
|
|
}
|
|
|
|
|
|
####
|
|
sub set_mate_scaffold_name {
|
|
my $self = shift;
|
|
my $mate_scaffold_name = shift;
|
|
|
|
$self->{_fields}->[6] = $mate_scaffold_name;
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
####
|
|
sub get_mate_scaffold_position {
|
|
my $self = shift;
|
|
|
|
return($self->{_fields}->[7]);
|
|
}
|
|
|
|
|
|
####
|
|
sub set_mate_scaffold_position {
|
|
my $self = shift;
|
|
my $scaff_pos = shift;
|
|
|
|
$self->{_fields}->[7] = $scaff_pos;
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
####
|
|
sub toString {
|
|
my $self = shift;
|
|
my @fields = @{$self->{_fields}};
|
|
|
|
if ($self->is_paired()) {
|
|
$fields[0] = $self->get_core_read_name();
|
|
}
|
|
|
|
return( join("\t", @fields));
|
|
}
|
|
|
|
|
|
####
|
|
sub get_mapping_quality {
|
|
my $self = shift;
|
|
return($self->{_fields}->[4]);
|
|
}
|
|
|
|
|
|
####
|
|
sub get_sequence {
|
|
my $self = shift;
|
|
return($self->{_fields}->[9]);
|
|
}
|
|
|
|
####
|
|
sub get_quality_scores {
|
|
my $self = shift;
|
|
return($self->{_fields}->[10]);
|
|
}
|
|
|
|
|
|
####
|
|
sub get_inferred_insert_size {
|
|
my $self = shift;
|
|
|
|
# should probably check to see if it's a paired read or not... user beware
|
|
return($self->{_fields}->[8]);
|
|
}
|
|
|
|
|
|
|
|
###################
|
|
## Flag Processing
|
|
###################
|
|
|
|
# from sam format spec:
|
|
|
|
=flag_description
|
|
|
|
Flag Description
|
|
0x0001 the read is paired in sequencing, no matter whether it is mapped in a pair
|
|
0x0002 the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1
|
|
0x0004 the query sequence itself is unmapped
|
|
0x0008 the mate is unmapped 1
|
|
0x0010 strand of the query (0 for forward; 1 for reverse strand)
|
|
0x0020 strand of the mate 1
|
|
0x0040 the read is the first read in a pair 1,2
|
|
0x0080 the read is the second read in a pair 1,2
|
|
0x0100 the alignment is not primary (a read having split hits may have multiple primary alignment records)
|
|
0x0200 the read fails platform/vendor quality checks
|
|
0x0400 the read is either a PCR duplicate or an optical duplicate
|
|
|
|
1. Flag 0x02, 0x08, 0x20, 0x40 and 0x80 are only meaningful when flag 0x01 is present.
|
|
2. If in a read pair the information on which read is the first in the pair is lost in the upstream analysis, flag 0x01 shuld
|
|
be present and 0x40 and 0x80 are both zero.
|
|
|
|
=cut
|
|
|
|
|
|
####
|
|
sub get_flag {
|
|
my $self = shift;
|
|
my $flag = $self->{_fields}->[1];
|
|
return($flag);
|
|
}
|
|
|
|
sub set_flag {
|
|
my $self = shift;
|
|
my $flag = shift;
|
|
|
|
unless (defined $flag) {
|
|
confess "Error, need flag value";
|
|
}
|
|
|
|
$self->{_fields}->[1] = $flag;
|
|
return;
|
|
}
|
|
|
|
####
|
|
sub is_paired {
|
|
my $self = shift;
|
|
return($self->_get_bit_val(0x0001));
|
|
}
|
|
|
|
sub set_paired {
|
|
my $self = shift;
|
|
my $bit_val = shift;
|
|
|
|
$self->_set_bit_val(0x0001, $bit_val);
|
|
|
|
return;
|
|
}
|
|
|
|
####
|
|
sub is_proper_pair {
|
|
my $self = shift;
|
|
return($self->_get_bit_val(0x0002));
|
|
}
|
|
|
|
sub set_proper_pair {
|
|
my $self = shift;
|
|
my $bit_val = shift;
|
|
|
|
$self->_set_bit_val(0x0002, $bit_val);
|
|
return;
|
|
}
|
|
|
|
####
|
|
sub is_query_unmapped {
|
|
my $self = shift;
|
|
return($self->_get_bit_val(0x0004));
|
|
}
|
|
|
|
sub set_query_unmapped {
|
|
my $self = shift;
|
|
my $bit_val = shift;
|
|
|
|
$self->_set_bit_val(0x0004, $bit_val);
|
|
}
|
|
|
|
|
|
####
|
|
sub is_mate_unmapped {
|
|
my $self = shift;
|
|
return($self->_get_bit_val(0x0008));
|
|
}
|
|
|
|
sub set_mate_unmapped {
|
|
my $self = shift;
|
|
my $bit_val = shift;
|
|
|
|
return($self->_set_bit_val(0x0008, $bit_val));
|
|
}
|
|
|
|
sub is_duplicate {
|
|
my ($self) = shift;
|
|
|
|
return($self->_get_bit_val(0x0400));
|
|
}
|
|
sub set_duplicate {
|
|
my $self = shift;
|
|
my $bit_val = shift;
|
|
|
|
return($self->_set_bit_val(0x0400, $bit_val));
|
|
}
|
|
|
|
|
|
####
|
|
sub get_query_strand {
|
|
my $self = shift;
|
|
|
|
my $strand = ($self->_get_bit_val(0x0010)) ? '-' : '+';
|
|
return($strand);
|
|
}
|
|
|
|
####
|
|
sub get_query_transcribed_strand {
|
|
my $self = shift;
|
|
my ($SS_lib_type) = @_;
|
|
|
|
unless ($SS_lib_type) {
|
|
confess "Error, SS_lib_type required as a parameter, possible values: RF,FR,F,R " . $self->toString();
|
|
}
|
|
|
|
|
|
my $aligned_strand = $self->get_query_strand();
|
|
my $opposite_strand = ($aligned_strand eq '+') ? '-' : '+';
|
|
|
|
my $transcribed_strand;
|
|
|
|
if (! $self->is_paired()) {
|
|
|
|
## UNPAIRED or SINGLE READS
|
|
unless ($SS_lib_type =~ /^(F|R)$/) {
|
|
confess "Error, cannot have $SS_lib_type library type with unpaired reads " . $self->toString();
|
|
}
|
|
|
|
$transcribed_strand = ($SS_lib_type eq "F") ? $aligned_strand : $opposite_strand;
|
|
}
|
|
else {
|
|
|
|
## paired RNA-Seq reads: left fragment is on the 3' end revcomped, and right fragment is at the 5' end sense strand.
|
|
|
|
unless ($SS_lib_type =~ /^(FR|RF)$/) {
|
|
confess "Error, cannot have $SS_lib_type library type with paired reads " . $self->toString();
|
|
}
|
|
|
|
if ($self->is_first_in_pair()) {
|
|
$transcribed_strand = ($SS_lib_type eq "FR") ? $aligned_strand : $opposite_strand;
|
|
}
|
|
else {
|
|
# second pair
|
|
$transcribed_strand = ($SS_lib_type eq "FR") ? $opposite_strand : $aligned_strand;
|
|
}
|
|
}
|
|
|
|
|
|
return($transcribed_strand);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sub set_query_strand {
|
|
my $self = shift;
|
|
my $strand = shift;
|
|
|
|
unless ($strand eq '+' || $strand eq '-') {
|
|
confess "Error, strand value must be [+-]";
|
|
}
|
|
|
|
my $bit_val = ($strand eq '+') ? 0 : 1;
|
|
$self->_set_bit_val(0x0010, $bit_val);
|
|
}
|
|
|
|
####
|
|
sub get_mate_strand {
|
|
my $self = shift;
|
|
|
|
my $strand = ($self->_get_bit_val(0x0020)) ? '-' : '+';
|
|
return($strand);
|
|
}
|
|
|
|
sub set_mate_strand {
|
|
my $self = shift;
|
|
my $strand = shift;
|
|
|
|
unless ($strand eq '+' || $strand eq '-') {
|
|
confess "Error, strand value must be [+-]";
|
|
}
|
|
|
|
my $bit_val = ($strand eq '+') ? 0 : 1;
|
|
$self->_set_bit_val(0x0020, $bit_val);
|
|
}
|
|
|
|
####
|
|
sub is_first_in_pair {
|
|
my $self = shift;
|
|
return($self->_get_bit_val(0x0040));
|
|
}
|
|
|
|
sub set_first_in_pair {
|
|
my $self = shift;
|
|
my $bit_val = shift;
|
|
|
|
$self->_set_bit_val(0x0040, $bit_val);
|
|
return;
|
|
}
|
|
|
|
####
|
|
sub is_second_in_pair {
|
|
my $self = shift;
|
|
return($self->_get_bit_val(0x0080));
|
|
}
|
|
|
|
|
|
sub set_second_in_pair {
|
|
my $self = shift;
|
|
my $bit_val = shift;
|
|
|
|
$self->_set_bit_val(0x0080, $bit_val);
|
|
return;
|
|
}
|
|
|
|
|
|
|
|
####
|
|
sub _get_bit_val {
|
|
my $self = shift;
|
|
my ($bit_position) = @_;
|
|
|
|
my $flag = $self->get_flag();
|
|
return($flag & $bit_position);
|
|
}
|
|
|
|
|
|
####
|
|
sub _set_bit_val {
|
|
my $self = shift;
|
|
my ($bit_position, $bit_val) = @_;
|
|
|
|
unless (defined $bit_position && defined $bit_val) {
|
|
confess "Error, need bit position and value";
|
|
}
|
|
|
|
my $flag = $self->get_flag();
|
|
|
|
if ($bit_val) {
|
|
$flag |= $bit_position;
|
|
}
|
|
else {
|
|
# erase bit
|
|
$flag &= ~$bit_position;
|
|
}
|
|
|
|
$self->set_flag($flag);
|
|
}
|
|
|
|
|
|
1; #EOM
|
|
|