298 lines
5.1 KiB
Perl
298 lines
5.1 KiB
Perl
package PSL_parser;
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
use Carp;
|
|
|
|
sub new {
|
|
my $packagename = shift;
|
|
|
|
my ($psl_file) = @_;
|
|
|
|
my $fh;
|
|
|
|
if (ref $psl_file eq "IO::Handle") {
|
|
$fh = $psl_file; # a filehandle not a file
|
|
}
|
|
else {
|
|
|
|
|
|
unless (-e $psl_file) {
|
|
confess "error, cannot find $psl_file";
|
|
}
|
|
open ($fh, $psl_file) or confess "Error, cannot open file $psl_file";
|
|
}
|
|
|
|
|
|
my $self = { file => $psl_file,
|
|
fh => $fh,
|
|
};
|
|
|
|
bless ($self, $packagename);
|
|
|
|
|
|
|
|
return($self);
|
|
}
|
|
|
|
sub get_next {
|
|
my $self = shift;
|
|
|
|
my $fh = $self->{fh};
|
|
|
|
while (my $line = <$fh>) {
|
|
if ($line =~ /^\d+\s/) {
|
|
return(PSL_entry->new($line));
|
|
}
|
|
}
|
|
|
|
return(undef); # no more lines
|
|
}
|
|
|
|
|
|
|
|
##################################################################################
|
|
package PSL_entry;
|
|
|
|
use strict;
|
|
use warnings;
|
|
use Carp;
|
|
|
|
|
|
sub new {
|
|
my $packagename = shift;
|
|
my ($psl_line) = @_;
|
|
|
|
my @fields = split(/\t/, $psl_line);
|
|
|
|
my $self = { fields => [@fields] };
|
|
|
|
bless ($self, $packagename);
|
|
|
|
return($self);
|
|
}
|
|
|
|
|
|
|
|
################
|
|
# blat format: # Q=cDNA T=genomic
|
|
################
|
|
|
|
# 0: match
|
|
# 1: mis-match
|
|
# 2: rep. match
|
|
# 3: N's
|
|
# 4: Q gap count
|
|
# 5: Q gap bases
|
|
# 6: T gap count
|
|
# 7: T gap bases
|
|
# 8: strand
|
|
# 9: Q name
|
|
# 10: Q size
|
|
# 11: Q start
|
|
# 12: Q end
|
|
# 13: T name
|
|
# 14: T size
|
|
# 15: T start
|
|
# 16: T end
|
|
# 17: block count
|
|
# 18: block Sizes
|
|
# 19: Q starts
|
|
# 20: T starts
|
|
# 21: Q seqs (pslx format)
|
|
# 22: T seqs (pslx format)
|
|
|
|
## All sequences start at 0 here; array-based.
|
|
|
|
|
|
sub get_line {
|
|
my $self = shift;
|
|
return(join("\t", @{$self->{fields}}));
|
|
}
|
|
|
|
|
|
sub get_match_count {
|
|
my $self = shift;
|
|
return($self->{fields}->[0]);
|
|
}
|
|
|
|
sub get_mismatch_count {
|
|
my $self = shift;
|
|
return($self->{fields}->[1]);
|
|
}
|
|
|
|
sub get_N_count {
|
|
my $self = shift;
|
|
return($self->{fields}->[3]);
|
|
}
|
|
|
|
sub get_Q_gap_count {
|
|
my $self = shift;
|
|
return($self->{fields}->[4]);
|
|
}
|
|
|
|
sub get_Q_gap_bases {
|
|
my $self = shift;
|
|
return($self->{fields}->[5]);
|
|
}
|
|
|
|
sub get_T_gap_count {
|
|
my $self = shift;
|
|
return($self->{fields}->[6]);
|
|
}
|
|
|
|
sub get_T_gap_bases {
|
|
my $self = shift;
|
|
return($self->{fields}->[7]);
|
|
}
|
|
|
|
sub get_strand {
|
|
my $self = shift;
|
|
return($self->{fields}->[8]);
|
|
}
|
|
|
|
sub get_Q_name {
|
|
my $self = shift;
|
|
return($self->{fields}->[9]);
|
|
}
|
|
|
|
sub get_Q_size {
|
|
my $self = shift;
|
|
return($self->{fields}->[10]);
|
|
}
|
|
|
|
sub get_Q_span {
|
|
my $self = shift;
|
|
return($self->{fields}->[11] + 1, $self->{fields}->[12]);
|
|
}
|
|
|
|
sub get_T_name {
|
|
my $self = shift;
|
|
return($self->{fields}->[13]);
|
|
}
|
|
|
|
sub get_T_size {
|
|
my $self = shift;
|
|
return($self->{fields}->[14]);
|
|
}
|
|
|
|
sub get_T_span {
|
|
my $self = shift;
|
|
return($self->{fields}->[15] + 1, $self->{fields}->[16]);
|
|
}
|
|
|
|
|
|
####
|
|
sub get_per_id {
|
|
my $self = shift;
|
|
|
|
my $matches = $self->get_match_count();
|
|
my $mismatches = $self->get_mismatch_count();
|
|
|
|
my $per_id = $matches / ($matches + $mismatches) * 100;
|
|
|
|
$per_id = sprintf("%.2f", $per_id);
|
|
|
|
return($per_id);
|
|
}
|
|
|
|
|
|
|
|
|
|
####
|
|
sub get_alignment_coords {
|
|
my $self = shift;
|
|
|
|
my @x = @{$self->{fields}};
|
|
|
|
|
|
my @alignment_segments;
|
|
|
|
|
|
my @cdna_coords = split (/,/, $x[19]);
|
|
my @genomic_coords = split (/,/, $x[20]);
|
|
my @lengths = split (/,/, $x[18]);
|
|
|
|
my $strand = $self->get_strand();
|
|
my $cdna_length = $self->get_Q_size();
|
|
|
|
my @ret_genome_coords;
|
|
my @ret_cdna_coords;
|
|
|
|
## report each segment match as a separate btab entry:
|
|
my $segment_number = 0;
|
|
my $num_segs = scalar(@genomic_coords);
|
|
for (my $i = 0; $i < $num_segs; $i++) {
|
|
$segment_number++;
|
|
my $length = $lengths[$i];
|
|
unless (defined $length) { next; }
|
|
my $cdna_coord = $cdna_coords[$i];
|
|
my $genomic_coord = $genomic_coords[$i];
|
|
my ($cdna_end5, $cdna_end3) = (++$cdna_coord, $cdna_coord + $length - 1);
|
|
my ($genomic_end5, $genomic_end3) = (++$genomic_coord, $genomic_coord + $length -1);
|
|
if ($strand eq "-") {
|
|
($cdna_end5, $cdna_end3) = ($cdna_length - $cdna_end5 + 1, $cdna_length - $cdna_end3 + 1);
|
|
}
|
|
|
|
push (@ret_genome_coords, [$genomic_end5, $genomic_end3]);
|
|
push (@ret_cdna_coords, [$cdna_end5, $cdna_end3]);
|
|
}
|
|
|
|
|
|
return(\@ret_genome_coords, \@ret_cdna_coords);
|
|
}
|
|
|
|
|
|
sub toString {
|
|
my $self = shift;
|
|
|
|
my $genome_acc = $self->get_T_name();
|
|
my $cdna_acc = $self->get_Q_name();
|
|
|
|
my $genome_length = $self->get_T_size();
|
|
my $cdna_length = $self->get_Q_size();
|
|
|
|
|
|
my $strand = $self->get_strand();
|
|
|
|
my ($genome_lend, $genome_rend) = $self->get_T_span();
|
|
my ($cdna_lend, $cdna_rend) = $self->get_Q_span();
|
|
|
|
my $per_id = sprintf("%.2f", $self->get_per_id());
|
|
|
|
my $ret_text = join("\t", $genome_acc, "($genome_length)", "$genome_lend-$genome_rend", $strand,
|
|
$cdna_acc, "($cdna_length)", "$cdna_lend-$cdna_rend", $per_id);
|
|
|
|
|
|
my ($genome_coords_aref, $cdna_coords_aref) = $self->get_alignment_coords();
|
|
|
|
my @genome_coords = @$genome_coords_aref;
|
|
my @cdna_coords = @$cdna_coords_aref;
|
|
|
|
my $align_text = "";
|
|
while (@genome_coords) {
|
|
my $genome_coordset = shift @genome_coords;
|
|
my $cdna_coordset = shift @cdna_coords;
|
|
|
|
if ($align_text) {
|
|
$align_text .= "....";
|
|
}
|
|
$align_text .= $genome_coordset->[0] . "(" . $cdna_coordset->[0] . ")-"
|
|
. $genome_coordset->[1] . "(" . $cdna_coordset->[1] . ")";
|
|
}
|
|
|
|
$ret_text .= "\n$align_text\n";
|
|
|
|
return ($ret_text);
|
|
}
|
|
|
|
|
|
1; #EOM
|
|
|
|
|
|
|
|
|
|
|
|
|