biyelunwen/99.scripts/trinity_utils/PerlLib/PSL_parser.pm

298 lines
5.1 KiB
Perl

package PSL_parser;
use strict;
use warnings;
use Carp;
sub new {
my $packagename = shift;
my ($psl_file) = @_;
my $fh;
if (ref $psl_file eq "IO::Handle") {
$fh = $psl_file; # a filehandle not a file
}
else {
unless (-e $psl_file) {
confess "error, cannot find $psl_file";
}
open ($fh, $psl_file) or confess "Error, cannot open file $psl_file";
}
my $self = { file => $psl_file,
fh => $fh,
};
bless ($self, $packagename);
return($self);
}
sub get_next {
my $self = shift;
my $fh = $self->{fh};
while (my $line = <$fh>) {
if ($line =~ /^\d+\s/) {
return(PSL_entry->new($line));
}
}
return(undef); # no more lines
}
##################################################################################
package PSL_entry;
use strict;
use warnings;
use Carp;
sub new {
my $packagename = shift;
my ($psl_line) = @_;
my @fields = split(/\t/, $psl_line);
my $self = { fields => [@fields] };
bless ($self, $packagename);
return($self);
}
################
# blat format: # Q=cDNA T=genomic
################
# 0: match
# 1: mis-match
# 2: rep. match
# 3: N's
# 4: Q gap count
# 5: Q gap bases
# 6: T gap count
# 7: T gap bases
# 8: strand
# 9: Q name
# 10: Q size
# 11: Q start
# 12: Q end
# 13: T name
# 14: T size
# 15: T start
# 16: T end
# 17: block count
# 18: block Sizes
# 19: Q starts
# 20: T starts
# 21: Q seqs (pslx format)
# 22: T seqs (pslx format)
## All sequences start at 0 here; array-based.
sub get_line {
my $self = shift;
return(join("\t", @{$self->{fields}}));
}
sub get_match_count {
my $self = shift;
return($self->{fields}->[0]);
}
sub get_mismatch_count {
my $self = shift;
return($self->{fields}->[1]);
}
sub get_N_count {
my $self = shift;
return($self->{fields}->[3]);
}
sub get_Q_gap_count {
my $self = shift;
return($self->{fields}->[4]);
}
sub get_Q_gap_bases {
my $self = shift;
return($self->{fields}->[5]);
}
sub get_T_gap_count {
my $self = shift;
return($self->{fields}->[6]);
}
sub get_T_gap_bases {
my $self = shift;
return($self->{fields}->[7]);
}
sub get_strand {
my $self = shift;
return($self->{fields}->[8]);
}
sub get_Q_name {
my $self = shift;
return($self->{fields}->[9]);
}
sub get_Q_size {
my $self = shift;
return($self->{fields}->[10]);
}
sub get_Q_span {
my $self = shift;
return($self->{fields}->[11] + 1, $self->{fields}->[12]);
}
sub get_T_name {
my $self = shift;
return($self->{fields}->[13]);
}
sub get_T_size {
my $self = shift;
return($self->{fields}->[14]);
}
sub get_T_span {
my $self = shift;
return($self->{fields}->[15] + 1, $self->{fields}->[16]);
}
####
sub get_per_id {
my $self = shift;
my $matches = $self->get_match_count();
my $mismatches = $self->get_mismatch_count();
my $per_id = $matches / ($matches + $mismatches) * 100;
$per_id = sprintf("%.2f", $per_id);
return($per_id);
}
####
sub get_alignment_coords {
my $self = shift;
my @x = @{$self->{fields}};
my @alignment_segments;
my @cdna_coords = split (/,/, $x[19]);
my @genomic_coords = split (/,/, $x[20]);
my @lengths = split (/,/, $x[18]);
my $strand = $self->get_strand();
my $cdna_length = $self->get_Q_size();
my @ret_genome_coords;
my @ret_cdna_coords;
## report each segment match as a separate btab entry:
my $segment_number = 0;
my $num_segs = scalar(@genomic_coords);
for (my $i = 0; $i < $num_segs; $i++) {
$segment_number++;
my $length = $lengths[$i];
unless (defined $length) { next; }
my $cdna_coord = $cdna_coords[$i];
my $genomic_coord = $genomic_coords[$i];
my ($cdna_end5, $cdna_end3) = (++$cdna_coord, $cdna_coord + $length - 1);
my ($genomic_end5, $genomic_end3) = (++$genomic_coord, $genomic_coord + $length -1);
if ($strand eq "-") {
($cdna_end5, $cdna_end3) = ($cdna_length - $cdna_end5 + 1, $cdna_length - $cdna_end3 + 1);
}
push (@ret_genome_coords, [$genomic_end5, $genomic_end3]);
push (@ret_cdna_coords, [$cdna_end5, $cdna_end3]);
}
return(\@ret_genome_coords, \@ret_cdna_coords);
}
sub toString {
my $self = shift;
my $genome_acc = $self->get_T_name();
my $cdna_acc = $self->get_Q_name();
my $genome_length = $self->get_T_size();
my $cdna_length = $self->get_Q_size();
my $strand = $self->get_strand();
my ($genome_lend, $genome_rend) = $self->get_T_span();
my ($cdna_lend, $cdna_rend) = $self->get_Q_span();
my $per_id = sprintf("%.2f", $self->get_per_id());
my $ret_text = join("\t", $genome_acc, "($genome_length)", "$genome_lend-$genome_rend", $strand,
$cdna_acc, "($cdna_length)", "$cdna_lend-$cdna_rend", $per_id);
my ($genome_coords_aref, $cdna_coords_aref) = $self->get_alignment_coords();
my @genome_coords = @$genome_coords_aref;
my @cdna_coords = @$cdna_coords_aref;
my $align_text = "";
while (@genome_coords) {
my $genome_coordset = shift @genome_coords;
my $cdna_coordset = shift @cdna_coords;
if ($align_text) {
$align_text .= "....";
}
$align_text .= $genome_coordset->[0] . "(" . $cdna_coordset->[0] . ")-"
. $genome_coordset->[1] . "(" . $cdna_coordset->[1] . ")";
}
$ret_text .= "\n$align_text\n";
return ($ret_text);
}
1; #EOM