biyelunwen/99.scripts/trinity_utils/PerlLib/PSL_parser.pm

package PSL_parser;

use strict;
use warnings;

use Carp;

sub new {
	my $packagename = shift;

	my ($psl_file) = @_;

	my $fh;

	if (ref $psl_file eq "IO::Handle") {
		$fh = $psl_file; # a filehandle not a file
	}
	else {


		unless (-e $psl_file) {
			confess "error, cannot find $psl_file";
		}
		open ($fh, $psl_file) or confess "Error, cannot open file $psl_file";
	}


	my $self = { file => $psl_file,
				 fh => $fh,
	};

	bless ($self, $packagename);


	return($self);
}

sub get_next {
	my $self = shift;

	my $fh = $self->{fh};

	while (my $line = <$fh>) {
		if ($line =~ /^\d+\s/) {
			return(PSL_entry->new($line));
		}
	}

	return(undef); # no more lines
}


##################################################################################
package PSL_entry;

use strict;
use warnings;
use Carp;


sub new {
	my $packagename = shift;
	my ($psl_line) = @_;

	my @fields = split(/\t/, $psl_line);

	my $self = { fields => [@fields] };

	bless ($self, $packagename);

	return($self);
}


################
# blat format: # Q=cDNA T=genomic
################

#  0: match
#  1: mis-match
#  2: rep. match
#  3: N's
#  4: Q gap count
#  5: Q gap bases
#  6: T gap count
#  7: T gap bases
#  8: strand
#  9: Q name
# 10: Q size
# 11: Q start
# 12: Q end
# 13: T name
# 14: T size
# 15: T start
# 16: T end
# 17: block count
# 18: block Sizes
# 19: Q starts
# 20: T starts
# 21: Q seqs (pslx format)
# 22: T seqs (pslx format)

## All sequences start at 0 here; array-based.


sub get_line {
	my $self = shift;
	return(join("\t", @{$self->{fields}}));
}


sub get_match_count {
	my $self = shift;
	return($self->{fields}->[0]);
}

sub get_mismatch_count {
	my $self = shift;
	return($self->{fields}->[1]);
}

sub get_N_count {
	my $self = shift;
	return($self->{fields}->[3]);
}

sub get_Q_gap_count {
	my $self = shift;
	return($self->{fields}->[4]);
}

sub get_Q_gap_bases {
	my $self = shift;
	return($self->{fields}->[5]);
}

sub get_T_gap_count {
	my $self = shift;
	return($self->{fields}->[6]);
}

sub get_T_gap_bases {
	my $self = shift;
	return($self->{fields}->[7]);
}

sub get_strand {
	my $self = shift;
	return($self->{fields}->[8]);
}

sub get_Q_name {
	my $self = shift;
	return($self->{fields}->[9]);
}

sub get_Q_size {
	my $self = shift;
	return($self->{fields}->[10]);
}

sub get_Q_span {
	my $self = shift;
	return($self->{fields}->[11] + 1, $self->{fields}->[12]);
}

sub get_T_name {
	my $self = shift;
	return($self->{fields}->[13]);
}

sub get_T_size {
	my $self = shift;
	return($self->{fields}->[14]);
}

sub get_T_span {
	my $self = shift;
	return($self->{fields}->[15] + 1, $self->{fields}->[16]);
}


####
sub get_per_id {
	my $self = shift;

	my $matches = $self->get_match_count();
	my $mismatches = $self->get_mismatch_count();

	my $per_id = $matches / ($matches + $mismatches) * 100;

	$per_id = sprintf("%.2f", $per_id);

	return($per_id);
}


####
sub get_alignment_coords {
	my $self = shift;

	my @x = @{$self->{fields}};


    my @alignment_segments;


    my @cdna_coords = split (/,/, $x[19]);
    my @genomic_coords = split (/,/, $x[20]);
    my @lengths = split (/,/, $x[18]);

	my $strand = $self->get_strand();
	my $cdna_length = $self->get_Q_size();

	my @ret_genome_coords;
	my @ret_cdna_coords;

    ## report each segment match as a separate btab entry:
    my $segment_number = 0;
    my $num_segs = scalar(@genomic_coords);
	for (my $i = 0; $i < $num_segs; $i++) {
		$segment_number++;
		my $length = $lengths[$i];
		unless (defined $length) { next; }
		my $cdna_coord = $cdna_coords[$i];
		my $genomic_coord = $genomic_coords[$i];
		my ($cdna_end5, $cdna_end3) = (++$cdna_coord, $cdna_coord + $length - 1);
		my ($genomic_end5, $genomic_end3) = (++$genomic_coord, $genomic_coord + $length -1);
		if ($strand eq "-") {
			($cdna_end5, $cdna_end3) = ($cdna_length - $cdna_end5 + 1, $cdna_length - $cdna_end3 + 1);
		}

		push (@ret_genome_coords, [$genomic_end5, $genomic_end3]);
		push (@ret_cdna_coords, [$cdna_end5, $cdna_end3]);
	}


	return(\@ret_genome_coords, \@ret_cdna_coords);
}


sub toString {
	my $self = shift;

	my $genome_acc = $self->get_T_name();
	my $cdna_acc = $self->get_Q_name();

	my $genome_length = $self->get_T_size();
	my $cdna_length = $self->get_Q_size();


	my $strand = $self->get_strand();

	my ($genome_lend, $genome_rend) = $self->get_T_span();
	my ($cdna_lend, $cdna_rend) = $self->get_Q_span();

	my $per_id = sprintf("%.2f", $self->get_per_id());

	my $ret_text = join("\t", $genome_acc, "($genome_length)", "$genome_lend-$genome_rend", $strand,
						$cdna_acc, "($cdna_length)", "$cdna_lend-$cdna_rend", $per_id);


	my ($genome_coords_aref, $cdna_coords_aref) = $self->get_alignment_coords();

	my @genome_coords = @$genome_coords_aref;
	my @cdna_coords = @$cdna_coords_aref;

	my $align_text = "";
	while (@genome_coords) {
		my $genome_coordset = shift @genome_coords;
		my $cdna_coordset = shift @cdna_coords;

		if ($align_text) {
			$align_text .= "....";
		}
		$align_text .= $genome_coordset->[0] . "(" . $cdna_coordset->[0] . ")-"
			. $genome_coordset->[1] . "(" . $cdna_coordset->[1] . ")";
	}

	$ret_text .= "\n$align_text\n";

	return ($ret_text);
}


1; #EOM