biyelunwen/99.scripts/trinity_utils/PerlLib/WigParser.pm

203 lines
3.7 KiB
Perl

package WigParser;
use strict;
use warnings;
use Carp;
####
sub new {
my $packagename = shift;
my ($wig_file) = @_;
unless ($wig_file) {
confess "Error, need wig_filename as parameter to constructor";
}
my $self = { wig_file => $wig_file,
wig_file_fh => undef,
contig_to_seek_pos_href => {},
};
bless ($self, $packagename);
$self->_init_wig_info();
return($self);
}
####
sub _init_wig_info {
my $self = shift;
my $wig_file = $self->{wig_file};
open (my $fh, $wig_file) or die "Error, cannot open file $wig_file";
$self->{wig_file_fh} = $fh;
while (<$fh>) {
if (/chrom=(\S+)/) {
my $scaff = $1;
my $filepos = tell($fh);
$self->{contig_to_seek_pos_href}->{$scaff} = $filepos;
}
}
return;
}
####
sub get_contig_list {
my $self = shift;
my @contigs = keys %{$self->{contig_to_seek_pos_href}};
return(@contigs);
}
####
sub get_wig_array {
my $self = shift;
my ($contig, $extended_flag) = @_;
my $seekpos = $self->{contig_to_seek_pos_href}->{$contig};
unless (defined $seekpos) {
confess "Error, cannot find seek position entry for contig: $contig";
}
my $fh = $self->{wig_file_fh};
seek($fh, $seekpos, 0);
#print STDERR "-retrieving wig array for $contig at pos: $seekpos\n";
my @wig_array;
while (<$fh>) {
if (/chrom=/) { last; }
chomp;
my ($pos, $val, @rest) = split(/\s+/);
if ($pos > 1e12) {
confess "Error, position $pos is out of range of max contig position value set at 1e12";
}
if ($extended_flag) {
$wig_array[$pos] = [$val, @rest];
}
else {
$wig_array[$pos] = $val;
}
}
# fill in missing entries
foreach my $val (@wig_array) {
if (! defined ($val)) {
$val = 0;
}
}
return(@wig_array);
}
#####################################################################
## Static method that uses LOTS of memory (original implementation)
sub parse_wig {
my ($wig_file) = @_;
my %scaff_to_coverage;
print STDERR "-retrieving max positions per scaffold\n";
my %max_vals_for_scaffs = &_parse_max_scaff_lengths($wig_file);
print STDERR "-preallocating memory for coverage\n";
## preallocate arrays for coverage
my $sum_pos = 0;
foreach my $scaff (keys %max_vals_for_scaffs) {
my $max_val = $max_vals_for_scaffs{$scaff};
my @cov_vals = ();
$#cov_vals = $max_val;
for my $index (0..$max_val) {
$cov_vals[$index] = int(0);
}
$scaff_to_coverage{$scaff} = \@cov_vals;
$sum_pos += $max_val;
}
print STDERR "- $sum_pos bases represented\n";
print STDERR "-populating coverage data into memory.\n";
my $scaff = undef;
open (my $fh, $wig_file) or die "Error, cannot open file $wig_file";
while (<$fh>) {
if (/^track/) { next; };
if (/chrom=(\S+)/) {
$scaff = $1;
next;
}
chomp;
my ($pos, $val) = split(/\s+/);
$scaff_to_coverage{$scaff}->[$pos] = $val;
}
close $fh;
return(%scaff_to_coverage);
}
### Private
sub _parse_max_scaff_lengths {
my ($wig_file) = @_;
my %scaff_to_max_vals;
my $scaff = "";
my $counter = 0;
open (my $fh, $wig_file) or die "Error, cannot open file $wig_file";
while (<$fh>) {
if (/^track/) { next; };
if (/chrom=(\S+)/) {
$scaff = $1;
next;
}
chomp;
my ($pos, $val) = split(/\s+/);
$scaff_to_max_vals{$scaff} = $pos;
#print "scaff: $scaff\t$pos=> $val\n";
$counter++;
#if ($counter > 100000) { last; }
}
close $fh;
return(%scaff_to_max_vals);
}
1; #EOM