biyelunwen/99.scripts/trinity_utils/PerlLib/DelimParser.pm

292 lines
5.6 KiB
Perl

#!/usr/bin/env perl
# classes for DelimParser::Reader and DelimParser::Writer
package DelimParser;
use strict;
use warnings;
use Carp;
####
sub new {
my ($packagename, $fh, $delimiter) = @_;
unless ($fh && $delimiter) {
confess "Error, need filehandle and delimiter params";
}
my $self = { _delim => $delimiter,
_fh => $fh,
# set below in _init()
_column_headers => [],
};
bless ($self, $packagename);
return($self);
}
####
sub get_fh {
my $self = shift;
return($self->{_fh});
}
####
sub get_delim {
my $self = shift;
return($self->{_delim});
}
####
sub get_column_headers {
my $self = shift;
return(@{$self->{_column_headers}});
}
####
sub set_column_headers {
my $self = shift;
my (@columns) = @_;
$self->{_column_headers} = \@columns;
return;
}
####
sub get_num_columns {
my $self = shift;
return(length($self->get_column_headers()));
}
###
sub reconstruct_header_line {
my $self = shift;
my @column_headers = $self->get_column_headers();
my $header_line = join("\t", @column_headers);
return($header_line);
}
###
sub reconstruct_line_from_row {
my $self = shift;
my $row_href = shift;
unless ($row_href && ref $row_href) {
confess "Error, must set row_href as param";
}
my @column_headers = $self->get_column_headers();
my @vals;
foreach my $col_header (@column_headers) {
my $val = $row_href->{$col_header};
push (@vals, $val);
}
my $row_text = join("\t", @vals);
return($row_text);
}
##################################################
package DelimParser::Reader;
use strict;
use warnings;
use Carp;
use Data::Dumper;
our @ISA;
push (@ISA, 'DelimParser');
sub new {
my ($packagename) = shift;
my $self = $packagename->DelimParser::new(@_);
$self->_init();
return($self);
}
####
sub _init {
my $self = shift;
my $fh = $self->get_fh();
my $delim = $self->get_delim();
my $header_row = <$fh>;
chomp $header_row;
unless ($header_row) {
confess "Error, no header row read.";
}
my @fields = split(/$delim/, $header_row);
$self->set_column_headers(@fields);
return;
}
####
sub get_row {
my $self = shift;
my $fh = $self->get_fh();
my $line = <$fh>;
unless ($line) {
return(undef); # eof
}
my $delim = $self->get_delim();
my @fields = split(/$delim/, $line);
chomp $fields[$#fields]; ## it's important that this is done after the delimiter splitting in case the last field is actually empty.
my @column_headers = $self->get_column_headers();
my $num_col = scalar (@column_headers);
my $num_fields = scalar(@fields);
if ($num_col != $num_fields) {
confess "Error, line: [$line] " . Dumper(\@fields) . " is lacking $num_col fields: " . Dumper(\@column_headers);
}
my %dict;
foreach my $colname (@column_headers) {
my $field = shift @fields;
$dict{$colname} = $field;
}
return(\%dict);
}
####
sub get_row_val {
my ($self, $row_href, $key) = @_;
if (! exists $row_href->{$key}) {
confess "Error, row: " . Dumper($row_href) . " doesn't include key: [$key]";
}
return($row_href->{$key});
}
##################################################
package DelimParser::Writer;
use strict;
use warnings;
use Carp;
our @ISA;
push (@ISA, 'DelimParser');
sub new {
my ($packagename) = shift;
my ($ofh, $delim, $column_fields_aref, $FLAGS) = @_;
## FLAGS can be:
# NO_WRITE_HEADER|...
unless (ref $column_fields_aref eq 'ARRAY') {
confess "Error, need constructor params: ofh, delim, column_fields_aref";
}
my $self = $packagename->DelimParser::new($ofh, $delim);
$self->_initialize($column_fields_aref, $FLAGS);
return($self);
}
####
sub _initialize {
my $self = shift;
my $column_fields_aref = shift;
my $FLAGS = shift;
unless (ref $column_fields_aref eq 'ARRAY') {
confess "Error, require column_fields_aref as param";
}
my $ofh = $self->get_fh();
my $delim = $self->get_delim();
$self->set_column_headers(@$column_fields_aref);
unless (defined($FLAGS) && $FLAGS =~ /NO_WRITE_HEADER/) {
my $output_line = join($delim, @$column_fields_aref);
print $ofh "$output_line\n";
}
return;
}
####
sub write_row {
my $self = shift;
my $dict_href = shift;
unless (ref $dict_href eq "HASH") {
confess "Error, need dict_href as param";
}
my $num_dict_fields = scalar(keys %$dict_href);
my @column_headers = $self->get_column_headers();
my $delim = $self->get_delim();
my @out_fields;
for my $column_header (@column_headers) {
my $field = $dict_href->{$column_header};
unless (defined $field) {
confess "Error, missing value for required column field: $column_header";
}
if ($field =~ /$delim/) {
# don't allow any delimiters to contaminate the field value, otherwise it'll introduce offsets.
$field =~ s/$delim/ /g;
}
# also avoid newlines, which will also break the output formatting.
if ($field =~ /\n/) {
$field =~ s/\n/ /g;
}
push (@out_fields, $field);
}
my $outline = join("\t", @out_fields);
my $ofh = $self->get_fh();
print $ofh "$outline\n";
return;
}
1; #EOM