100 lines
2.7 KiB
Perl
100 lines
2.7 KiB
Perl
#!/usr/bin/env perl
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
use FindBin;
|
|
use lib ("$FindBin::RealBin/../../PerlLib");
|
|
use Fastq_reader;
|
|
use File::Basename;
|
|
|
|
my $usage = "usage: $0 left.fq right.fq num_entries\n\n";
|
|
|
|
my $left_fq = $ARGV[0] or die $usage;
|
|
my $right_fq = $ARGV[1] or die $usage;
|
|
my $num_to_sample = $ARGV[2] or die $usage;
|
|
|
|
srand();
|
|
|
|
main: {
|
|
|
|
## do reservoir sampling on indices instead of the fastq records themselves, then just extract the selected ones.
|
|
|
|
my $num_fq_entries = &get_num_fq_entries($left_fq);
|
|
my @selected_entries = (1..$num_to_sample);
|
|
|
|
for (my $i = $num_to_sample + 1; $i <= $num_fq_entries; $i++) {
|
|
|
|
my $rand_pos = rand($i);
|
|
if ($rand_pos < $num_to_sample) {
|
|
$selected_entries[$rand_pos] = $i;
|
|
}
|
|
}
|
|
|
|
my %indices_want = map { + $_ => 1 } @selected_entries;
|
|
@selected_entries = (); # free
|
|
|
|
|
|
my $left_fq_reader = new Fastq_reader($left_fq) or die("unable to open $left_fq for input");
|
|
my $right_fq_reader = new Fastq_reader($right_fq) or die("unable to open $right_fq for input");;
|
|
|
|
my $num_M_entries = $num_to_sample/1e6;
|
|
$num_M_entries .= "M";
|
|
my $base_left_fq = basename($left_fq);
|
|
my $base_right_fq = basename($right_fq);
|
|
|
|
open (my $left_ofh, ">$base_left_fq.$num_M_entries.fq") or die $!;
|
|
open (my $right_ofh, ">$base_right_fq.$num_M_entries.fq") or die $!;
|
|
|
|
my $fq_index = 0;
|
|
while (my $left_entry = $left_fq_reader->next()) {
|
|
my $right_entry = $right_fq_reader->next();
|
|
|
|
unless ($left_entry && $right_entry) {
|
|
die "Error, didn't retrieve both left and right entries from file ($left_entry, $right_entry) ";
|
|
}
|
|
unless ($left_entry->get_core_read_name() eq $right_entry->get_core_read_name()) {
|
|
die "Error, core read names don't match: "
|
|
. "Left: " . $left_entry->get_core_read_name() . "\n"
|
|
. "Right: " . $right_entry->get_core_read_name() . "\n";
|
|
}
|
|
|
|
$fq_index++;
|
|
|
|
if ($indices_want{$fq_index}) {
|
|
print $left_ofh $left_entry->get_fastq_record();
|
|
print $right_ofh $right_entry->get_fastq_record();
|
|
}
|
|
}
|
|
|
|
print STDERR " done.\n";
|
|
|
|
close $left_ofh;
|
|
close $right_ofh;
|
|
|
|
exit(0);
|
|
}
|
|
|
|
####
|
|
sub get_num_fq_entries {
|
|
my ($fastq_file) = @_;
|
|
|
|
unless (-s $fastq_file) {
|
|
die "Error, cannot locate file $fastq_file";
|
|
}
|
|
|
|
my $linecount;
|
|
if ($fastq_file =~ /\.gz$/) {
|
|
$linecount = `gunzip -c $fastq_file | wc -l`;
|
|
}
|
|
else {
|
|
$linecount = `cat $fastq_file | wc -l`;
|
|
}
|
|
chomp $linecount;
|
|
|
|
my $num_records = $linecount / 4;
|
|
|
|
return($num_records);
|
|
}
|
|
|