#!/usr/bin/env perl ## util/fastQ_rand_subset.pl ## Purpose: Extracts out a specific number of random reads from an ## input file, making sure that left and right ends of the selected ## reads are paired ## Usage: $0 left.fq right.fq num_entries ## use strict; use warnings; use FindBin; use lib ("$FindBin::RealBin/../../PerlLib"); use Fastq_reader; use File::Basename; use List::Util qw(shuffle); use Data::Dumper; my $usage = "usage: $0 left.fq right.fq num_entries num_total_records\n\n"; my $left_fq = $ARGV[0] or die $usage; my $right_fq = $ARGV[1] or die $usage; my $num_entries = $ARGV[2] or die $usage; my $num_total_records = $ARGV[3] or die $usage; main: { srand(); my @selected_indices = &get_random_indices($num_entries, $num_total_records); if (scalar(@selected_indices) != $num_entries) { die "Error, get_random_indices returned " . scalar(@selected_indices) . " instead of $num_total_records"; } else { print STDERR "-selected $num_entries indices. Now outputting selected records\n"; } my %selected = map { + $_ => 1 } @selected_indices; @selected_indices = (); print STDERR "Selecting $num_entries entries..."; my $left_fq_reader = new Fastq_reader($left_fq) or die("unable to open $left_fq for input"); my $right_fq_reader = new Fastq_reader($right_fq) or die("unable to open $right_fq for input");; my $num_M_entries = $num_entries/1e6; $num_M_entries .= "M"; my $base_left_fq = basename($left_fq); my $base_right_fq = basename($right_fq); open (my $left_ofh, ">$base_left_fq.$num_M_entries.fq") or die $!; open (my $right_ofh, ">$base_right_fq.$num_M_entries.fq") or die $!; my $counter = 0; while (my $left_entry = $left_fq_reader->next()) { my $right_entry = $right_fq_reader->next(); unless ($left_entry && $right_entry) { die "Error, didn't retrieve both left and right entries from file ($left_entry, $right_entry) "; } unless ($left_entry->get_core_read_name() eq $right_entry->get_core_read_name()) { die "Error, core read names don't match: " . "Left: " . $left_entry->get_core_read_name() . "\n" . "Right: " . $right_entry->get_core_read_name() . "\n"; } if ($selected{$counter}) { print $left_ofh $left_entry->get_fastq_record(); print $right_ofh $right_entry->get_fastq_record(); delete($selected{$counter}); } $counter++; if ($counter % 100000 == 0) { print STDERR "\r[$counter] "; } } print STDERR "\n\ndone.\n"; close $left_ofh; close $right_ofh; if (%selected) { die "Error, missing indices: " . Dumper(\%selected); } else { print STDERR "-all records located and output.\n"; } exit(0); } #### sub get_random_indices { my ($num_entries, $num_total_records) = @_; return( (shuffle 0..$num_total_records)[0..($num_entries-1)]); }