#!/usr/bin/env perl # written by # # Bob Freeman, Ph.D. # Acorn Worm Informatics, Kirschner lab # Dept of Systems Biology, Alpert 524 # Harvard Medical School # 200 Longwood Avenue # Boston, MA 02115 # 617/432.2294, vox # # bob_freeman@hms.harvard.edu =from_Bob I wrote a small script in Perl that gives a number of basic stats and can even generate a histogram of lengths for you. Script is attached. I typically use this for assessing the size / quality of my data. For example, after trimming a set of reads (prior to assembly), I used the command: fastq_stats.pl -i trimmed_reads.fastq -f "30,40,50,60,70,80,90,100" to assess resulting data, and my output is: Count27717551 Sum2347820942 Mean84.7052087 Min36 Max101 Median101 Q036 Q154 Q2101 Q3101 Q4101 36(min) <= 30.0:0 <= 40.0:183537 <= 50.0:283964 <= 60.0:8209709 <= 70.0:339854 <= 80.0:458850 <= 90.0:460218 <= 100.0:1271776 I believe I've modified the file to identify fasta, fastq, and protein fasta ( *.fasta, *.fa, *.mfasta, *.pro, and *.pep). Output can be in table-format also ( -t ). And use the -h switch to print out the (scarce) help info. It does use Bioperl and the Perl modules Statistics::Descriptive. Hope you find it useful! Bob =cut use strict; use Bio::SeqIO; use Carp; use Data::Dumper; use English qw ( -no_match_vars ); use Statistics::Descriptive; my $usage = qq ( fastq_stats.pl -i infile -f partition|bins -t reads in fastq or fasta data and spits out some descripting stats -i input file -f frequency distribution, either # partitions or a quoted list of comma-separated bins -t print results in table (tab-delimited) format ); # opens fastq or fasta data # reads in each sequence, grabbing data about each # then spits out output data $OUTPUT_AUTOFLUSH = 1; our @seqlen_data; our @hist_bins; our ($infile, $freqdist, $fdref); our $table = 0; parse_args(); read_seq_data2(); if (!$table) { output_stats(); } else { output_stats_table(); } exit; # # # END OF PROGRAM # sub parse_args { # while (scalar(@ARGV)) { my $arg = shift(@ARGV); if ($arg eq '-h') { die $usage; } elsif ($arg eq '-i') { $infile = shift(@ARGV); } elsif ($arg eq '-f') { $freqdist = shift(@ARGV); } elsif ($arg eq '-t') { $table = 1; } else { die "unknown argument '$arg'"; } } # check to ensure we have the required parameters if (!defined $infile) { print "Error from undefined input arguments!\n"; die $usage; } # parse frequency distribution stuff if (defined $freqdist) { $freqdist =~ s/ //g; @hist_bins = split (/,/, $freqdist); $freqdist = 1; } else { $freqdist = 0; } } sub read_seq_data { my ($file_suffix) = $infile =~ m/.+\.(\S+)$/; # create input SeqIO object my $seq_in = Bio::SeqIO->new( '-file' => "<$infile", '-format' => $file_suffix ); while (my $inseq = $seq_in->next_seq()) { push @seqlen_data, $inseq->length(); } #end of while } sub read_seq_data2 { if ($infile =~ m/(\.fasta$)|(\.fa$)|(\.mfasta$)|(\.mfa$)|(\.pro$)|(\.pep$)/) { read_fasta(); } elsif ($infile =~ m/(\.fastq$)|(\.fq$)/) { read_fastq(); } else { die "Error: Unknown file format for input file $infile!\n"; } } sub read_fasta { # read in fasta data using BioPerl methods. These are pretty quick. # create input SeqIO object my $seq_in = Bio::SeqIO->new( '-file' => "<$infile", '-format' => 'fasta' ); while (my $inseq = $seq_in->next_seq()) { push @seqlen_data, $inseq->length(); } #end of while } sub read_fastq { # read in fastq data in old-fashioned method, as it's faster than using BioPerl open (my $INFILE, "<$infile") || die "Error opening file $infile for reading: $!\n"; while ( my $line = <$INFILE> ) { #chomp $line; #next if $line =~ m/^\s|#/; if ($line =~ m/^@/) { # skip to next line and push length $line = <$INFILE>; chomp $line; push @seqlen_data, length($line); } } close $INFILE; } sub output_stats { print "\nSequence stats:\n"; my $stat = Statistics::Descriptive::Full->new(); $stat->add_data(@seqlen_data); print "Count\t", $stat->count(), "\n"; print "Sum\t", $stat->sum(), "\n"; print "Mean\t", $stat->mean(), "\n\n"; print "Min\t", $stat->min(), "\n"; print "Max\t", $stat->max(), "\n"; print "Median\t", $stat->median(), "\n\n"; print "Q0\t", $stat->quantile(0), "\n"; print "Q1\t", $stat->quantile(1), "\n"; print "Q2\t", $stat->quantile(2), "\n"; print "Q3\t", $stat->quantile(3), "\n"; print "Q4\t", $stat->quantile(4), "\n"; # do frequency distribution output if indicated if ($freqdist) { # call proper method if (scalar(@hist_bins) == 1) { $fdref = $stat->frequency_distribution_ref($hist_bins[0]); } else { $fdref = $stat->frequency_distribution_ref(\@hist_bins); } # now print it out print "\n", $stat->min(), "(min)\n"; for (sort {$a <=> $b} keys %$fdref) { #sprintf "<= %.1f: %8u \n", $_, "<= $_, \tcount = $fdref->{$_}\n"; printf "<= %5.1f:\t%8u \n", $_, $fdref->{$_}; } } } sub output_stats_table { # # do calculations my $stat = Statistics::Descriptive::Full->new(); $stat->add_data(@seqlen_data); if ($freqdist) { # call proper method if (scalar(@hist_bins) == 1) { $fdref = $stat->frequency_distribution_ref($hist_bins[0]); } else { $fdref = $stat->frequency_distribution_ref(\@hist_bins); } } # print header print "#Sequence stats:\n"; print join("\t", "#","Count","Sum","Mean","Min","Max","Median","Q0","Q1","Q2","Q3","Q4"); if ($freqdist) { for (sort {$a <=> $b} keys %$fdref) { #sprintf "<= %.1f: %8u \n", $_, "<= $_, \tcount = $fdref->{$_}\n"; print "\t<=", $_; } } print "\n"; #print results print $infile, "\t"; print $stat->count(), "\t"; print $stat->sum(), "\t"; print $stat->mean(), "\t"; print $stat->min(), "\t"; print $stat->max(), "\t"; print $stat->median(), "\t"; print $stat->quantile(0), "\t"; print $stat->quantile(1), "\t"; print $stat->quantile(2), "\t"; print $stat->quantile(3), "\t"; print $stat->quantile(4); if ($freqdist) { # now print it out for (sort {$a <=> $b} keys %$fdref) { #sprintf "<= %.1f: %8u \n", $_, "<= $_, \tcount = $fdref->{$_}\n"; print "\t", $fdref->{$_}; } } print "\n"; }