biyelunwen/99.scripts/trinity_utils/util/PBS/N50stats.pl

#!/usr/bin/env perl

=pod

=head1 NAME

=head1 USAGE

	-in input file in FASTA/Q or posmap length file (e.g. posmap.scflen)
	-genome genome size in bp for estimating N lengths and indexes
	-single FASTA/Q has sequence in a single line (faster)
	-overwrite => Force overwrite
	-reads	=> Force processing as read data (no N50 statistics)
	-noreads => Force as not being read data. Good for cDNA assemblies with short contigs

=head1 AUTHORS

 Alexie Papanicolaou 1

	Ecosystem Sciences, CSIRO, Black Mountain Labs, Clunies Ross Str, Canberra, Australia
	alexie@butterflybase.org

=head1 DISCLAIMER & LICENSE

This software is released under the GNU General Public License version 3 (GPLv3).
It is provided "as is" without warranty of any kind.
You can find the terms and conditions at http://www.opensource.org/licenses/gpl-3.0.html.
Please note that incorporating the whole software or parts of its code in proprietary software
is prohibited under the current license.

=head1 BUGS & LIMITATIONS

None known so far.

=cut

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use Statistics::Descriptive;
use Bio::SeqIO;
$|=1;

my (@infiles,$user_genome_size,$is_fasta,$is_fastq,$is_single,$overwrite,$is_reads,$isnot_reads);
GetOptions(
	'in=s{,}'    => \@infiles,
	'single' =>\$is_single,
	'genome:s' => \$user_genome_size,
	'overwrite' => \$overwrite,
	'reads'	=>\$is_reads,
	'noreads'	=>\$isnot_reads,
);
if (!@infiles){
	@infiles = @ARGV;
}
pod2usage "No input files!\n" if !@infiles;
die "Cannot ask for both reads and noreads options at the same time!\n" if $is_reads && $isnot_reads;

if ($is_reads && !$isnot_reads){
	print "Processing all data as reads\n";
}
if ($user_genome_size && $user_genome_size=~/\D/){
	if ($user_genome_size=~/^(\d+)kb$/i){
		$user_genome_size=int($1.'000');
	}
	elsif ($user_genome_size=~/^(\d+)mb$/i){
		$user_genome_size=int($1.'000000');
	}
	elsif ($user_genome_size=~/^(\d+)gb$/i){
		$user_genome_size=int($1.'000000000');
	}
	print "Genome set to ".&thousands($user_genome_size)." b.p.\n";
}

foreach my $infile (@infiles){
	unless ($infile && -s $infile){warn("I need a posmap length file, e.g. .posmap.scflen for scaffolds\n");pod2usage;}
	my $outfile=$infile.'.n50';
	$outfile.='g' if ($user_genome_size);
	warn ("Outfile $outfile already exists\n") if -s $outfile && !$overwrite;
	next  if -s $outfile && !$overwrite;
	my $total=int(0);
	my $gaps = int(0);
	my $seq_ref;
	my @head=`head $infile`;
	foreach (@head){
		if ($_=~/^>\S/){
			$is_fasta=1;
			print "FASTA file found!\n";
			last;
		}elsif($_=~/^@\S/){
			$is_fastq=1;
			print "FASTQ file found!\n";
			last;
		}
	}

	print "Parsing file $infile...\n";
	if ($is_fasta){
		($total,$gaps,$seq_ref) = &process_fasta($infile);
	}
	elsif($is_fastq){
		($total,$gaps,$seq_ref) = &process_fastq($infile);
	}
	else {
		($total,$gaps,$seq_ref) = &process_csv($infile);
	}
	print "Preparing stats...\n";
	my ($mean,$n50,$n10,$n25,$n50_length,$n10_length,$n25_length,$scaffolds,$scaffolds_size,$smallest,$largest,$sequence_number,$sum,$genome_size) = &process_stats($seq_ref,$total);

	if ($mean){
		open (OUT,">".$outfile);
		my $stat = Statistics::Descriptive::Full->new();
		$stat->add_data($seq_ref);
		#my $skew='';sprintf("%.2f",$stat->skewness());
		my $mean = sprintf("%.2f",$mean);
		my $median = $stat->median();
		my $var = sprintf("%.2f",$stat->variance());
		my $sd = sprintf("%.2f",$stat->standard_deviation());
		if (!$scaffolds || $scaffolds == 0){
			$scaffolds=$sequence_number;
			$scaffolds_size=$smallest;
		}
		print OUT "File: $infile\n";
		print OUT "TOTAL: ".&thousands($total)." bp in ".&thousands($sequence_number)." sequences\n";
		print OUT "\tof which ".&thousands($gaps)." are Ns/gaps.\n";
		print OUT "Mean: ".&thousands($mean)."\nStdev: ".&thousands($sd)."\n";
		print OUT "Median: ".&thousands($median)."\n";
		print OUT "Smallest: ".&thousands($smallest)."\nLargest: ".&thousands($largest)."\n";
		if (($mean >=1000 && !$is_reads) || $isnot_reads){
			print OUT "N10 length: ".&thousands($n10_length)."\nN10 Number: ".&thousands($n10)."\n";
			print OUT "N25 length: ".&thousands($n25_length)."\nN25 Number: ".&thousands($n25)."\n";
			print OUT "N50 length: ".&thousands($n50_length)."\nN50 Number: ".&thousands($n50)."\n";
			print OUT "Assuming a genome size of "
				.&thousands($user_genome_size)
				." then the top ".&thousands($scaffolds)
				." account for it (min "
				.&thousands($scaffolds_size)
				." bp)\n" if $user_genome_size;
		}else{
			print OUT "Reads found! Read coverage estimated to ".sprintf("%.2f",$total/$user_genome_size)."x using user provided genome size of ".&thousands($user_genome_size)."\n" if $user_genome_size;
		}
		close (OUT);
		print "Done, see $outfile\n";
		system("cat $outfile");
	}else {
		open (OUT,">".$outfile);
		print OUT "File: $infile\n";
		print OUT "TOTAL: $total bp in $sequence_number sequences\n";
		close (OUT);
		warn "Non fatal warning: Something went wrong in estimating the statistics. Maybe the provided genome length is much larger than sequence length or maybe less than 3 sequences provided?\n";
	}
}
########################################################################
sub process_fasta(){
	print "Processing as FASTA\n";
	my $infile=shift;
	my @array ;
	my $total=int(0);
	my $gaps=int(0);
	my $counter = int(0);

	if ($is_single){
		open (IN,$infile)||die;
		while (my $seq_id=<IN>) {
	                my $seq=<IN>;
	                my $length=length($seq)-1; # newline
			$counter+=length($seq_id)+$length+1;
	                next unless $length;
			$gaps+=($seq=~tr/[N\-]//);
	                push(@array,$length);
        	        $total+=$length;
		}
		close IN;
	}else{
		my $filein = new Bio::SeqIO(-file=>$infile , -format=>'fasta');
		while (my $seq_obj=$filein->next_seq()) {
			$counter+=length($seq_obj->seq().$seq_obj->description().' '.$seq_obj->id()) if $seq_obj->seq();
			my $length=$seq_obj->length();
			my $seq=$seq_obj->seq();
			$gaps+=($seq=~tr/[N\-]//);
			next unless $length;
			push(@array,$length);
	        	$total+=$length;
	        }
	}
	print "\n";
	die "No data found or wrong format\n" unless $total;
	return ($total,$gaps,\@array);
}
sub process_fastq(){
	print "Processing as FASTQ\n";
	my $infile=shift;
	my @array ;
	my $total=int(0);
	my $gaps=int(0);
	my $counter = int(0);
	open (IN,$infile);
	while (my $seq_id=<IN>) {
		my $seq=<IN>;
		my $scrap=<IN>.<IN>;
		my $length=length($seq)-1; #newline
		$counter+=length($seq_id)+$length+1;
		next unless $length;
		$gaps+=($seq=~tr/[N\-]//);
                push(@array,$length);
                $total+=$length;
       }
	close IN;
	print "\n";
	die "No data found or wrong format\n" unless $total;
	return ($total,$gaps,\@array);
}
sub process_csv(){
	print "Processing as CSV\n";
	my $infile=shift;
	my @array ;
	my $total=int(0);
	my $gaps='N/A';
	my $counter = int(0);
	open (IN,$infile)||die ("Cannot open $infile\n");
	while (my $ln=<IN>){
		$counter+=length($ln);
		$ln=~/(\d+)$/;
		next unless $1;
		my $length= $1;
		push(@array,$1);
		$total+=$length;
	}
	close IN;
	die "No data found or wrong format\n" unless $total;
	return ($total,$gaps,\@array);
}

sub process_stats(){
	my $sequences_ref = shift;
	my $total = shift;
	my $genome_size = int(0);
	my $mean = $total / scalar(@$sequences_ref);
	my ($n50,$n10,$n25,$n50_length,$n10_length,$n25_length,$scaffolds,$scaffolds_size,$smallest,$largest,$sequence_number,$sum);
	print "Sorting...";
	my @sequences=sort{$b<=>$a} @$sequences_ref;
	$smallest=$sequences[-1];
	$largest=$sequences[0];
	print " done!\n";
	$|=0;
	if (($mean < 1000 && !$isnot_reads) || $is_reads ){
		print "Reads detected. Ignoring N* calculations.\n";
		$genome_size=$total;
		$sequence_number = scalar(@$sequences_ref);
		return ($mean,$n50,$n10,$n25,$n50_length,$n10_length,$n25_length,$scaffolds,$scaffolds_size,$smallest,$largest,$sequence_number,$sum) if $mean <1000;
	}
	elsif (!$user_genome_size){
		print "Setting genome size for N* calculations to total consensus $total\n";
		$genome_size=$total;
	}elsif($user_genome_size){
		print "Setting genome size for N* calculations to user defined $user_genome_size\n";
		$genome_size = $user_genome_size;
	}

	foreach my $sequence_length ( @sequences){
		$sum+=$sequence_length;
		$sequence_number++;
		if($sum >= $genome_size*0.1 && !$n10){
			$n10=$sequence_number;
			$n10_length=$sequence_length;
		}
		elsif($sum >= $genome_size*0.25 && !$n25){
			$n25=$sequence_number;
			$n25_length=$sequence_length;
		}
		elsif($sum >= $genome_size*0.5 && !$n50){
			$n50 = $sequence_number;
			$n50_length=$sequence_length;
		}elsif ($sum >= $genome_size && !$scaffolds){
			$scaffolds = $sequence_number;
			$scaffolds_size = $sequence_length;
		}

	}
	print "Processed $sequence_number sequences\n";
	return ($mean,$n50,$n10,$n25,$n50_length,$n10_length,$n25_length,$scaffolds,$scaffolds_size,$smallest,$largest,$sequence_number,$sum,$genome_size);
}

sub thousands($){
	my $val = shift;
	return int(0) if !$val;
	$val = sprintf("%.0f", $val);
	1 while $val =~ s/(.*\d)(\d\d\d)/$1,$2/;
	return $val;
}