biyelunwen/99.scripts/trinity_utils/util/misc/cdhit_examine_isoforms.pl

69 lines
1.3 KiB
Perl

#!/usr/bin/env perl
use strict;
use warnings;
# Usage text printed (via die) when no cluster file is given on the command line.
my $usage = "\n\tusage: $0 cd-hit.clstr\n\n" .
    "try running cd-hit first like so:\n" .
    "\tcd-hit-est -o cdhit -c 0.98 -i Trinity.fasta -p 1 -d 0 -b 3 -T 10\n\n";

# First command-line argument: path of the cd-hit .clstr file to examine.
# Presence is tested with defined/length rather than plain truthiness so that
# a file literally named "0" is not mistaken for a missing argument.
my $cdhit_file = $ARGV[0];
die $usage unless defined $cdhit_file && length $cdhit_file;
# Script entry point.
#
# Reads the file named by $cdhit_file line by line. A line starting with '>'
# opens a new cluster; every other non-blank line is collected as a member of
# the most recent cluster. Each completed cluster is handed to
# examine_cluster(), whose return values are summed into the bad-cluster count.
#
# Prints the count to STDOUT and exits with it as the status (capped at 255),
# so the exit status is 0 only when no bad cluster was found.
main: {
    my $num_bad_clusters = 0;
    my $cluster;    # header line of the cluster currently being collected
    my @trans;      # member lines collected for that cluster

    # Three-argument open: the filename is never interpreted as a mode or pipe.
    open(my $fh, '<', $cdhit_file)
        or die "Error, cannot open file $cdhit_file: $!";

    while (my $line = <$fh>) {
        chomp $line;

        # Blank lines carry no member record; skip them.
        next unless $line =~ /\S/;

        if ($line =~ /^>/) {
            # A new header closes the previous cluster, if it had members.
            if (@trans) {
                $num_bad_clusters += examine_cluster($cluster, \@trans);
            }
            $cluster = $line;
            @trans   = ();
        }
        else {
            push(@trans, $line);
        }
    }
    close $fh;

    # The last cluster in the file is not followed by another header.
    if (@trans) {
        $num_bad_clusters += examine_cluster($cluster, \@trans);
    }

    print "Num bad clusters: $num_bad_clusters\n";

    # Only the low 8 bits of an exit status reach the parent process, so an
    # uncapped count of exactly 256 would be reported as success (0).
    exit($num_bad_clusters > 255 ? 255 : $num_bad_clusters);
}
####
# examine_cluster($cluster, $trans_aref)
#
# Checks whether all member lines of one cluster carry the same
# TRINITY_DN<number> identifier.
#
#   $cluster    - the cluster header line; used only in the error report
#   $trans_aref - array reference holding the cluster's member lines
#
# Returns 1 and prints a report to STDERR when more than one distinct
# identifier is found among the members; returns 0 otherwise.
#
# A member line with no TRINITY_DN<number>_ token is reported with a warning
# on STDERR and left out of the comparison.
sub examine_cluster {
    my ($cluster, $trans_aref) = @_;

    my %cluster_ids;
    foreach my $tran (@$trans_aref) {
        # Only use $1 after a successful match: a failed match does not reset
        # it, so it would be undef or keep whatever it held before.
        if ($tran =~ /TRINITY_(DN\d+)_/) {
            $cluster_ids{$1}++;
        }
        else {
            print STDERR "WARNING, no TRINITY_DN identifier found in line: $tran\n";
        }
    }

    my $num_clusters = scalar(keys %cluster_ids);
    if ($num_clusters > 1) {
        print STDERR "ERROR, got multiple clusters represented:\n"
            . "$cluster\n" . join("\n", @$trans_aref) . "\n\n";
        return 1;
    }

    return 0;
}