201 lines
4.6 KiB
Perl
201 lines
4.6 KiB
Perl
#!/usr/bin/env perl
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
use FindBin;
|
|
use lib ("$FindBin::RealBin/../../PerlLib");
|
|
use Fasta_reader;
|
|
use File::Basename;
|
|
|
|
|
|
my $usage = "usage: $0 isoform.EXPR.matrix Trinity.fasta [by=transcript|gene (default:transcript)]\n\n"
|
|
. "\t note, use the isoform.EXPR.matrix file regardiess of wehther you choose transcript | gene feature type to explore.\n\n";
|
|
|
|
|
|
my $matrix_file = $ARGV[0] or die $usage;
|
|
my $fasta_file = $ARGV[1] or die $usage;
|
|
my $by_feature_type = $ARGV[2] || "transcript";
|
|
|
|
|
|
unless (-s $matrix_file) {
|
|
die "Error, cannot locate matrix file: $matrix_file";
|
|
}
|
|
unless (-s $fasta_file) {
|
|
die "Error, cannot locate fasta file: $fasta_file";
|
|
}
|
|
|
|
unless ($by_feature_type =~ /^(transcript|gene)$/) {
|
|
die "Error, cannot discern feature type: [$by_feature_type] ";
|
|
}
|
|
|
|
my %trans_lengths;
|
|
{
|
|
my $fasta_reader = new Fasta_reader($fasta_file);
|
|
while (my $seq_obj = $fasta_reader->next()) {
|
|
|
|
my $acc = $seq_obj->get_accession();
|
|
my $sequence = $seq_obj->get_sequence();
|
|
|
|
my $seq_len = length($sequence);
|
|
|
|
$trans_lengths{$acc} = $seq_len;
|
|
}
|
|
}
|
|
|
|
open (my $fh, $matrix_file) or die $!;
|
|
my $header = <$fh>;
|
|
|
|
my %gene_to_trans;
|
|
|
|
my $sum_expr = 0;
|
|
|
|
|
|
my $feature_type = "transcript";
|
|
|
|
while (<$fh>) {
|
|
chomp;
|
|
my @x = split(/\t/);
|
|
my $acc = shift @x; # gene accession
|
|
my $max_expr = 0;
|
|
my $trans_sum_expr = 0;
|
|
while (@x) {
|
|
my $expr = shift @x;
|
|
|
|
$trans_sum_expr += $expr;
|
|
$sum_expr += $expr;
|
|
|
|
if ($expr > $max_expr) {
|
|
$max_expr = $expr;
|
|
}
|
|
}
|
|
|
|
my $seq_len = $trans_lengths{$acc} or die "Error, no seq length for acc: $acc. Be sure to give the isoform.TPM expression matrix as input parameter";
|
|
|
|
my $gene_id = $acc;
|
|
|
|
|
|
if ($by_feature_type =~ /gene/) {
|
|
if ($acc =~ /^(\S+)_i\d+/) {
|
|
$gene_id = $1;
|
|
$feature_type = "gene";
|
|
}
|
|
else {
|
|
die "Error, by_feature_type is gene, but cannot extract gene_id from $acc ";
|
|
}
|
|
}
|
|
|
|
push (@{$gene_to_trans{$gene_id}}, { acc => $acc,
|
|
len => $seq_len,
|
|
sum_expr => $trans_sum_expr,
|
|
max_expr => $max_expr,
|
|
});
|
|
|
|
}
|
|
|
|
my @genes;
|
|
|
|
## make expression weighted gene length
|
|
foreach my $gene (keys %gene_to_trans) {
|
|
my @trans_structs = @{$gene_to_trans{$gene}};
|
|
|
|
my $sum_expr = 0;
|
|
my $sum_expr_n_len = 0;
|
|
my $max_expr = 0;
|
|
foreach my $trans_struct (@trans_structs) {
|
|
my $len = $trans_struct->{len};
|
|
my $expr = $trans_struct->{sum_expr} || 1;
|
|
$sum_expr_n_len += $len * $expr;
|
|
$sum_expr += $expr;
|
|
|
|
my $m_expr = $trans_struct->{max_expr};
|
|
if ($m_expr > $max_expr) {
|
|
$max_expr = $m_expr;
|
|
}
|
|
|
|
}
|
|
|
|
my $gene_len = $sum_expr_n_len / $sum_expr;
|
|
|
|
push (@genes, { acc => $gene,
|
|
sum_expr => $sum_expr,
|
|
len => $gene_len,
|
|
max_expr => $max_expr } );
|
|
}
|
|
|
|
|
|
@genes = reverse sort { $a->{sum_expr} <=> $b->{sum_expr}
|
|
||
|
|
$a->{len} <=> $b->{len} } @genes;
|
|
|
|
|
|
|
|
## write output table
|
|
|
|
|
|
my $E_file = basename($matrix_file) . ".by-$feature_type.E-inputs";
|
|
open (my $ofh, ">$E_file") or die $!;
|
|
print $ofh join("\t", "#Ex", "acc", "length", "max_expr_over_samples", "sum_expr_over_samples") . "\n";
|
|
|
|
print "Ex\tExN50\tnum_${feature_type}s\n";
|
|
|
|
my $prev_pct = 0;
|
|
my $sum = 0;
|
|
my @captured;
|
|
while (@genes) {
|
|
|
|
my $t = shift @genes;
|
|
|
|
$sum += $t->{sum_expr};
|
|
|
|
my $pct = int($sum/$sum_expr * 100);
|
|
|
|
print $ofh join("\t", $pct, $t->{acc}, int($t->{len}), sprintf("%.3f", $t->{max_expr}), sprintf("%.3f", $t->{sum_expr})) . "\n";
|
|
|
|
if ($prev_pct > 0 && $pct > $prev_pct) {
|
|
|
|
|
|
my $N50 = int(&calc_N50(@captured));
|
|
my $num_trans = scalar(@captured);
|
|
|
|
print "$prev_pct\t$N50\t$num_trans\n";
|
|
}
|
|
|
|
$prev_pct = $pct;
|
|
|
|
push (@captured, $t);
|
|
}
|
|
|
|
# do last one
|
|
|
|
my $N50 = &calc_N50(@captured);
|
|
my $num_genes = scalar(@captured);
|
|
print "100\t$N50\t$num_genes\n";
|
|
|
|
|
|
exit(0);
|
|
|
|
|
|
####
|
|
sub calc_N50 {
|
|
my @entries = @_;
|
|
|
|
@entries = reverse sort {$a->{len}<=>$b->{len}} @entries;
|
|
|
|
my $sum_len = 0;
|
|
foreach my $entry (@entries) {
|
|
$sum_len += $entry->{len};
|
|
}
|
|
|
|
my $loc_sum = 0;
|
|
foreach my $entry (@entries) {
|
|
$loc_sum += $entry->{len};
|
|
if ($loc_sum / $sum_len * 100 >= 50) {
|
|
return($entry->{len});
|
|
}
|
|
}
|
|
|
|
return(-1); # error
|
|
}
|
|
|