187 lines
4.6 KiB
Perl
187 lines
4.6 KiB
Perl
#!/usr/bin/env perl
|
|
|
|
package GFF3_alignment_utils;
|
|
|
|
use strict;
|
|
use warnings;
|
|
use Carp;
|
|
|
|
use Gene_obj;
|
|
use Gene_obj_indexer;
|
|
use CDNA::Alignment_segment;
|
|
use CDNA::CDNA_alignment;
|
|
use File::Basename;
|
|
|
|
__run_test() unless caller;
|
|
|
|
sub index_alignment_objs {
|
|
my ($gff3_alignment_file, $genome_alignment_indexer_href) = @_;
|
|
|
|
unless ($gff3_alignment_file && -s $gff3_alignment_file) {
|
|
confess "Error, cannot find or open file $gff3_alignment_file";
|
|
}
|
|
unless (ref $genome_alignment_indexer_href) {
|
|
confess "Error, need genome indexer href as param ";
|
|
}
|
|
|
|
|
|
my %genome_trans_to_alignment_segments;
|
|
my %trans_to_gene_id;
|
|
|
|
|
|
open (my $fh, $gff3_alignment_file) or die "Error, cannot open file $gff3_alignment_file";
|
|
while (<$fh>) {
|
|
chomp;
|
|
|
|
unless (/\w/) { next; }
|
|
|
|
my @x = split(/\t/);
|
|
|
|
unless (scalar (@x) >= 8 && $x[8] =~ /ID=/) {
|
|
print STDERR "ignoring line: $_\n";
|
|
next;
|
|
}
|
|
|
|
my $scaff = $x[0];
|
|
my $type = $x[2];
|
|
my $lend = $x[3];
|
|
my $rend = $x[4];
|
|
my $per_id = $x[5];
|
|
if ($per_id eq ".") { $per_id = 100; } # making an assumption here.
|
|
|
|
my $orient = $x[6];
|
|
|
|
my $info = $x[8];
|
|
|
|
my @parts = split(/;/, $info);
|
|
my %atts;
|
|
foreach my $part (@parts) {
|
|
$part =~ s/^\s+|\s+$//;
|
|
$part =~ s/\"//g;
|
|
my ($att, $val) = split(/=/, $part);
|
|
|
|
if (exists $atts{$att}) {
|
|
die "Error, already defined attribute $att in $_";
|
|
}
|
|
|
|
$atts{$att} = $val;
|
|
}
|
|
|
|
my $gene_id = $atts{ID} or die "Error, no gene_id at $_";
|
|
my $trans_id = $atts{Target} or die "Error, no trans_id at $_";
|
|
{
|
|
my @pieces = split(/\s+/, $trans_id);
|
|
$trans_id = shift @pieces;
|
|
}
|
|
|
|
my ($end5, $end3) = ($orient eq '+') ? ($lend, $rend) : ($rend, $lend);
|
|
|
|
$info =~ /Target=\S+ (\d+) (\d+)/ or die "Error, cannot extract match coordinates from info: $info";
|
|
my $cdna_seg_lend = $1;
|
|
my $cdna_seg_rend = $2;
|
|
|
|
($cdna_seg_lend, $cdna_seg_rend) = sort {$a<=>$b} ($cdna_seg_lend, $cdna_seg_rend); # always + orient for transcript coords.
|
|
|
|
|
|
my $alignment_segment = new CDNA::Alignment_segment($end5, $end3, $cdna_seg_lend, $cdna_seg_rend, $per_id);
|
|
|
|
|
|
push (@{$genome_trans_to_alignment_segments{$scaff}->{$trans_id}}, $alignment_segment);
|
|
|
|
$trans_to_gene_id{$trans_id} = $gene_id;
|
|
|
|
}
|
|
|
|
|
|
my %scaff_to_align_list;
|
|
|
|
|
|
## Output genes in gff3 format:
|
|
|
|
foreach my $scaff (sort keys %genome_trans_to_alignment_segments) {
|
|
|
|
my @alignment_accs = keys %{$genome_trans_to_alignment_segments{$scaff}};
|
|
|
|
foreach my $alignment_acc (@alignment_accs) {
|
|
|
|
my $segments_aref = $genome_trans_to_alignment_segments{$scaff}->{$alignment_acc};
|
|
|
|
## determine cdna length
|
|
my @cdna_coords;
|
|
foreach my $segment (@$segments_aref) {
|
|
push (@cdna_coords, $segment->get_mcoords());
|
|
}
|
|
@cdna_coords = sort {$a<=>$b} @cdna_coords;
|
|
my $max_coord = pop @cdna_coords;
|
|
|
|
my $cdna_alignment_obj = new CDNA::CDNA_alignment($max_coord, $segments_aref);
|
|
$cdna_alignment_obj->set_acc($alignment_acc);
|
|
$cdna_alignment_obj->{genome_acc} = $scaff;
|
|
|
|
my $gene_id = $trans_to_gene_id{$alignment_acc} or confess "Error no gene_id for acc: $alignment_acc";
|
|
|
|
$cdna_alignment_obj->{gene_id} = $gene_id;
|
|
|
|
|
|
$cdna_alignment_obj->{source} = basename($gff3_alignment_file);
|
|
|
|
if (ref $genome_alignment_indexer_href eq "Gene_obj_indexer") {
|
|
$genome_alignment_indexer_href->store_gene($alignment_acc, $cdna_alignment_obj);
|
|
|
|
}
|
|
else {
|
|
|
|
$genome_alignment_indexer_href->{$alignment_acc} = $cdna_alignment_obj;
|
|
}
|
|
push (@{$scaff_to_align_list{$scaff}}, $alignment_acc);
|
|
}
|
|
}
|
|
|
|
return(%scaff_to_align_list);
|
|
}
|
|
|
|
|
|
|
|
#################
|
|
## Testing
|
|
#################
|
|
|
|
|
|
sub __run_test {
|
|
|
|
my $usage = "usage: $0 file.alignment.gff3\n\n";
|
|
|
|
my $gff3_file = $ARGV[0] or die $usage;
|
|
|
|
my $indexer = {};
|
|
my %scaff_to_alignments = &index_alignment_objs($gff3_file, $indexer);
|
|
|
|
|
|
foreach my $scaffold (keys %scaff_to_alignments) {
|
|
|
|
my @align_ids = @{$scaff_to_alignments{$scaffold}};
|
|
|
|
foreach my $align_id (@align_ids) {
|
|
my $cdna_obj = $indexer->{$align_id};
|
|
|
|
print $cdna_obj->toString();
|
|
}
|
|
}
|
|
|
|
|
|
|
|
exit(0);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1; #EOM
|
|
|