263 lines
8.5 KiB
Perl
263 lines
8.5 KiB
Perl
#!/usr/local/bin/perl
|
|
|
|
package main;
|
|
our $SEE;
|
|
|
|
|
|
package GFF3_utils;
|
|
|
|
use strict;
|
|
use warnings;
|
|
use Gene_obj;
|
|
use Gene_obj_indexer;
|
|
use Carp;
|
|
use URI::Escape;
|
|
use Data::Dumper;
|
|
|
|
|
|
####
|
|
sub index_GFF3_gene_objs {
|
|
|
|
my ($gff_filename, $gene_obj_indexer, $contig_id) = @_;
|
|
# contig_id is optional.
|
|
|
|
|
|
my $hash_mode = 0;
|
|
if (ref $gene_obj_indexer eq 'HASH') {
|
|
$hash_mode = 1;
|
|
}
|
|
|
|
## note can use either a gene_obj_indexer or a hash reference.
|
|
|
|
my %gene_coords;
|
|
my %asmbl_id_to_gene_id_list;
|
|
my %transcript_to_gene;
|
|
my %cds_phases;
|
|
|
|
my %gene_names;
|
|
my %loci;
|
|
|
|
open (my $fh, $gff_filename) or die $!;
|
|
|
|
my %gene_id_to_source_type;
|
|
|
|
my %source_tracker;
|
|
|
|
my $counter = 0;
|
|
# print STDERR "\n-parsing file $gff_filename\n";
|
|
while (<$fh>) {
|
|
|
|
chomp;
|
|
|
|
unless (/\w/) { next;} # empty line
|
|
|
|
if (/^\#/) { next; } # comment entry in gff3
|
|
|
|
my @x = split (/\t/);
|
|
|
|
unless (scalar @x >= 9) {
|
|
print STDERR "-ignoring line $_\n";
|
|
next;
|
|
}
|
|
|
|
my ($asmbl_id, $source, $feat_type, $lend, $rend, $orient, $cds_phase, $gene_info) = ($x[0], $x[1], $x[2], $x[3], $x[4], $x[6], $x[7], $x[8]);
|
|
|
|
if ($contig_id && $asmbl_id ne $contig_id) { next; }
|
|
|
|
unless ($feat_type) { die "Error, $_, no feat_type: line\[$_\]"; }
|
|
|
|
unless ($feat_type =~ /^(gene|mRNA|CDS|exon)$/) { next;}
|
|
|
|
$gene_info = uri_unescape($gene_info);
|
|
|
|
$gene_info =~ /ID=\"?([^;\s\"]+)\"?;?/;
|
|
my $id = $1 or die "Error, couldn't get the id field $_";
|
|
|
|
if (exists $source_tracker{$id} && $source_tracker{$id} ne $source) {
|
|
confess "Error, gene ID $id is given source $source when previously encountered with source $source_tracker{$id} ";
|
|
}
|
|
|
|
if ($feat_type eq 'gene') {
|
|
my $gene_name = "";
|
|
if ($gene_info =~ /Name=\"?([^\;\"]+)\"?/) {
|
|
$gene_name = $1;
|
|
}
|
|
else {
|
|
$gene_name = "";
|
|
}
|
|
|
|
if ($gene_info =~ /Note=\"?([^\;\"]+)\"?/) {
|
|
$gene_name .= " $1";
|
|
}
|
|
|
|
$gene_names{$id} = $gene_name;
|
|
|
|
}
|
|
|
|
if ($gene_info =~ /Alias=([^;]+)/) {
|
|
my $locus = $1;
|
|
$loci{$id} = $locus;
|
|
}
|
|
|
|
|
|
if ($feat_type eq 'gene') { next;} ## beyond this pt, gene is not needed.
|
|
|
|
$gene_info =~ /Parent=\"?([^;\s\"]+)\"?;?/;
|
|
my $parent = $1 or die "Error, couldn't get the parent info $_";
|
|
|
|
# print "id: $id, parent: $parent\n";
|
|
|
|
if ($feat_type eq 'mRNA') {
|
|
## just get the identifier info
|
|
$transcript_to_gene{$id} = $parent;
|
|
next;
|
|
}
|
|
|
|
my $transcript_id_listing = $parent;
|
|
|
|
my @transcript_ids = split(/,/, $transcript_id_listing);
|
|
foreach my $transcript_id (@transcript_ids) {
|
|
|
|
my $gene_id = $transcript_to_gene{$transcript_id};
|
|
unless (defined $gene_id) {
|
|
print STDERR "Error, no gene feature found for $transcript_id.... ignoring feature.\n";
|
|
next;
|
|
}
|
|
|
|
|
|
$gene_id_to_source_type{$gene_id} = $source;
|
|
|
|
my ($end5, $end3) = ($orient eq '+') ? ($lend, $rend) : ($rend, $lend);
|
|
|
|
$gene_coords{$asmbl_id}->{$gene_id}->{$transcript_id}->{$feat_type}->{$end5} = $end3;
|
|
# print "$asmbl_id, $gene_id, $transcript_id, $feat_type, $end5, $end3\n";
|
|
|
|
if ($cds_phase =~ /^\d+$/) {
|
|
$cds_phases{$gene_id}->{$transcript_id}->{$end5} = int($cds_phase);
|
|
}
|
|
}
|
|
|
|
}
|
|
close $fh;
|
|
|
|
##
|
|
# print STDERR "\n-caching genes.\n";
|
|
foreach my $asmbl_id (sort keys %gene_coords) {
|
|
my $genes_href = $gene_coords{$asmbl_id};
|
|
|
|
foreach my $gene_id (keys %$genes_href) {
|
|
print STDERR "\r-indexing [$gene_id] " if $SEE;
|
|
my $transcripts_href = $genes_href->{$gene_id};
|
|
|
|
my @gene_objs;
|
|
|
|
foreach my $transcript_id (keys %$transcripts_href) {
|
|
|
|
my $cds_coords_href = $transcripts_href->{$transcript_id}->{CDS} || {}; # could be a noncoding transcript w/ no CDS
|
|
my $exon_coords_href = $transcripts_href->{$transcript_id}->{exon};
|
|
|
|
unless (ref $exon_coords_href) {
|
|
print STDERR Dumper ($transcripts_href);
|
|
die "Error, missing exon coords for $transcript_id, $gene_id\n";
|
|
}
|
|
|
|
my $gene_obj = new Gene_obj();
|
|
|
|
|
|
if (scalar (keys %$cds_coords_href) == 1) {
|
|
|
|
## could be that only the cds span was provided.
|
|
## break it up across the exon segments
|
|
|
|
my ($cds_lend, $cds_rend) = sort {$a<=>$b} %$cds_coords_href;
|
|
my @exon_coords;
|
|
my $orient;
|
|
foreach my $exon_end5 (keys %$exon_coords_href) {
|
|
my $exon_end3 = $exon_coords_href->{$exon_end5};
|
|
push (@exon_coords, [$exon_end5, $exon_end3]);
|
|
if ($exon_end5 < $exon_end3) {
|
|
$orient = '+';
|
|
}
|
|
elsif ($exon_end5 > $exon_end3) {
|
|
$orient = '-';
|
|
}
|
|
}
|
|
|
|
$gene_obj->build_gene_obj_exons_n_cds_range(\@exon_coords, $cds_lend, $cds_rend, $orient);
|
|
}
|
|
else {
|
|
|
|
## cds and exons specified separately
|
|
|
|
$gene_obj->populate_gene_obj($cds_coords_href, $exon_coords_href);
|
|
}
|
|
|
|
$gene_obj->{Model_feat_name} = $transcript_id;
|
|
$gene_obj->{TU_feat_name} = $gene_id;
|
|
$gene_obj->{asmbl_id} = $asmbl_id;
|
|
|
|
if (my $gene_locus = $loci{$gene_id}) {
|
|
$gene_obj->{pub_locus} = $gene_locus;
|
|
}
|
|
if (my $transcript_locus = $loci{$transcript_id}) {
|
|
$gene_obj->{model_pub_locus} = $transcript_locus;
|
|
}
|
|
|
|
|
|
$gene_obj->{com_name} = $gene_names{$gene_id} || $transcript_id;
|
|
|
|
$gene_obj->{source} = $gene_id_to_source_type{$gene_id};
|
|
|
|
## set CDS phase info if available from the gff
|
|
my $cds_phases_href = $cds_phases{$gene_id}->{$transcript_id};
|
|
if (ref $cds_phases_href) {
|
|
## set the cds phases
|
|
my @exons = $gene_obj->get_exons();
|
|
foreach my $exon (@exons) {
|
|
if (my $cds = $exon->get_CDS_obj()) {
|
|
my ($end5, $end3) = $cds->get_coords();
|
|
my $phase = int($cds_phases_href->{$end5});
|
|
unless ($phase == 0 || $phase == 1 || $phase == 2) {
|
|
confess "Error, should have phase set for cds $gene_id $transcript_id $end5, but I do not know what $phase is. ";
|
|
}
|
|
$cds->set_phase($phase);
|
|
}
|
|
}
|
|
}
|
|
|
|
push (@gene_objs, $gene_obj);
|
|
}
|
|
|
|
## want single gene that includes all alt splice variants here
|
|
my $template_gene_obj = shift @gene_objs;
|
|
foreach my $other_gene_obj (@gene_objs) {
|
|
$template_gene_obj->add_isoform($other_gene_obj);
|
|
}
|
|
|
|
$template_gene_obj->refine_gene_object();
|
|
|
|
if ($hash_mode) {
|
|
$gene_obj_indexer->{$gene_id} = $template_gene_obj;
|
|
}
|
|
else {
|
|
$gene_obj_indexer->store_gene($gene_id, $template_gene_obj);
|
|
}
|
|
|
|
print "GFF3_utils: stored $gene_id\n" if $SEE;
|
|
|
|
# add to gene list for asmbl_id
|
|
my $gene_list_aref = $asmbl_id_to_gene_id_list{$asmbl_id};
|
|
unless (ref $gene_list_aref) {
|
|
$gene_list_aref = $asmbl_id_to_gene_id_list{$asmbl_id} = [];
|
|
}
|
|
push (@$gene_list_aref, $gene_id);
|
|
}
|
|
}
|
|
print STDERR "\n";
|
|
return (\%asmbl_id_to_gene_id_list);
|
|
}
|
|
|
|
|
|
|
|
1; #EOM
|