biyelunwen/99.scripts/trinity_utils/PerlLib/GTF.pm

4281 lines
125 KiB
Perl

# -*- Mode: Perl; tab-width: 8; perl-indent-level: 2; indent-tabs-mode: nil -*-
use strict;
###############################################################################
# GTF
###############################################################################
# The GTF object parses and stores all data from a gtf file.
package GTF;
use Carp;
# GTF::new(hash)
# This is the contructor for GTF objects. It takes one required argument
# which is a hash containing the following optional fields:
# gtf_filename: The filename of the gtf file to be loaded. If no filename
# is given it just creates an empty object.
# seq_filename: The filename of the sequence corresponding to this gtf file.
# If given additional validity checks are made, additional stats
# are kept and the transcript can be output.
# tx_out_fh: The filehandle to output the genes transcripts to. Only works
# If seq_filename is given.
# conseq_filename: The filename of the conservation sequence corresponding to
# this gtf file. If given additional stats are kept.
# warning_fh: The filehandle to output gtf format warnings to. If none
# is given warnings are disregarded.
sub new {
my ($info) = @_;
my $gtf = bless {Genes => [], Transcripts => [], Exons => [], CDS => [],
Inter => [], Inter_CNS => [], Filename => "", Sequence => "",
Comments => "", Modified => 0};
$gtf->{Total_Conseq} = {0 => -1,
1 => -1,
2 => -1};
$gtf->{Total_Seq} = {A => -1,
C => -1,
G => -1,
T => -1,
N => -1};
if(defined($info->{seq_filename})){
if(-e $info->{seq_filename}){
$gtf->{Sequence} = $info->{seq_filename};
}
else{
$gtf->{Sequence} = 0;
print STDERR "GTF: $info->{seq_filename} does not exist.\n";
}
}
else{
$gtf->{Sequence} = 0;
}
if(defined($info->{strict})){
$gtf->{Strict} = $info->{strict};
}
else{
$gtf->{Strict} = 0;
}
if(defined($info->{tx_out_fh})){
if(defined($gtf->{Sequence})){
$gtf->{Tx} = $info->{tx_out_fh};
}
else{
$gtf->{Tx} = 0;
print STDERR "GTF: Cannot ouput transcripts without the sequence.\n";
}
}
else{
$gtf->{Tx} = 0;
}
if(defined($info->{conseq_filename})){
if(-e $info->{conseq_filename}){
$gtf->{Conseq} = $info->{conseq_filename};
}
else{
$info->{Conseq} = 0;
print STDERR "GTF: $info->{conseq_filename} does not exist.\n";
}
}
else{
$info->{Conseq} = 0;
}
if((defined($info->{inframe_stops})) && ($info->{inframe_stops})){
$gtf->{Inframe_Stops} = 1;
}
else{
$gtf->{Inframe_Stops} = 0;
}
if((defined($info->{fix_gtf})) &&
($info->{fix_gtf} == 1)){
$gtf->{Fix_GTF} = 1;
}
else{
$gtf->{Fix_GTF} = 0;
}
if(defined($info->{warning_skips})){
$gtf->{Warning_Skips} = $info->{warning_skips};
}
else{
$gtf->{Warning_Skips} = [];
}
if(defined($info->{bad_list})){
$gtf->{Bad_List} = $info->{bad_list};
}
else{
$gtf->{Bad_List} = [];
}
if(defined($info->{detailed_error_count})){
$gtf->{Detailed_Error_Count} = $info->{detailed_error_count};
}
if(defined($info->{mark_ase})) {
$gtf->{Mark_ASE} = $info->{mark_ase};
}
else {
$gtf->{Mark_ASE} = 0;
}
if(defined($info->{gtf_filename})){
if(-e $info->{gtf_filename}){
$gtf->{Filename} = $info->{gtf_filename};
if((defined($info->{no_check})) && ($info->{no_check})){
$gtf->_load_no_check($info->{gtf_filename});
}
else{
$gtf->_parse_file($info->{warning_fh});
}
if($gtf->{Mark_ASE}) {
$gtf->mark_ase;
}
}
else{
print STDERR "GTF: $info->{gtf_filename} does not exist.\n";
}
}
else{
$gtf->{Filename} = "";
}
return $gtf;
}
# GTF::_update
# This function resorts all lists. It is called when a list is requested.
# It does nothing unless the $gtf->{Modified} value is 1. Sets
# $gene->{Modified} to 0;
sub _update{
my ($gtf) = @_;
unless($gtf->{Modified}){
return;
}
$gtf->{Modified} = 0;
my $genes = $gtf->{Genes};
my $txs = $gtf->{Transcripts};
my $cds = $gtf->{CDS};
my $inter = $gtf->{Inter};
$genes = [sort {$a->start <=> $b->start} @$genes];
$txs = [sort {$a->start <=> $b->start} @$txs];
$cds = [sort {$a->start <=> $b->start} @$cds];
$inter = [sort {$a->start <=> $b->start} @$inter];
$gtf->{Genes} = $genes;
$gtf->{Transcripts} = $txs;
$gtf->{CDS} = $cds;
$gtf->{Inter} = $inter;
}
# GTF::genes()
# This function returns an array of refernces to GTF::Gene objects.
# The array contains one reference for each gene in the gtf file.
# The array is sorted by the start position of each gene.
sub genes{
my ($gtf) = @_;
$gtf->_update;
return $gtf->{Genes};
}
sub transcripts{
my ($gtf) = @_;
$gtf->_update;
return $gtf->{Transcripts};
}
# GTF::add_gene(gene_ref)
# This function take a reference to a GTF::Gene object and adds it to the
# list of genes stored by this object.
sub add_gene{
my ($gtf,$gene) = @_;
my $genes = $gtf->{Genes};
my $txs = $gtf->{Transcripts};
my $cds = $gtf->{CDS};
push @$genes, $gene;
push @$txs, @{$gene->transcripts};
push @$cds, @{$gene->cds};
$gtf->{Modified} = 1;
}
# GTF::add_feature(feature_ref,seqname,source,strand,id)
# This function will take a reference to an intergenic feature (such as
# inter or inter_CNS) and add it to the appropriate feature list.
# Pass a sequence name, the strand ("+" is OK for intergenic)
# and an ID to identify the feature.
# Note that the feature must be of the types inter or inter_CNS.
sub add_feature{
my ($gtf,$feature,$seqname,$source,$strand,$id) = @_;
die "Only inter or inter_CNS features may be directly added to a GTF.\n"
if($feature->type ne "inter" && $feature->type ne "inter_CNS");
my $gene = GTF::Gene::new($id,$seqname,$source,$strand);
my $tx = GTF::Transcript::new($id);
$gene->add_transcript($tx);
$tx->add_feature($feature);
my $features;
if($feature->type eq "inter") { $features = $gtf->{Inter} }
elsif($feature->type eq "inter_CNS") { $features = $gtf->{Inter_CNS} }
push @$features, $feature;
$gtf->{Modified} = 1;
}
# GTF::set_genes(gene_list)
# This function take a reference to a list of GTF::Gene objects and adds each to the
# list of genes stored by this object.
sub set_genes{
my ($gtf,$genes) = @_;
$gtf->{Genes} = $genes;
my @txs;
my @cds;
foreach my $gene (@$genes){
push @txs, @{$gene->transcripts};
push @cds, @{$gene->cds};
}
$gtf->{Transcripts} = \@txs;
$gtf->{CDS} = \@cds;
$gtf->{Modified} = 1;
}
# GTF::remove_gene(gene_id)
# This function takes a gene_id and removes the gene with that gene_id
# from the list of genes stored by this object, if a gene with that gene_id
# can be found. Otherwise does nothing. Returns true if a gene was removed
# and false if no gene with that gene_id was found.
sub remove_gene{
my ($gtf,$gene_id) = @_;
my $genes = $gtf->{Genes};
my $removed = 0;
my @new_genes;
my @new_txs;
my @new_cds;
foreach my $gene (@$genes){
if($gene->gene_id eq $gene_id){
$removed++;
}
else{
push @new_genes, $gene;
push @new_txs, @{$gene->transcripts};
push @new_cds, @{$gene->cds};
}
}
if($removed){
$gtf->{Genes} = \@new_genes;
$gtf->{Transcripts} = \@new_txs;
$gtf->{CDS} = \@new_cds;
}
return $removed;
}
# GTF::remove_feature(feature)
# This function will remove a feature from the list of features.
# Note the feature must be of type inter or inter_CNS.
sub remove_feature {
my($gtf,$feature) = @_;
my $removed = 0;
my $feature_key;
if($feature->type eq "inter") {
$feature_key = "Inter";
}
elsif($feature->type eq "inter_CNS") {
$feature_key = "Inter_CNS";
}
my @new_features;
foreach my $stored_feature (@{$gtf->{$feature_key}}) {
if($stored_feature == $feature) {
$removed++;
}
else {
push @new_features, $stored_feature;
}
}
if($removed) {
$gtf->{$feature_key} = \@new_features;
}
return $removed;
}
# GTF::set_filename(filename)
# Sets the filename to be returned by the filename function
sub set_filename{
my ($gtf,$filename) = @_;
$gtf->{Filename} = $filename;
}
# GTF::offset(ammount)
# This function will offset all genes in this gene by the given ammount
sub offset{
my ($gtf,$offset) = @_;
unless(defined($offset)){
print STDERR "Undefined value passed to GTF::offset.\n";
return;
}
my $genes = $gtf->{Genes};
foreach my $gene (@$genes){
$gene->offset($offset);
}
$gtf->{Modified} = 1;
}
# GTF::reverse_complement(seq_length)
# This function takes the length of the sequence this gtf file was based on and
# and reverse complements everything in the file. So the positive strand becomes
# the negative strand and the files starts at seq_length and moves downward to 0.
sub reverse_complement{
my ($gtf, $seq_length) = @_;
unless(defined($seq_length)){
print STDERR
"Undefined value for seq_length passed to GTF::reverse_complement.\n";
}
my $genes = $gtf->{Genes};
foreach my $gene(@$genes){
$gene->reverse_complement($seq_length);
}
$gtf->{Modified} = 1;
}
# GTF::cds()
# This function returns an array of refernces to GTF::Feature objects.
# The array contains one refernece to each CDS feature in the gtf file.
# The array is sorted by the <start> of each CDS.
sub cds{
my ($gtf) = @_;
$gtf->_update;
return $gtf->{CDS};
}
sub inter {
my ($gtf) = @_;
$gtf->_update;
return $gtf->{Inter};
}
sub inter_cns {
my ($gtf) = @_;
$gtf->_update;
return $gtf->{Inter_CNS};
}
# GTF::mark_ase()
# If you are looking for alternative splicing events in your GTF file,
# call this function to marke them. This function is separate from the
# _update methods in order to keep the running time low.
sub mark_ase {
my ($gtf) = @_;
#set the alternative splicing event flags for each feature
#currently, only optional coding exons
# The main idea is to check all pairs of transcripts in a gene,
# and find any incident where there are two exons that have the same
# end coord and two exons which have the same start coord
# and that an exon in only one of the transcripts lies between them.
# It is accomplished by sorting all the exons in both transcripts
# and checking for a supercasette, i.e.
#
# ===|||||||||===||||||||======||||||||||====
# ==||||||||||=================||||||||||||==
# <--------------->
# supercassette
#
# Note that it doesn't matter if the opposite ends of the exon are equal.
# rpz
my @genes = @{$gtf->genes};
for my $gene(@{$gtf->genes}) {
my @txs = (@{$gene->transcripts});
if(scalar(@txs) > 1) {
for (my $i = 0; $i < scalar(@txs)-1; $i++) {
next if(scalar(@{$txs[$i]->cds}) == 0);
for (my $j = $i+1; $j < scalar(@txs); $j++) {
next if(scalar(@{$txs[$j]->cds}) == 0);
my @ocds = sort {$a->start <=> $b->start || $a->stop <=> $b->stop}
(@{$txs[$i]->cds}, @{$txs[$j]->cds});
for(my $k = 4; $k < scalar(@ocds); $k++) {
if( $ocds[$k-4]->stop == $ocds[$k-3]->stop &&
$ocds[$k-1]->start == $ocds[$k]->start) {
if($ocds[$k-2]->length%3 == 0) {
$ocds[$k-2]->set_ase("InframeOptional");
}
else {
$ocds[$k-2]->set_ase("FrameshiftOptional");
}
}
}
}
}
}
}
}
# GTF::infer_exons()
# Skips infering if Exons already exist
# Output is undefined if features overlap, except for start_codon
# Currently knows about utr5, start,stop,cds, utr3
sub infer_exons {
my ($gtf) = @_;
$gtf->_update;
for my $tx (@{$gtf->transcripts}) {
next if (@{$tx->exons});
if ($tx->strand eq '+') {
for ( @{$tx->utr5}, @{$tx->cds}, @{$tx->stop_codons}, @{$tx->utr3} ) {
if (!@{$tx->exons} || ${$tx->exons}[-1]->stop + 1 < $_->start ) {
$tx->add_feature( GTF::Feature::new('exon', $_->start, $_->stop, 0, 0) );
} elsif ( ${$tx->exons}[-1]->stop + 1 eq $_->start ) {
${$tx->exons}[-1]->set_stop($_->stop);
}
}
} else {
for ( @{$tx->utr3}, @{$tx->stop_codons}, @{$tx->cds}, @{$tx->utr5} ) {
if (!@{$tx->exons} || ${$tx->exons}[-1]->stop + 1 < $_->start ) {
$tx->add_feature(GTF::Feature::new('exon', $_->start, $_->stop, 0, 0));
} elsif ( ${$tx->exons}[-1]->stop + 1 eq $_->start ) {
${$tx->exons}[-1]->set_stop($_->stop);
}
}
}
}
}
# GTF::remove_exons()
# remove exons from all transcripts
sub remove_exons {
my ($gtf) = @_;
$gtf->_update;
$_->{Exons} = [] for (@{$gtf->transcripts});
}
# GTF::filename()
# This function returns the filename of the gtf file.
sub filename {shift->{Filename}}
# GTF::comments()
# This function returns all full line commentsfrom the gtf file.
sub comments {shift->{Comments}};
# GTF::conservation_count()
# Returns a hash containing counts for each conservation character (0,1,2)
sub conservation_count {shift->{Total_Conseq}};
# GTF::sequence_count()
# Returns a hash containing counts for each sequence character (A,C,T,G,N,X),
# Where X is all other characters
sub sequence_count {shift->{Total_Seq}};
# GTF::output_gtf_file([file_handle]);
# This function moves through each gene in the gtf file and outputs
# all data in valid gtf2 format. If takes and optional file_handle
# as the only argument to which it outputs the data. If no argument is
# given it outputs the data to stdout.
sub output_gtf_file{
my ($gtf,$out_handle) = @_;
$gtf->_update;
unless(defined $out_handle){
$out_handle = \*STDOUT;
}
if($gtf->comments){
foreach my $comment (@{$gtf->comments}){
print $out_handle "$comment\n";
}
}
for my $gene (@{$gtf->genes}) { $gene->_update }
my @top_level_features = sort {$a->start <=> $b->start}
(@{$gtf->genes}, @{$gtf->inter_cns}, @{$gtf->inter});
for my $feature (@top_level_features) {
$feature->output_gtf($out_handle);
}
}
# GTF::output_gff_file([file_handle]);
# This function moves through each gene in the gtf file and outputs
# all data in valid GFF format. If takes and optional file_handle
# as the only argument to which it outputs the data. If no argument is
# given it outputs the data to stdout.
sub output_gff_file{
my ($gtf,$out_handle) = @_;
$gtf->_update;
unless(defined $out_handle){
$out_handle = \*STDOUT;
}
if($gtf->genes){
foreach my $gene (@{$gtf->genes}){
$gene->output_gff($out_handle);
}
}
}
# GTF::_parse_file()
# This function is used internally by the GTF constructor GTF::new
# to read, parse, and store the GTF file;
sub _parse_file{
my ($gtf,$warnings) = @_;
unless(defined($warnings)){
if(open(WARN, ">/dev/null")){
$warnings = \*WARN;
}
else{
die "Could not open /dev/null for write\n";
}
}
my $fix_gtf = $gtf->{Fix_GTF};
my $strict = $gtf->{Strict};
#open file
open(GTF, "<$gtf->{Filename}")
or die "Could not open $gtf->{Filename}.\n";
my (%GeneObj,%TxObj);
my (@all_genes,@all_txs,@all_cds,@all_inter,@all_cns);
my $all_line_comments = [];
#prepare error counts
my $max_error_count = 5;
if(defined($gtf->{Detailed_Error_Count})){
$max_error_count = $gtf->{Detailed_Error_Count};
}
my @error_msgs = $gtf->_get_error_messages();
my @errors;
my @bad_list;
my $no_warn = $gtf->{Warning_Skips};
for(my $i = 0;$i <= $#error_msgs;$i++){
if($$no_warn[$i]){
$errors[$i] = $max_error_count;
}
else{
$errors[$i] = 0;
}
}
#read the file
my $line_num = 0;
while(my $in_line = <GTF>){
$line_num++;
chomp $in_line;
if($in_line !~ /\S/){
next;
}
if($in_line =~ /^\s*\#/){
push @$all_line_comments, $in_line;
next;
}
else{
#remove leading whitespace and get comments
my $comments = "";
while($in_line =~ /^(.*)\#(.*)$/){
$in_line = $1;
$comments = $2.$comments;
}
if($in_line =~ /^\s+(.*)$/){
$in_line = $1;
}
my @data = split /\s+/,$in_line;
#verify line is correct length
if($#data < 8){
if($errors[0] < $max_error_count){
print $warnings
"Not enough fields on line $line_num.\n";
}
$errors[0]++;
next;
}
#check for correct whitespace
my $field = 1;
while(($field < 8)&&($in_line =~ /\S+(\s+)/g)){
unless($1 =~ /^\t$/){
if($errors[1] < $max_error_count){
print $warnings
"Incorrect type of whitespace between fields ",
"$field and ",$field + 1, " on line $line_num. ",
"Should be tab.\n";
}
$errors[1]++;
}
$field++;
}
#verify correct <feature> field
$data[2] = lc($data[2]);
unless(($data[2] eq "cds") || ($data[2] eq "exon")
|| ($data[2] eq "5utr") || ($data[2] eq "3utr")
|| ($data[2] eq "start_codon")
|| ($data[2] eq "stop_codon")
|| ($data[2] eq "sec")
|| ($data[2] eq "intron_cns")
|| ($data[2] eq "inter_cns")
|| ($data[2] eq "inter")) {
if($errors[2] < $max_error_count){
print $warnings
"Incorrect value for <feature> field, \"$data[2]\", ",
"on line $line_num.\n";
}
$errors[2]++;
}
if (($data[2] eq "cds") || ($data[2] eq "5utr")
|| ($data[2] eq "3utr") || ($data[2] eq "sec")) {
$data[2] = uc($data[2]);
}
if ($data[2] eq "inter_cns") { $data[2] = "inter_CNS" }
if ($data[2] eq "intron_cns") { $data[2] = "intron_CNS" }
#verify correct <start> format
if($data[3] !~ /^\d*$/){
if($errors[3] < $max_error_count){
print $warnings
"Non-numerical value for <start> field, \"$data[3]\"",
", on line $line_num.\n";
}
$errors[3]++;
}
#verify correct <stop> format
if($data[4] !~ /^\d*$/){
if($errors[4] < $max_error_count){
print $warnings
"Non-numerical value for <stop> field, \"$data[4]\"",
", on line $line_num.\n";
}
$errors[4]++;
}
#verify start is < stop
if($data[3] > $data[4]){
if($errors[38] < $max_error_count){
print $warnings
"Start field is greater than stop field on line $line_num.\n";
}
$errors[38]++;
my $start = $data[3];
$data[3] = $data[4];
$data[4] = $start;
}
#verify correct <score> format
if($data[5] !~ /^-?\d*\.?\d*$/){ # should check for float
if($data[5] ne '.'){
if($errors[5] < $max_error_count){
print $warnings
"Value for <score> field, \"$data[5]\", is ",
"neither numerical nor \".\" on line $line_num\n";
}
$errors[5]++;
}
}
if(!$strict && $data[5] eq '.'){
$data[5] = 0;
}
#verify correct <strand> field
unless(($data[6] eq "+")||($data[6] eq "-")||
($data[6] eq ".")){
if($errors[6] < $max_error_count){
print $warnings
"Value of <strand> field is \"$data[6]\" on line ",
"$line_num. Should be \"+\", \"-\", or \".\".\n";
}
$errors[6]++;
if(!$strict){
if($data[6] eq "1"){
$data[6] = "+";
}
elsif($data[6] eq "-1"){
$data[6] = "-";
}
elsif($data[6] eq "0"){
$data[6] = ".";
}
}
}
#verify correct <frame> field
unless(($data[7] eq "0")||($data[7] eq "1")||
($data[7] eq "2")||($data[7] eq ".")){
if($errors[7] < $max_error_count){
print $warnings
"Value of <frame> field is \"$data[7]\" on line ",
"$line_num. Should be \"0\", \"1\", \"2\", or \".\"",
".\n";
}
$errors[7]++;
}
#prepare <attributes> and <comment> fields
my $attribute_string = "";
if($in_line =~ /^\S+\t\S+\t\S+\t\S+\t\S+\t\S+\t\S+\t\S+\t(.*)$/){
$attribute_string = "$1 ";
}
#check for equals
if($attribute_string =~ / = /){
if($errors[30] < $max_error_count){
print $warnings
"Illegal \'=\' character in <attributes> field on ",
"line $line_num\n";
}
$errors[30]++;
$attribute_string =~ s/=/ /g;
}
# check for tab character.
if($attribute_string =~ /\t/){
if($errors[9] < $max_error_count){
print $warnings
"Illegal tab character in <attributes> field on ",
"line $line_num.\n";
}
$errors[9]++;
$attribute_string =~ s/\t/ /g;
}
# make sure it has semicolon terminator after each attribute
# otherwise fake it.
unless($attribute_string =~ /;/){
if($errors[10] < $max_error_count){
print $warnings
"<attributes> field conatins no semicolons",
" terminating each attribute on line $line_num.\n";
}
$errors[10]++;
my @att = split /\s+/, $attribute_string;
$attribute_string = "";
my $att_count = $#att - (($#att+1) % 2);
for(my $i = 0;$i <= $att_count;$i+=2){
$attribute_string .= $att[$i]." ".$att[$i+1]."; ";
}
}
# parse <attributes> field
my $attributes = {};
while($attribute_string =~ /(\S+)(\s+)(\S+)(\s*)/g){
my $name = $1;
my $space1 = $2;
my $value = $3;
my $space2 = $4;
if($value =~ /^\"(\S*)\";$/){
$value = $1;
}
else{
if($value =~ /^(\S+);$/){
$value = $1;
}
else{
if($errors[10] < $max_error_count){
$value =~ /(.)$/;
print $warnings
"Bad terminator, \"$1\", after ",
"<attributes> name-value pair, $name $value, on ",
"line $line_num. Should be \";\".\n";
}
$errors[10]++;
}
if($value =~ /^\"(\S+)\"$/){
$value = $1;
}
else{
if($errors[11] < $max_error_count){
print $warnings
"<attributes> field missing quotes around",
" values on line $line_num.\n";
}
$errors[11]++;
}
}
$attributes->{$name} = $value;
unless($space1 =~ /^ $/){
if($errors[12] < $max_error_count){
print $warnings
"Bad separator, $space1, between <attributes> ",
"name-value pair, $name $value, on line ",
"$line_num.\n";
}
$errors[12]++;
}
unless($space2 =~ /^ $/){
if($errors[12] < $max_error_count){
print $warnings
"Bad separator, \"$space2\", between <attributes>",
" name-value pairs on line $line_num. Should be",
" \" \".\n$name $value\n";
}
$errors[12]++;
}
}
# check for gene_id and transcript_id attributes
my $gid = "missing";
if(defined($attributes->{gene_id})){
$gid = $attributes->{gene_id};
}
else{
if($errors[13] < $max_error_count){
print $warnings
"gene_id not set in <attributes> field on line ",
"$line_num.\n";
}
$errors[13]++;
$attributes->{gene_id} = "missing";
}
my $tid = "missing";
if(defined($attributes->{transcript_id})){
$tid = $attributes->{transcript_id};
}
else{
if($errors[14] < $max_error_count){
print $warnings
"transcript_id not set in <attributes> field on line ",
"$line_num.\n";
}
$errors[14]++;
$attributes->{transcript_id} = "missing";
}
#create objects
my $feature =
GTF::Feature::new($data[2], $data[3], $data[4], $data[5], $data[7]);
if($feature->type eq "CDS"){
push @all_cds, $feature;
}
elsif($feature->type eq "inter") {
push @all_inter, $feature;
}
elsif($feature->type eq "inter_CNS") {
push @all_cns, $feature;
}
my $tx;
my $gene;
unless(defined($TxObj{$tid}) && length($tid)){
$tx = GTF::Transcript::new($tid);
$TxObj{$tid} = $tx;
unless($feature->type eq "inter" || $feature->type eq "inter_CNS") {
push @all_txs, $tx;
}
unless(defined($GeneObj{$gid}) && length($gid)){
$gene = GTF::Gene::new($gid,$data[0],$data[1],$data[6]);
$GeneObj{$gid} = $gene;
unless($feature->type eq "inter" || $feature->type eq "inter_CNS") {
push @all_genes, $gene;
}
}
$gene = $GeneObj{$gid};
$gene->add_transcript($tx);
}
else{
$gene = $GeneObj{$gid};
}
$tx = $TxObj{$tid};
#check that strand value is consistent with the strand of the tx
unless($data[6] eq $tx->strand){
if($errors[20] < $max_error_count){
print $warnings
"Inconsistent <strand> value across gene_id = ".
$tx->gene_id.".\n";
}
$errors[20]++;
if (_check_errors_against_badlist($gtf,20)) {
push @bad_list, $tx->id;
}
if($strict){
die;
}
#last;???
}
$tx->add_feature($feature);
}
}
close(GTF);
#check each gene object
foreach my $tx (@all_txs){
my $exons = $tx->exons;
my $cds = $tx->cds;
my $utr3 = $tx->utr3;
my $utr5 = $tx->utr5;
my $sec = $tx->sec;
my $inter = $tx->inter;
my $inter_cns = $tx->inter_cns;
if($#$exons >= 0 && $#$utr3 == -1 && $#$utr5 == -1){
$tx->create_utr_objects_from_exons;
}
my $starts = $tx->start_codons;
my $stops = $tx->stop_codons;
my $strand = $tx->strand;
#make sure this tx contains at least one CDS, inter_CNS, or inter feature
if($#$cds == -1 && $#$inter_cns == -1 && $#$inter == -1){
if($errors[15] < $max_error_count){
print $warnings
"Transcript ".$tx->id." contains no CDS, inter or inter_CNS features.\n";
}
$errors[15]++;
if (_check_errors_against_badlist($gtf,15)) {
push @bad_list, $tx->id;
}
next;
}
#verify start/stop codon exists and has length 3
my $start_codon_start = -1;
my $start_codon_stop = -1;
if($#$starts == -1){
if($errors[17] < $max_error_count){
print $warnings
"Missing start_codon for transcript \"".$tx->id."\".\n";
}
$errors[17]++;
if (_check_errors_against_badlist($gtf,17)) {
push @bad_list, $tx->id;
}
}
else{
my $length = 0;
foreach my $start (@$starts){
$length += $start->stop - $start->start + 1;
if(($start_codon_start < 0) || ($start_codon_start > $start->start)){
$start_codon_start = $start->start;
}
if(($start_codon_stop < 0) || ($start_codon_stop < $start->stop)){
$start_codon_stop = $start->stop;
}
}
if($length != 3){
if($errors[16] < $max_error_count){
print $warnings
"Start Codon length is not three in transcript \"".$tx->id."\".\n";
}
$errors[16]++;
if (_check_errors_against_badlist($gtf,16)) {
push @bad_list, $tx->id;
}
}
}
my $stop_codon_start = -1;
my $stop_codon_stop = -1;
if($#$stops == -1){
if($errors[19] < $max_error_count){
print $warnings
"Missing stop_codon for transcript \"".$tx->id."\".\n";
}
$errors[19]++;
if (_check_errors_against_badlist($gtf,19)) {
push @bad_list, $tx->id;
}
}
else{
my $length = 0;
foreach my $stop (@$stops){
$length += $stop->stop - $stop->start + 1;
if(($stop_codon_start < 0) || ($stop_codon_start > $stop->stop)){
$stop_codon_start = $stop->start;
}
if(($stop_codon_stop < 0) || ($stop_codon_stop < $stop->stop)){
$stop_codon_stop = $stop->stop;
}
}
if($length != 3){
if($errors[18] < $max_error_count){
print $warnings
"Stop Codon length is not three in gene \"".$tx->id."\".\n";
}
$errors[18]++;
if (_check_errors_against_badlist($gtf,18)) {
push @bad_list, $tx->id;
}
}
}
#check that no cds or utr features overlap each other
# also counts introns that are too short
# introns shorter than 4 bp are in all likelihood
# biologically impossible (at least 2 bp for each splice signal)
my $p_exons;
my $overlap = 0;
my $zero_intron_count = 0;
my $short_intron_count = 0;
foreach $p_exons ($utr5, $starts, $cds, $stops, $utr3)
{
my $last_stop;
if ($#$p_exons > 0)
{
$last_stop = $$p_exons[0]->stop;
}
for(my $i = 1;$i <= $#$p_exons;$i++){
my $intron_length = $$p_exons[$i]->start - $last_stop - 1;
if($intron_length < 0){
$overlap++;
}
elsif($intron_length < 4){
$zero_intron_count++;
}
elsif($intron_length < 20){
$short_intron_count++;
}
$last_stop = $$p_exons[$i]->stop;
}
}
# check CDS - UTR junctions for overlaps
# here, introns with length 0 are allowed because some
# exons could be in part UTR and in part CDS
if ($tx->strand eq "+")
{
my $ilen;
if (($#$utr5 > -1) && ($#$starts > -1)){
$ilen = $$starts[0]->start - $$utr5[$#$utr5]->stop - 1;
$overlap++ if ($ilen < 0);
$zero_intron_count++ if (($ilen > 0) && ($ilen < 4));
}
if (($#$cds > -1) && ($#$stops > -1)){
# assumes stop codons are not included in CDS
$ilen = $$stops[0]->start - $$cds[$#$cds]->stop - 1;
$overlap++ if ($ilen < 0);
$zero_intron_count++ if (($ilen > 0) && ($ilen < 4));
}
if (($#$stops > -1) && ($#$utr3 > -1)){
$ilen = $$utr3[0]->start - $$stops[$#$stops]->stop - 1;
$overlap++ if ($ilen < 0);
$zero_intron_count++ if (($ilen > 0) && ($ilen < 4));
}
}
else{
my $ilen;
if (($#$utr3 > -1) && ($#$stops > -1)){
$ilen = $$stops[0]->start - $$utr3[$#$utr3]->stop - 1;
$overlap++ if ($ilen < 0);
$zero_intron_count++ if (($ilen > 0) && ($ilen < 4));
}
if (($#$stops > -1) && ($#$cds > -1)){
# assumes stop codon is not included in CDS
$ilen = $$cds[0]->start - $$stops[$#$stops]->stop - 1;
$overlap++ if ($ilen < 0);
$zero_intron_count++ if (($ilen > 0) && ($ilen < 4));
}
if (($#$starts > -1) && ($#$utr5 > -1)){
$ilen = $$utr5[0]->start - $$starts[$#$starts]->stop - 1;
$overlap++ if ($ilen < 0);
$zero_intron_count++ if (($ilen > 0) && ($ilen < 4));
}
}
if($overlap > 0){
if($errors[45] < $max_error_count){
print $warnings
"Overlapping CDS or UTR features in transcript \"".$tx->id."\".\n";
}
$errors[45]++;
if (_check_errors_against_badlist($gtf,45)) {
push @bad_list, $tx->id;
}
}
if($zero_intron_count > 0){
if($errors[48] < $max_error_count){
print $warnings "$zero_intron_count intron(s) shorter than 4 bp found on in transcript \"".$tx->id."\".\n" ;
}
$errors[48]++;
if (_check_errors_against_badlist($gtf,48)) {
push @bad_list, $tx->id;
}
}
if($short_intron_count > 0){
if($errors[49] < $max_error_count){
print $warnings "$short_intron_count short (<20bp) intron(s) found on in transcript \"".$tx->id."\".\n" ;
}
$errors[49]++;
if (_check_errors_against_badlist($gtf,49)) {
push @bad_list, $tx->id;
}
}
#check that each cds is between the start and stop and that the initial and
#terminal exons are placed correctly with respect to the start/stop codons
if($start_codon_start >= 0){
if($strand eq '-'){
for(my $i = $#$cds;$i >= 0;$i--){
if($$cds[$i]->end > $start_codon_stop){
if($errors[21] < $max_error_count){
print $warnings
"CDS before start codon in transcript_id = \"".
"".$tx->id."\".\n";
}
$errors[21]++;
if (_check_errors_against_badlist($gtf,21)) {
push @bad_list, $tx->id;
}
}
}
my $cds = $tx->initial_exon;
if($cds->end != $start_codon_stop){
if($errors[33] < $max_error_count){
print $warnings
"Start codon location not consistent with initial exon ".
"locaiton in transcript_id = \"".$tx->id."\".\n";
}
$errors[33]++;
if (_check_errors_against_badlist($gtf,33)) {
push @bad_list, $tx->id;
}
}
if($cds->length < 3){
my $cds = $tx->cds;
my $error = 0;
for(my $i = 0;$i <= $#$starts;$i++){
if($i == $#$starts){
unless($$cds[$#$cds-$i]->start <= $$starts[$#$starts-$i]->start){
$error = 1;
}
}
else{
unless($$cds[$#$cds-$i]->start == $$starts[$#$starts-$i]->start){
$error = 1;
}
}
}
if($error){
if($errors[47] < $max_error_count){
print $warnings "Start codon annotated in intron region on ".
"transcript_id = \"".$tx->id."\".\n";
}
$errors[47]++;
if (_check_errors_against_badlist($gtf,47)) {
push @bad_list, $tx->id;
}
my $pos = 0;
my $start_pos = 0;
my $cds_pos = $#$cds;
#fix the start codon by splitting it across the cds
while($pos < 3){
my $end;
if($$cds[$cds_pos]->length < 3-$pos){
$end = $$cds[$cds_pos]->start;
$pos += $$cds[$cds_pos]->length;
}
else{
$end = $$cds[$cds_pos]->end-2+$pos;
$pos = 3;
}
if($start_pos <= $#$starts){
$$starts[$start_pos]->set_start($end);
$$starts[$start_pos]->set_stop($$cds[$cds_pos]->end);
}
else{
my $sc = GTF::Feature::new("start_codon", $end, $$cds[$cds_pos]->end, ".", ".");
$tx->add_feature($sc);
}
$cds_pos--;
$start_pos++;
}
$starts = [sort {$a->start <=> $b->start} @$starts];
}
}
}
else{
for(my $i = 0;$i <= $#$cds;$i++){
if($$cds[$i]->start < $start_codon_start){
if($errors[21] < $max_error_count){
print $warnings
"CDS before start codon in transcript_id = \"".
"".$tx->id."\".\n";
}
$errors[21]++;
if (_check_errors_against_badlist($gtf,21)) {
push @bad_list, $tx->id;
}
}
}
my $cds = $tx->initial_exon;
if($cds->start != $start_codon_start){
if($errors[33] < $max_error_count){
print $warnings
"Start codon location not consistent with initial exon ".
"locaiton in transcript_id = \"".$tx->id."\".\n";
}
$errors[33]++;
if (_check_errors_against_badlist($gtf,33)) {
push @bad_list, $tx->id;
}
}
if($cds->length < 3){
my $cds = $tx->cds;
my $error = 0;
for(my $i = 0; $i <= $#$starts;$i++){
if($i == $#$starts){
unless($$cds[$i]->stop >= $$starts[$i]->stop){
$error = 1;
}
}
else{
unless($$cds[$i]->stop == $$starts[$i]->stop){
$error = 1;
}
}
}
if($error){
if($errors[47] < $max_error_count){
print $warnings "Start codon annotated in intron region on ".
"transcript_id = \"".$tx->id."\".\n";
}
$errors[47]++;
if (_check_errors_against_badlist($gtf,47)) {
push @bad_list, $tx->id;
}
my $pos = 0;
my $start_pos = 0;
my $cds_pos = 0;
#fix the start codon by splitting it across the cds
while($pos < 3){
my $end;
if($$cds[$cds_pos]->length < 3-$pos){
$end = $$cds[$cds_pos]->stop;
$pos += $$cds[$cds_pos]->length;
}
else{
$end = $$cds[$cds_pos]->start+2-$pos;
$pos = 3;
}
if($start_pos <= $#$starts){
$$starts[$start_pos]->set_start($$cds[$cds_pos]->start);
$$starts[$start_pos]->set_stop($end);
}
else{
my $sc = GTF::Feature::new("start_codon", $$cds[$cds_pos]->start, $end, ".", ".");
$tx->add_feature($sc);
}
$cds_pos++;
$start_pos++;
}
$starts = [sort {$a->start <=> $b->start} @$starts];
}
}
}
}
if($stop_codon_start >= 0){
if($strand =~ /\-/){
for(my $i = 0;$i <= $#$cds;$i++){
if($$cds[$i]->start < $stop_codon_start){
if($errors[22] < $max_error_count){
print $warnings
"CDS after stop codon in transcript_id = \"".
"".$tx->id."\".\n";
}
$errors[22]++;
if (_check_errors_against_badlist($gtf,22)) {
push @bad_list, $tx->id;
}
}
}
my $cds = $tx->terminal_exon;
if($cds->start == $stop_codon_start){
unless($cds->stop < $stop_codon_stop + 1){
$cds->set_start($stop_codon_stop + 1);
}
if($errors[46] < $max_error_count){
print $warnings
"Stop codon included in terminal CDS transcript_id = ".
"\"".$tx->id."\".\n";
}
$errors[46]++;
if (_check_errors_against_badlist($gtf,46)) {
push @bad_list, $tx->id;
}
}
if($cds->start != $stop_codon_stop + 1){
if($errors[34] < $max_error_count){
print $warnings
"Stop codon location not consistent with terminal exon ".
"locaiton in transcript_id = \"".$tx->id."\".\n";
}
$errors[34]++;
if (_check_errors_against_badlist($gtf,34)) {
push @bad_list, $tx->id;
}
}
}
else{
for(my $i = $#$cds;$i >= 0;$i--){
if($$cds[$i]->end > $stop_codon_stop){
if($errors[22] < $max_error_count){
print $warnings
"CDS after stop codon in transcript_id = \"".$tx->id."\".\n";
}
$errors[22]++;
if (_check_errors_against_badlist($gtf,22)) {
push @bad_list, $tx->id;
}
}
}
my $cds = $tx->terminal_exon;
if($cds->stop == $stop_codon_stop){
unless($cds->start > $stop_codon_start -1){
$cds->set_stop($stop_codon_start - 1);
}
if($errors[46] < $max_error_count){
print $warnings
"Stop codon included in terminal CDS transcript_id = ".
"\"".$tx->id."\".\n";
}
$errors[46]++;
if (_check_errors_against_badlist($gtf,46)) {
push @bad_list, $tx->id;
}
}
if($cds->stop != $stop_codon_start - 1){
if($errors[34] < $max_error_count){
print $warnings
"Stop codon location not consistent with terminal exon ".
"locaiton in transcript_id = \"".$tx->id."\".\n";
}
$errors[34]++;
if (_check_errors_against_badlist($gtf,34)) {
push @bad_list, $tx->id;
}
}
}
}
#check that complete transcripts have even number of codons
if(($start_codon_start >= 0) && ($stop_codon_stop >= 0)){
my $len = 0;
for(my $i = $#$cds;$i >= 0;$i--){
$len += $$cds[$i]->length;
}
unless(($len % 3) == 0){
if($errors[41] < $max_error_count){
print $warnings
"Complete transcript length not a multiple of three".
" on transcript_id = \"".$tx->id."\".\n";
}
$errors[41]++;
if (_check_errors_against_badlist($gtf,41)) {
push @bad_list, $tx->id;
}
}
}
#check for consistent phase
my $phase = -1;
if($start_codon_start >= 0){
$phase = 0;
}
elsif($start_codon_stop >= 0){
my $len = 0;
for(my $i = $#$cds;$i >= 0;$i--){
$len += $$cds[$i]->length;
}
$phase = $len % 3;
}
if($strand eq "-"){
if($phase == -1){
$phase = $$cds[$#$cds]->frame;
if($phase eq "."){
my $pos = -1;
for(my $i = $#$cds; $i >= 0;$i--){
unless($$cds[$i]->frame eq "."){
$pos = $i;
$phase = $$cds[$i]->frame;
last;
}
}
if($pos >= 0){
for(my $i = $pos + 1; $i <= $#$cds;$i++){
my $len = $$cds[$i]->length + $phase;
$phase = $len % 3;
}
}
}
}
unless($phase eq "."){
my $bad_frame = 0;
for(my $i = $#$cds;$i >= 0;$i--){
unless($phase eq $$cds[$i]->frame){
unless($$cds[$i]->frame eq "."){
$bad_frame = 1;
}
$$cds[$i]->set_frame($phase);
}
$phase = ($$cds[$i]->length - $phase) % 3;
if($phase == 1){
$phase = 2;
}
elsif($phase == 2){
$phase = 1;
}
}
if($bad_frame){
if($errors[42] < $max_error_count){
print $warnings
"Inconsistent frame accross transcript_id = ".
"\"".$tx->id."\".\n";
}
$errors[42]++;
if (_check_errors_against_badlist($gtf,42)) {
push @bad_list, $tx->id;
}
}
}
}
else{
if($phase == -1){
$phase = $$cds[0]->frame;
if($phase eq "."){
my $pos = -1;
for(my $i = 0; $i <= $#$cds;$i++){
unless($$cds[$i]->frame eq "."){
$pos = $i;
$phase = $$cds[$i]->frame;
last;
}
}
if($pos >= 0){
for(my $i = $pos - 1; $i >= 0;$i--){
my $len = $$cds[$i]->length + $phase;
$phase = $len % 3;
}
}
}
}
unless($phase eq "."){
my $bad_frame = 0;
for(my $i = 0; $i <= $#$cds;$i++){
unless($phase eq $$cds[$i]->frame){
unless($$cds[$i]->frame eq "."){
$bad_frame = 1;
}
$$cds[$i]->set_frame($phase);
}
$phase = ($$cds[$i]->length - $phase) % 3;
if($phase == 1){
$phase = 2;
}
elsif($phase == 2){
$phase = 1;
}
}
if($bad_frame){
if($errors[42] < $max_error_count){
print $warnings
"Inconsistent frame accross transcript_id = ".
"\"".$tx->id."\".\n";
}
$errors[42]++;
if (_check_errors_against_badlist($gtf,42)) {
push @bad_list, $tx->id;
}
}
}
}
# check selenocysteine (SEC) features here. must be inside a
# CDS and have frame 0
if($#$sec >= 0){
foreach my $s (@$sec){
if($s->frame ne "0" && $s->frame ne "."){
#bad SEC frame
if($errors[51] < $max_error_count){
print $warnings
"Bad frame, ".$s->frame."on SEC feature on tx ".$tx->id.". Must of 0 or \".\"\n";
}
$errors[51]++;
if (_check_errors_against_badlist($gtf,51)) {
push @bad_list, $tx->id;
}
}
my $good = 0;
foreach my $c (@$cds){
if($c->stop < $s->start){
next;
}
if($c->start <= $s->start &&
$c->stop >= $s->stop){
$good = 1;
last;
}
if($c->start > $s->stop){
last;
}
}
if($good == 0){
# SEC outside CDS
if($errors[52] < $max_error_count){
print $warnings
"SEC feature outside of CDS region on tx ".$tx->id.".\n";
}
$errors[52]++;
if (_check_errors_against_badlist($gtf,52)) {
push @bad_list, $tx->id;
}
}
}
}
}
#if the conservation sequence is available store it's info;
my $conseqname = $gtf->{Conseq};
my %total_conserv = (0 => 0,1 => 0,2 => 0);
if($conseqname){
open(Conseq, $conseqname)
or die "Could not open conservation sequence file: \"$conseqname\".\n";
my @conseq_info = stat(Conseq);
my @comp;
foreach my $tx (@all_txs){
my $cds = $tx->cds;
my $exons = $tx->exons;
my $starts = $tx->start_codons;
my $stops = $tx->stop_codons;
foreach my $start (@$starts){
push @comp,{feature => $start, "0" => 0, "1" => 0, "2" => 0};
}
foreach my $stop (@$stops){
push @comp,{feature => $stop, "0" => 0, "1" => 0, "2" => 0};
}
foreach my $exon (@$cds){
push @comp,{feature => $exon, "0" => 0, "1" => 0, "2" => 0};
}
foreach my $exon (@$exons){
push @comp,{feature => $exon, "0" => 0, "1" => 0, "2" => 0};
}
}
@comp = sort {$a->{feature}->start <=> $b->{feature}->start} @comp;
my $base = "X";
my $nuc_pos = 0;
my $file_pos = 0;
my $line = "X";
my $next_comp = 0;
my $max_comps = $#comp;
my %checking;
while(($next_comp <= $max_comps) and ($file_pos < $conseq_info[7])){
$base = getc(Conseq);
unless(defined($base)){
print $warnings "Features beyond end of conservation sequence.\n";
last;
}
$file_pos++;
if($base =~ /\d/){
$nuc_pos++;
if(defined($total_conserv{$base})){
$total_conserv{$base}++;
}
else{
$total_conserv{X}++;
if($errors[43] < $max_error_count){
print $warnings "Bad conservation sequence character, $base. ".
"Should be \'0\',\'1\', or \'2\'.\n";
}
$errors[43]++;
}
#compile composition stats
while(($next_comp <= $max_comps) and
($nuc_pos == $comp[$next_comp]->{feature}->start)){
$checking{$comp[$next_comp]} = $comp[$next_comp];
$next_comp++;
}
foreach my $id (keys %checking){
$checking{$id}->{$base}++;
if($checking{$id}->{feature}->stop == $nuc_pos){
delete $checking{$id};
}
}
}
}
while($file_pos < $conseq_info[7]){
$base = getc(Conseq);
$file_pos++;
if($base =~ /\d/){
$nuc_pos++;
if(defined($total_conserv{$base})){
$total_conserv{$base}++;
}
else{
$total_conserv{X}++;
if($errors[43] < $max_error_count){
print $warnings "Bad conservation sequence character, $base. ".
"Should be \'0\',\'1\', or \'2\'.\n";
}
$errors[43]++;
}
}
}
close(Conseq);
foreach my $info (@comp){
$info->{feature}->set_conseq($info->{0},$info->{1},$info->{2});
}
}
$gtf->{Total_Conseq} = \%total_conserv;
#if the sequence is available use it to fix any problems
my @inframe_stops;
my $seqname = $gtf->{Sequence};
my %tx_phase;
foreach my $tx (@all_txs){
$tx_phase{$tx->id} = -1;
}
my %total_seq = ('A' => 0,'C' => 0,'G' =>0,'T' => 0,'N' => 0,'X' => 0);
#EVAN need to rewrite all this sequence loading stuff because it is fugly
if($seqname){
open(SEQ, "<$seqname")
or die "Could not open sequence file: \"$seqname\".\n";
my @seq_info = stat(SEQ);
#try to fix things with the sequence info here
#check start codons, stop codons, splice site, internal stop
my %sequence;
my %seq_pos;
my @checks;
foreach my $tx (@all_txs){
$sequence{$tx->id} = "";
$seq_pos{$tx->id} = 0;
my $starts = $tx->start_codons;
my $stops = $tx->stop_codons;
my $utr3 = $tx->utr3;
my $utr5 = $tx->utr5;
my $cds = $tx->cds;
if($#$cds == -1){
next;
}
my $exons = $tx->exons;
foreach my $start (@$starts){
push @checks, {pos => $start->start,
type => "start_sequence",
tx => $tx,
object => $start,
stop => $start->stop};
}
foreach my $stop (@$stops){
push @checks, {pos => $stop->start,
type => "stop_sequence",
tx => $tx,
object => $stop,
stop => $stop->stop};
}
foreach my $exon (@$exons){
push @checks, {pos => $exon->start,
type => "exon_sequence",
tx => $tx,
object => $exon,
stop => $exon->stop};
}
foreach my $coding (@$cds){
push @checks, {pos => $coding->start,
type => "coding_sequence",
tx => $tx,
object => $coding,
stop => $coding->stop};
}
my($p_exons, $splice_type1, $splice_type2);
if ($tx->strand eq "+"){
$splice_type1 = "donor_splice";
$splice_type2 = "acceptor_splice";
}
else{
$splice_type1 = "acceptor_splice";
$splice_type2 = "donor_splice";
}
foreach $p_exons ($utr5, $starts, $cds, $stops, $utr3){
next if ($#$p_exons < 1);
my($i, $subtype);
$subtype = ($$p_exons[0]->type =~ /UTR/) ? "utr" : "cds";
for ($i = 1; $i <= $#$p_exons; $i++){
push @checks, {pos => $$p_exons[$i-1]->stop+2,
type => $subtype."_".$splice_type1,
tx => $tx,
stop => 0};
push @checks, {pos => $$p_exons[$i]->start-1,
type => $subtype."_".$splice_type2,
tx => $tx,
stop => 0};
}
}
# this assumes that the innermost start codon is included in CDS
# therefore, no checks are run on start_codon - CDS junction
# also, this allows zero-length introns at UTR5 - start_codon
# CDS - stop_codon and UTR3 - stop_codon junctions
if ($tx->strand eq "+"){
if (($#$utr5 > -1) && ($#$starts > -1)){
my $ilen = $$starts[0]->start - $$utr5[$#$utr5]->stop - 1;
if ($ilen != 0){
push @checks, {pos => $$utr5[$#$utr5]->stop+2,
type => "utr_donor_splice",
tx => $tx,
stop => 0};
push @checks, {pos => $$starts[0]->start-1,
type => "utr_acceptor_splice",
tx => $tx,
stop => 0};
}
}
if (($#$cds > -1) && ($#$stops > -1)){
my $ilen = $$stops[0]->start - $$cds[$#$cds]->stop - 1;
if ($ilen != 0){
push @checks, {pos => $$cds[$#$cds]->stop+2,
type => "cds_donor_splice",
tx => $tx,
stop => 0};
push @checks, {pos => $$stops[0]->start-1,
type => "cds_acceptor_splice",
tx => $tx,
stop => 0};
}
}
if (($#$stops > -1) && ($#$utr3 > -1)){
my $ilen = $$utr3[0]->start - $$stops[$#$stops]->stop - 1;
if ($ilen != 0){
push @checks, {pos => $$stops[$#$stops]->stop+2,
type => "utr_donor_splice",
tx => $tx,
stop => 0};
push @checks, {pos => $$utr3[0]->start-1,
type => "utr_acceptor_splice",
tx => $tx,
stop => 0};
}
}
if ($#{$stops} < 0){
push @checks, {pos => $$cds[$#{$cds}]->stop+1,
type => "stop_sequence?",
tx => $tx,
stop => $$cds[$#{$cds}]->stop+3};
}
if ($#{$starts} < 0){
if ($$cds[0]->start-3 > 0){
push @checks, {pos => $$cds[0]->start-3,
type => "start_sequence?",
tx => $tx,
stop => $$cds[0]->start-1};
}
else{
$sequence{$tx->id} = "NNN";
}
}
}
else{
if (($#$utr3 > -1) && ($#$stops > -1)){
my $ilen = $$stops[0]->start - $$utr3[$#$utr3]->stop - 1;
if ($ilen != 0){
push @checks, {pos => $$utr3[$#$utr3]->stop+2,
type => "utr_acceptor_splice",
tx => $tx,
stop => 0};
push @checks, {pos => $$stops[0]->start-1,
type => "utr_donor_splice",
tx => $tx,
stop => 0};
}
}
if (($#$stops > -1) && ($#$cds > -1)){
my $ilen = $$cds[0]->start - $$stops[$#$stops]->stop - 1;
if ($ilen != 0){
push @checks, {pos => $$stops[$#$stops]->stop+2,
type => "cds_acceptor_splice",
tx => $tx,
stop => 0};
push @checks, {pos => $$cds[0]->start-1,
type => "cds_donor_splice",
tx => $tx,
stop => 0};
}
}
if (($#$starts > -1) && ($#$utr5 > -1)){
my $ilen = $$utr5[0]->start - $$starts[$#$starts]->stop - 1;
if ($ilen != 0){
push @checks, {pos => $$starts[$#$starts]->stop+2,
type => "utr_acceptor_splice",
tx => $tx,
stop => 0};
push @checks, {pos => $$utr5[0]->start-1,
type => "utr_donor_splice",
tx => $tx,
stop => 0};
}
}
if ($#{$stops} < 0){
if ($$cds[0]->start-3 > 0){
push @checks, {pos => $$cds[0]->start-3,
type => "stop_sequence?",
tx => $tx,
stop => $$cds[0]->start-1};
}
}
if ($#{$starts} < 0){
push @checks, {pos => $$cds[$#{$cds}]->stop+1,
type => "start_sequence?",
tx => $tx,
stop => $$cds[$#{$cds}]->stop+3};
}
}
}
@checks = sort{$a->{pos} <=> $b->{pos} || $a->{stop} <=> $b->{stop}} @checks;
my $file_pos = length(<SEQ>);
my $base = "X";
my $nuc_pos = 0;
my $line = "X";
my $next = 0;
my $prev_base = "X";
my $max_checks = $#checks;
my %checking;
my $splice_type;
while((($next <= $max_checks) || ((keys %checking) >= 0))
&& ($file_pos < $seq_info[7])){
$base = getc(SEQ);
unless(defined($base)){
print $warnings "Features beyond end of sequence.\n";
last;
}
$file_pos++;
if($base =~ /\S/){
$base = uc($base);
if((defined($total_seq{$base})) && ($base ne 'X')){
$total_seq{$base}++;
}
else{
$total_seq{X}++;
if($errors[44] < $max_error_count){
print $warnings "Bad fasta sequence character, $base. Should be".
" \'A\',\'C\',\'T\',\'G\',or \'N\'.\n";
}
$errors[44]++;
}
$nuc_pos++;
while(($next <= $max_checks) && ($checks[$next]{pos} == $nuc_pos)){
if($checks[$next]{type} =~ /sequence/){
$checking{$checks[$next]{tx}->id . "\t" .
$checks[$next]{type}} = {
check => $checks[$next],
A => 0,
C => 0,
G => 0,
T => 0,
N => 0};
}
elsif(($checks[$next]{type} =~ /^(\w+)_acceptor_splice$/)
&& ($splice_type = $1)) {
if($checks[$next]{tx}->strand eq "-"){
unless(($prev_base eq "C") && ($base eq "T")){
if (($prev_base eq "G") && ($base eq "T")) {
if ($errors[50] < $max_error_count) {
print $warnings
"Possible non-canonical splice site sequence, ".
"$prev_base$base, on negative strand transcript \"".
$checks[$next]{tx}->id.
"\".\n";
}
$errors[50]++;
if ((_check_errors_against_badlist($gtf,50))
&& ($splice_type ne "utr")) {
push @bad_list, $checks[$next]{tx}->id;
}
}
else {
if($errors[26] < $max_error_count){
print $warnings
"Bad acceptor splice site sequence, $prev_base".
"$base, on negative strand transcript \"".
$checks[$next]{tx}->id.
"\". Should be CT or GT.\n";
}
$errors[26]++;
if (_check_errors_against_badlist($gtf,26)) {
push @bad_list, $checks[$next]{tx}->id;
}
}
}
}
else{
unless(($prev_base eq "A") && ($base eq "G")){
if (($prev_base eq "A") && ($base eq "C")) {
if ($errors[50] < $max_error_count) {
print $warnings
"Possible non-canonical splice site sequence, ".
"$prev_base$base, on transcript \"".
$checks[$next]{tx}->id.
"\".\n";
}
$errors[50]++;
if ((_check_errors_against_badlist($gtf,50))
&& ($splice_type ne "utr")) {
push @bad_list, $checks[$next]{tx}->id;
}
}
else {
if($errors[26] < $max_error_count){
print $warnings
"Bad acceptor splice site sequence, $prev_base".
"$base, on transcript \"".
$checks[$next]{tx}->id.
"\". Should be AG or AC.\n";
}
$errors[26]++;
if (_check_errors_against_badlist($gtf,26)) {
push @bad_list, $checks[$next]{tx}->id;
}
}
}
}
}
elsif(($checks[$next]{type} =~ /^(\w+)_donor_splice$/)
&& ($splice_type = $1)) {
if($checks[$next]{tx}->strand eq "-"){
unless(($prev_base eq "A") && ($base eq "C")){
if (($prev_base eq "A") && ($base eq "T")) {
if ($errors[50] < $max_error_count) {
print $warnings
"Possible non-canonical splice site sequence, ".
"$prev_base$base, on negative strand transcript \"".
$checks[$next]{tx}->id.
"\".\n";
}
$errors[50]++;
if ((_check_errors_against_badlist($gtf,50))
&& ($splice_type ne "utr")) {
push @bad_list, $checks[$next]{tx}->id;
}
}
elsif (($prev_base eq "G") && ($base eq "C")) {
if ($errors[50] < $max_error_count) {
print $warnings
"Possible non-canonical splice site sequence, ".
"$prev_base$base, on negative strand transcript \"".
$checks[$next]{tx}->id.
"\".\n";
}
$errors[50]++;
if ((_check_errors_against_badlist($gtf,50))
&& ($splice_type ne "utr")) {
push @bad_list, $checks[$next]{tx}->id;
}
}
else {
if ($errors[27] < $max_error_count){
print $warnings
"Bad donor splice site sequence, $prev_base".
"$base, on negative strand transcript \"".
$checks[$next]{tx}->id.
"\". Should be AC, GC, or AT.\n";
}
$errors[27]++;
if (_check_errors_against_badlist($gtf,27)) {
push @bad_list, $checks[$next]{tx}->id;
}
}
}
}
else{
unless(($prev_base eq "G") && ($base eq "T")){
if (($prev_base eq "G") && ($base eq "C")) {
if ($errors[50] < $max_error_count) {
print $warnings
"Possible non-canonical splice site sequence, ".
"$prev_base$base, on transcript \"".
$checks[$next]{tx}->id.
"\".\n";
}
$errors[50]++;
if ((_check_errors_against_badlist($gtf,50))
&& ($splice_type ne "utr")) {
push @bad_list, $checks[$next]{tx}->id;
}
}
elsif (($prev_base eq "A") && ($base eq "T")) {
if ($errors[50] < $max_error_count) {
print $warnings
"Possible non-canonical splice site sequence, ".
"$prev_base$base, on transcript \"".
$checks[$next]{tx}->id.
"\".\n";
}
$errors[50]++;
if ((_check_errors_against_badlist($gtf,50))
&& ($splice_type ne "utr")) {
push @bad_list, $checks[$next]{tx}->id;
}
}
else {
if ($errors[27] < $max_error_count){
print $warnings
"Bad donor splice site sequence, $prev_base".
"$base, on transcript \"".
$checks[$next]{tx}->id.
"\". Should be GT, GC, or AT.\n";
}
$errors[27]++;
if (_check_errors_against_badlist($gtf,27)) {
push @bad_list, $checks[$next]{tx}->id;
}
}
}
}
}
$next++;
}
foreach my $check (keys %checking){
$check =~ /^(.+)\t.+$/;
my $tx_id = $1;
unless(($checking{$check}{check}{type} =~ /exon/) ||
($seq_pos{$tx_id} >= $nuc_pos)){
$sequence{$tx_id}.= uc($base);
$seq_pos{$tx_id} = $nuc_pos;
}
$checking{$check}{uc($base)}++;
if($checking{$check}{check}{stop} eq $nuc_pos){
my $object = $checking{$check}{check}{object};
if(defined($object)){
if($object->strand eq "-"){
$object->set_bases($checking{$check}{T},
$checking{$check}{G},
$checking{$check}{C},
$checking{$check}{A},
$checking{$check}{N});
}
else{
$object->set_bases($checking{$check}{A},
$checking{$check}{C},
$checking{$check}{G},
$checking{$check}{T},
$checking{$check}{N});
}
}
delete $checking{$check};
}
}
$prev_base = $base;
}
}
foreach my $tx (@all_txs){
my $cds = $tx->cds;
unless($#$cds >= 0){
next;
}
my $sec = $tx->sec;
my $starts = $tx->start_codons;
my $stops = $tx->stop_codons;
my $strand = $tx->strand;
my $tx_id = $tx->id;
my $phase = $$cds[0]->frame;
my $old_phase = $phase;
if($strand eq "-"){
$phase = $$cds[$#$cds]->frame;
}
my $seq = $sequence{$tx->id};
if($seq =~ /^$/){
print STDERR "No sequence for $tx_id\n";
next;
}
if($tx->strand eq "-"){
$seq = _rev_comp($seq);
}
my $start = 0;
my $stop = 0;
my $found_start = -1;
my $found_stop = -1;
my @seq = split //, $seq;
#check some SEC stuff here;
foreach my $s (@$sec){
my $pos = 0;
if($tx->strand eq '-'){
foreach my $c (reverse @$cds){
if($c->start > $s->stop){
$pos += $c->stop-$c->start+1;
}
else{
$pos += $c->stop-$s->stop;
last;
}
}
if($#$stops == -1){
$pos += 3;
}
}
else{
foreach my $c (@$cds){
if($c->stop < $s->start){
$pos += $c->stop-$c->start+1;
}
else{
$pos += $s->start-$c->start;
last;
}
}
if($#$starts == -1){
$pos += 3;
}
}
my $codon = substr($seq,$pos,3);
unless($codon eq "TGA"){
#bad SEC codon
$error_msgs[53] = "SEC feature had bad sequence. Must be \"TGA\"";
if($errors[53] < $max_error_count){
print $warnings "SEC feature had bad sequence, $codon, on tx, ".$tx->id.
". Must be \"TGA\"\n";
}
$errors[53]++;
if (_check_errors_against_badlist($gtf,53)) {
push @bad_list, $tx->id;
}
}
substr($seq,$pos,3) = "NNN";
}
if($#$starts >= 0){
my $sc = substr($seq,0,3);
unless($sc eq "ATG"){
if($errors[24] < $max_error_count){
print $warnings
"Bad start codon sequence, $sc, on tx ".
$tx->id.". Should be ATG.\n";
}
$errors[24]++;
if (_check_errors_against_badlist($gtf,24)) {
push @bad_list, $tx->id;
}
}
}
else{
my $sc1 = substr($seq,0,3);
my $sc2 = substr($seq,3,3);
if($sc2 eq "ATG"){
$found_start = 3;
$start = 3;
}
elsif($sc1 eq "ATG"){
$found_start = 0;
}
else{
$start = 3;
}
}
if($#$stops >= 0){
my $sc = substr($seq,$#seq-2,3);
unless(($sc eq "TAA") ||
($sc eq "TAG") ||
($sc eq "TGA")){
if($errors[25] < $max_error_count){
print $warnings
"Bad stop codon sequence, $sc, on tx ".
$tx->id.". Should be TAA, TAG, or TGA.\n";
}
$errors[25]++;
if (_check_errors_against_badlist($gtf,25)) {
push @bad_list, $tx->id;
}
}
}
else{
my $sc1 = substr($seq,$#seq - 2,3);
my $sc2 = substr($seq,$#seq - 5,3);
if(($sc2 eq "TAA") ||
($sc2 eq "TAG") ||
($sc2 eq "TGA")){
$found_stop = $#seq - 5;
$stop = 3;
}
elsif(($sc1 eq "TAA") ||
($sc1 eq "TAG") ||
($sc1 eq "TGA")){
$found_stop = $#seq - 2;
}
else{
$stop = 3;
}
}
my @sc = (0,0,0);
my $scphase = $phase;
$scphase = 0 if($scphase eq '.');
for(my $i = $start;$i <= $#seq - $stop - 5;$i++){
my $codon = substr($seq,$i,3);
if(($codon eq "TAA") ||
($codon eq "TAG") ||
($codon eq "TGA")){
$sc[($i+$scphase) % 3] = 1;
}
}
if(($phase ne ".") &&
($sc[0])){
if($errors[29] < $max_error_count){
print $warnings "In frame stop codon(s) found on transcript ".
$tx->id."\n";
}
$errors[29]++;
if (_check_errors_against_badlist($gtf,29)) {
push @bad_list, $tx->id;
}
push @inframe_stops, $tx->id;
}
else{
if(($sc[0] == 1) &&
($sc[1] == 1) &&
($sc[2] == 1)){
push @inframe_stops, $tx->id;
if($errors[37] < $max_error_count){
print $warnings "Stop Codons in all frames on transcript ".
$tx->id."\n";
}
$errors[37]++;
if (_check_errors_against_badlist($gtf,37)) {
push @bad_list, $tx->id;
}
}
}
# add start or stop if possible
if($found_stop != -1){
if(($phase eq ".") ||
($phase == (($#seq + 1) % 3))){
if($sc[($#seq +1) % 3] == 0){
$phase = ($#seq + 1) % 3;
if($errors[32] < $max_error_count){
print $warnings
"Unannotated stop codon found on transcript ".
$tx->id."\n";
}
$errors[32]++;
if (_check_errors_against_badlist($gtf,32)) {
push @bad_list, $tx->id;
}
if($fix_gtf){
if($strand eq "+"){
my $terminal_exon = $$cds[$#$cds];
my $sc_start = $terminal_exon->stop + 1 - $stop;
my $stop_codon =
GTF::Feature::new("stop_codon",$sc_start,
$sc_start + 2,0,0);
$terminal_exon->set_stop($sc_start - 1);
$tx->add_feature($stop_codon);
}
else{
my $terminal_exon = $$cds[0];
my $sc_start = $terminal_exon->start - 3 + $stop;
my $stop_codon =
GTF::Feature::new("stop_codon",$sc_start,
$sc_start + 2,0,0);
$terminal_exon->set_start($sc_start + 3);
$tx->add_feature($stop_codon);
}
}
}
}
}
if($found_start != -1){
if(($phase eq ".") ||
($phase == 0)){
if($sc[0] == 0){
if($errors[31] < $max_error_count){
print $warnings
"Unannotated start codon found on transcript ".
$tx->id."\n";
}
$errors[31]++;
if (_check_errors_against_badlist($gtf,31)) {
push @bad_list, $tx->id;
}
if($fix_gtf){
if($strand eq "+"){
my $initial_exon = $$cds[0];
my $sc_start = $initial_exon->start - 3 + $start;
my $start_codon =
GTF::Feature::new("start_codon",$sc_start,
$sc_start + 2,0,0);
$initial_exon->set_start($sc_start);
$tx->add_feature($start_codon);
}
else{
my $initial_exon = $$cds[$#$cds];
my $sc_start = $initial_exon->stop + 1 - $start;
my $start_codon =
GTF::Feature::new("start_codon",$sc_start,
$sc_start + 2,0,0);
$initial_exon->set_stop($sc_start + 2);
$tx->add_feature($start_codon);
}
}
}
}
}
# set phase if unset here;
if(($phase ne ".") && ($old_phase eq ".") && ($fix_gtf)){
if($strand eq "+"){
for(my $i = 0;$i <= $#$cds;$i++){
$$cds[$i]->set_frame($phase);
$phase = ($$cds[$i]->length - $phase) % 3;
if($phase == 1){
$phase = 2;
}
elsif($phase == 2){
$phase = 1;
}
}
}
else{
for(my $i = $#$cds;$i >= 0;$i--){
$$cds[$i]->set_frame($phase);
$phase = ($$cds[$i]->length - $phase) % 3;
if($phase == 1){
$phase = 2;
}
elsif($phase == 2){
$phase = 1;
}
}
}
}
if($gtf->{Tx}){
my $tx_fh = $gtf->{Tx};
print $tx_fh ">".$tx->id ."\t".$tx->start."\t".$tx->stop."\n".
substr($seq,$start,$#seq - $stop - $start + 1)."\n";
}
}
}
$gtf->{Total_Seq} = \%total_seq;
for(my $i = 0;$i <= $#error_msgs;$i++){
if(($errors[$i] > 0) && !($$no_warn[$i])){
print $warnings
"\nWarnings encountered:\n",
"Count\tDescription\n";
for(my $j = $i;$j <= $#error_msgs;$j++){
if(($errors[$j] > 0) && !($$no_warn[$j])){
print $warnings
$errors[$j],"\t$error_msgs[$j]\n";
}
}
last;
}
}
#sort genes, transcripts, cds and utr by start position
@all_genes = sort {$a->start <=> $b->start} @all_genes;
@all_txs = sort {$a->start <=> $b->start} @all_txs;
@all_cds = sort {$a->start <=> $b->start} @all_cds;
@all_inter = sort {$a->start <=> $b->start} @all_inter;
@all_cns = sort {$a->start <=> $b->start} @all_cns;
print $warnings "\nStatistics:\n";
print $warnings
"\t", $#all_genes + 1, " genes with ", $#all_txs + 1,
" transcripts containing ", $#all_cds + 1 , " cds.\n";
if($gtf->{Inframe_Stops}){
print $warnings "Inframe Stop Genes:\n";
foreach my $stop (@inframe_stops){
print $warnings "$stop\n";
}
}
if ($gtf->{Bad_List}) {
print $warnings "Bad Genes:\n";
my %bad_out;
foreach my $bad (@bad_list) {
$bad_out{$bad}++;
}
my @out = keys %bad_out;
while (@out) {
print $warnings pop(@out),"\n";
}
}
close(WARN);
close(GTF);
$gtf->{Genes} = \@all_genes;
$gtf->{Transcripts} = \@all_txs;
$gtf->{CDS} = \@all_cds;
$gtf->{Inter} = \@all_inter;
$gtf->{Inter_CNS} = \@all_cns;
$gtf->{Comments} = $all_line_comments;
$gtf->{Parse_Errors} = \@errors;
}
# GTF::_get_error_messages()
# This function returns an array of the error messages associated with
# each error number.
sub _get_error_messages{
my @error_msgs;
$error_msgs[0] = "Not enough fields on line.";
$error_msgs[1] = "Illegal type of whitespace between fields. Sould be tab.";
$error_msgs[2] = "Illegal value for <feature> field. Should be ".
"\'CDS\', \'exon\', \'start_codon\', or \'stop_codon\'.";
$error_msgs[3] = "Illegal value for <start> field. Should be numerical.";
$error_msgs[4] = "Illegal value for <stop> field. Should be numerical.";
$error_msgs[5] = "Illegal value for <score> field. Should be numerical or \'.\'.";
$error_msgs[6] = "Illegal value for <strand> field. ".
"Should be \'+\', \'-\', or \'.\'.";
$error_msgs[7] = "Illegal value for <frame> field. ".
"Should be \'0\', \'1\', \'2\', or \'.\'.";
$error_msgs[8] = "Illegal \'\#\' character in attribute name.";
$error_msgs[9] = "Illegal tab character in <attributes> field.";
$error_msgs[10] = "Missing \';\' terminator after attribute in <attributes> field.";
$error_msgs[11] = "Missing \'\"\'s around attribute values in <attributes> field.";
$error_msgs[12] = "Incorrect separator between attribute name-value pairs in ".
"<attributes> field. Should be \' \'.";
$error_msgs[13] = "Missing \'gene_id\' attribute in <attributes> field.";
$error_msgs[14] = "Missing \'transcript_id\' attribute in <attributes> field.";
$error_msgs[15] = "Transcript contains no CDS, inter or inter_CNS features.";
$error_msgs[16] = "Start Codon length is not three.";
$error_msgs[17] = "Transcript has no start codon.";
$error_msgs[18] = "Stop Codon length is not three.";
$error_msgs[19] = "Transcript has no stop codon.";
$error_msgs[20] = "Inconsistent <strand> value across gene_id.";
$error_msgs[21] = "CDS before start codon.";
$error_msgs[22] = "CDS after stop codon.";
$error_msgs[23] = "Multiple genes using the same transcript_id.";
$error_msgs[24] = "Bad start codon sequence. Should be ATG.";
$error_msgs[25] = "Bad stop codon sequence. Should be TAA, TAG, or TGA.";
$error_msgs[26] = "Bad acceptor splice site sequence.";
$error_msgs[27] = "Bad donor splice site sequence.";
$error_msgs[28] = "In frame stop codon found.";
$error_msgs[29] = "In frame stop codon in transcript.";
$error_msgs[30] = "Illegal \'=\' character in <attributes> field.";
$error_msgs[31] = "Unannotated start codon found.";
$error_msgs[32] = "Unannotated stop codon found.";
$error_msgs[33] = "Start codon location inconsistent with initial exon location.";
$error_msgs[34] = "Stop codon location inconsistent with terminal exon location.";
$error_msgs[35] = "Start codon length is wrong.";
$error_msgs[36] = "Start codon length is wrong.";
$error_msgs[37] = "Transcript does not translate in any frame.";
$error_msgs[38] = "<start> field is greater than <stop> field.";
$error_msgs[39] = "Wrong phase value.";
$error_msgs[40] = "Missing phase value.";
$error_msgs[41] = "Complete transcript length not multiple of 3.";
$error_msgs[42] = "Inconsistant phase value accross transcript.";
$error_msgs[43] = "Bad conservation sequence character. Should be \'0\',\'1\', ".
"or \'2\'.";
$error_msgs[44] = "Bad fasta sequence character. Should be \'A\',\'C\',\'T\',\'G\',".
"or \'N\'.";
$error_msgs[45] = "Overlapping CDS features in transcript.";
$error_msgs[46] = "Stop codon included in terminal CDS.";
$error_msgs[47] = "Start codon annotated in intron region.";
$error_msgs[48] = "Transcript contains zero length intron(s).";
$error_msgs[49] = "Transcript contains short (<20bp) intron(s).";
$error_msgs[50] = "Possible non-canonical splice site sequence.";
$error_msgs[51] = "Bad frame on SEC feature. Must be 0 or \".\"";
$error_msgs[52] = "SEC feature outside of CDS feature.";
$error_msgs[53] = "SEC feature had bad sequence. Must be \"TGA\"";
return @error_msgs;
}
#this checks to see if the error is one of the errors that
#we want to return the transcript name and remove
sub _check_errors_against_badlist{
my ($gtf,$error_num) = @_;
my $bad = $gtf->{Bad_List};
if($$bad[$error_num]) {
return 1;
}
else {
return 0;
}
}
#this loads the gtf file without checking for errors
sub _load_no_check{
my ($gtf,$warnings) = @_;
my (@all_genes,@all_txs,@all_cds,@all_inter,@all_cns);
my %TxObj;
my %GeneObj;
my $strict = $gtf->{Strict};
#open file
open(GTF, "<$gtf->{Filename}")
or die "Could not open $gtf->{Filename}.\n";
while(my $in_line = <GTF>){
chomp $in_line;
if($in_line =~ /^(.*)\#/){
$in_line = $1;
}
if($in_line =~ /^\s+(.*)$/){
$in_line = $1;
}
if($in_line !~ /\S/){
next;
}
my @data = split /\t/,$in_line;
if($#data != 8){
die "Wrong number of fields on line:\n$in_line\n";
}
if(!$strict && $data[5] eq "."){
$data[5] = 0;
}
my %attributes;
while($data[8] =~ /^\s*(\S+) \"(\S*)\";(.*)$/){
my $key = $1;
my $val = $2;
$data[8] = $3;
$attributes{$key} = $val;
}
my $gid;
if(defined($attributes{gene_id})){
$gid = $attributes{gene_id};
}
else{
die "Missing gene_id\n";
}
my $tid;
if(defined($attributes{transcript_id})){
$tid = $attributes{transcript_id};
}
else{
die "Missing transcript_id\n";
}
#create objects
my $feature =
GTF::Feature::new($data[2], $data[3], $data[4], $data[5], $data[7]);
if($feature->type eq "CDS"){
push @all_cds, $feature;
}
elsif($feature->type eq "inter") {
push @all_inter, $feature;
}
elsif($feature->type eq "inter_CNS") {
push @all_cns, $feature;
}
my $tx;
my $gene;
unless(defined($TxObj{$tid}) && length($tid)){
$tx = GTF::Transcript::new($tid);
$TxObj{$tid} = $tx;
unless($feature->type eq "inter" || $feature->type eq "inter_CNS") {
push @all_txs, $tx;
}
unless(defined($GeneObj{$gid}) && length($gid)){
$gene = GTF::Gene::new($gid,$data[0],$data[1],$data[6]);
$GeneObj{$gid} = $gene;
unless($feature->type eq "inter" || $feature->type eq "inter_CNS") {
push @all_genes, $gene;
}
}
$gene = $GeneObj{$gid};
$gene->add_transcript($tx);
}
else{
$gene = $GeneObj{$gid};
}
$tx = $TxObj{$tid};
if($tx->strand ne $data[6]){
die "Features on transcript ".$tx->id." have different strands\n";
}
$tx->add_feature($feature);
}
close(GTF);
#sort genes, exons, and cds by start position
@all_genes = sort {$a->start <=> $b->start} @all_genes;
@all_txs = sort {$a->start <=> $b->start} @all_txs;
@all_cds = sort {$a->start <=> $b->start} @all_cds;
@all_inter = sort {$a->start <=> $b->start} @all_inter;
@all_cns = sort {$a->start <=> $b->start} @all_cns;
$gtf->{Genes} = \@all_genes;
$gtf->{Transcripts} = \@all_txs;
$gtf->{CDS} = \@all_cds;
$gtf->{Inter_CNS} = \@all_cns;
$gtf->{Inter} = \@all_inter;
}
sub _rev_comp{
my ($seq) = @_;
$seq = uc $seq;
my @bases = split //, $seq;
my $out = "";
foreach my $base (@bases){
if($base eq 'A'){
$base = 'T';
}
elsif($base eq 'C'){
$base = 'G';
}
elsif($base eq 'G'){
$base = 'C';
}
elsif($base eq 'T'){
$base = 'A';
}
$out = $base.$out;
}
return $out;
}
###############################################################################
# GTF::Gene
###############################################################################
# The GTF::Gene object stores all gtf data for one gene. A gene contains one
# or more transcripts
package GTF::Gene;
# GTF::Gene::new(gene_id,seqname,source,strand)
# This is the contrcutor for a GTF::Gene object. It has 4 required
# parameters. The are:
# gene_id - The gene_id for this gene
# seqname - The <seqname> field of the line. Should be a string
# containing no whitespace.
# source - The <source> field of the line. Should be a string
# containing no whitespace.
# strand - The <strand> field of the line. Should be either
# '+', '-', or '.'.
sub new {
my $gene = bless {};
($gene->{Id},$gene->{Seqname}, $gene->{Source}, $gene->{Strand}) = @_;
$gene->{Transcripts} = [];
$gene->{CDS} = [];
$gene->{Modified} = 0;
$gene->{Start} = -1;
$gene->{Stop} = -1;
return $gene;
}
sub add_transcript{
my ($gene,$tx) = @_;
my $txs = $gene->{Transcripts};
push @$txs, $tx;
$tx->_set_gene($gene);
$tx->_update;
$gene->{Modified} = 1;
}
sub id {shift->{Id}}
sub set_id{
my ($gene,$id) = @_;
$gene->{Id} = $id;
}
# GTF::Feature::seqname()
# This function returns the sequnce name, <seqname> field, that this
# feature came from as a string.
sub seqname {shift->{Seqname}}
# GTF::Feature::set_source()
# This function sets the sequence name, <seqname> field, of this line
sub set_seqname{
my ($feature,$seqname) = @_;
$feature->{Seqname} = $seqname;
}
# GTF::Feature::source()
# This function returns the source, <source> field, of this line as a
# string.
sub source {shift->{Source}}
# GTF::Feature::set_source()
# This function sets the source, <source> field, of this line
sub set_source{
my ($feature,$source) = @_;
$feature->{Source} = $source;
}
# GTF::Feature::strand()
# This function returns the strand, <strand> field, of this line as a
# string. Should be '+', '-', or '.'.
sub strand {shift->{Strand}}
# GTF::Gene::id()
# This function returns this transcript's unique transcript_id
sub gene_id{
my ($gene) = @_;
return $gene->{Id};
}
# GTF::Gene::transcript_ids()
# This function returns an array of all trnacsripts in this gene
sub transcripts{
my ($gene) = @_;
$gene->_update;
return $gene->{Transcripts};# [sort {$a->start <=> $b->start || $a->stop <=> $b->stop} @{$gene->{Transcripts}}];
}
# GTF::Gene::remove_transcript()
sub remove_transcript{
my($gene, $tx_id) = @_;
my $txs = $gene->transcripts;
my @new_txs = ();
@new_txs = grep($_->id ne $tx_id, @$txs);
$gene->{Transcripts} = \@new_txs if ($#new_txs < $#$txs);
}
sub cds{
my ($gene) = @_;
my $txs = $gene->transcripts;
my @cds;
foreach my $tx (@$txs){
push @cds, @{$tx->cds};
}
# employ a slow sort to avoid perl sort problems (see GTF::Gene::_update) rpz
my $tmp;
for(my $a = 0; $a <= $#cds; $a++){
for(my $b = $a+1; $b <= $#cds; $b++){
if(($cds[$a]->start > $cds[$b]->start) ||
(($cds[$a]->start == $cds[$b]->start) &&
($cds[$a]->stop > $cds[$b]->stop))){
$tmp = $cds[$a];
$cds[$a] = $cds[$b];
$cds[$b] = $tmp;
}
}
}
return \@cds;
}
# GTF::Gene::copy()
# This function returns a new object which is a copy of this object. Also
# creates copies of all transcripts of this gene
sub copy{
my ($gene) = @_;
my $copy = GTF::Gene::new($gene->id,$gene->seqname,$gene->source,$gene->strand);
my $txs = $gene->transcripts;
my @new_txs;
my $next = 0;
foreach my $tx (@$txs){
$next++;
my $ntx = $tx->copy;
$copy->add_transcript($ntx);
}
return($copy);
}
# GTF::Gene::offset(ammount)
# This function will offset all locations in this gene by the given ammount
sub offset{
my ($gene,$offset) = @_;
my $txs = $gene->transcripts;
foreach my $tx (@$txs){
$tx->offset($offset);
}
}
# GTF::Gene::output_gtf([file_handle])
# This function outputs the information for this gene in standard
# gtf2 format. If takes and optional file_handle as the only argument
# to which it outputs the data. If no argument is given it outputs
# the data to stdout.
sub output_gtf{
my ($gene,$out_handle) = @_;
unless(defined($out_handle)){
$out_handle = \*STDOUT;
}
my $txs = $gene->transcripts;
foreach my $tx (@$txs){
$tx->output_gtf($out_handle);
print $out_handle "\n";
}
}
# GTF::Gene::output_gff([file_handle])
# This function outputs the information for this gene in standard
# GFF format. If takes and optional file_handle as the only argument
# to which it outputs the data. If no argument is given it outputs
# the data to stdout.
sub output_gff{
my ($gene,$out_handle) = @_;
unless(defined($out_handle)){
$out_handle = \*STDOUT;
}
my $txs = $gene->transcripts;
foreach my $tx (@$txs){
$tx->output_gff($out_handle);
}
}
# GTF::Gene::reverse_complement(seq_length)
# This function takes the length of the sequence this gene came from and
# reverse complements everything in the gene.
sub reverse_complement{
my ($gene, $seq_length) = @_;
my $txs = $gene->transcripts;
foreach my $tx (@$txs){
$tx->reverse_complement($seq_length);
}
$gene->{Modified} = 1;
}
# GTF::Gene::start()
# This function return the lowest coord of any feature in any tx of this gene
sub start{
my ($gene) = @_;
$gene->_update;
return $gene->{Start};
}
# GTF::Gene::stop()
# This function returns the highest coord of any feature in any tx of this gene
sub stop{
my ($gene) = @_;
$gene->_update;
return $gene->{Stop};
}
sub equals{
my ($gene,$compare) = @_;
$gene->_update;
unless($gene && $compare){
return 0;
}
unless(($gene->start == $compare->start) &&
($gene->stop == $compare->stop) &&
($gene->strand eq $compare->strand)){
return 0;
}
my $gene_txs = $gene->transcripts;
my $compare_txs = $compare->transcripts;
unless($#$gene_txs == $#$compare_txs){
return 0;
}
for(my $i = 0;$i <= $#$gene_txs;$i++){
unless($$gene_txs[$i]->equals($$compare_txs[$i])){
return 0;
}
}
return 1;
}
sub length{
my ($gene) = @_;
my $txs = $gene->transcripts;
if($#$txs == -1){
return 0;
}
my $start = $$txs[0]->start;
my $stop = $$txs[0]->stop;
my $total = 0;
foreach my $tx (@$txs){
if($tx->start < $start){
$start = $tx->start;
}
if($tx->stop > $stop){
$stop = $tx->stop;
}
}
return($stop - $start + 1);
}
sub gc_percentage{
my ($gene) = @_;
my $txs = $gene->transcripts;
if($#$txs == -1){
return 0;
}
my $total = 0;
foreach my $tx (@$txs){
$total += $tx->gc_percentage;
}
return($total/($#$txs+1));
}
sub match_percentage{
my ($gene) = @_;
my $txs = $gene->transcripts;
if($#$txs == -1){
return 0;
}
my $total = 0;
foreach my $tx (@$txs){
$total += $tx->match_percentage;
}
return($total/($#$txs+1));
}
sub mismatch_percentage{
my ($gene) = @_;
my $txs = $gene->transcripts;
if($#$txs == -1){
return 0;
}
my $total = 0;
foreach my $tx (@$txs){
$total += $tx->mismatch_percentage;
}
return($total/($#$txs+1));
}
sub unaligned_percentage{
my ($gene) = @_;
my $txs = $gene->transcripts;
if($#$txs == -1){
return 0;
}
my $total = 0;
foreach my $tx (@$txs){
$total += $tx->unaligned_percentage;
}
return($total/($#$txs+1));
}
sub tag{
my ($gene) = @_;
return $gene->{Tag};
}
sub set_tag{
my ($gene,$tag) = @_;
$gene->{Tag} = $tag;
}
sub _update{
my ($gene) = @_;
unless($gene->{Modified}){
return;
}
$gene->{Modified} = 0;
my $txs = $gene->{Transcripts};
# this slow sort is being done because the commented out sort below
# causes crashes sometimes and I have no idea why since the uncommented
# sort has no problems
# -- the reason for the problems is that perl's sort routine does not
# afford you the capability of modifying the $a or $b variable in
# the usersub callback, since they are aliases. rpz
$gene->{Start} = -1;
$gene->{Stop} = -1;
my $tmp;
for(my $a = 0; $a <= $#$txs; $a++){
for(my $b = $a+1; $b <= $#$txs; $b++){
if(($$txs[$a]->start > $$txs[$b]->start) ||
(($$txs[$a]->start == $$txs[$b]->start) &&
($$txs[$a]->stop > $$txs[$b]->stop))){
$tmp = $$txs[$a];
$$txs[$a] = $$txs[$b];
$$txs[$b] = $tmp;
}
}
if($$txs[$a]->stop > $gene->{Stop}){
# highest stop may not be stop of last tx so we check them all
$gene->{Stop} = $$txs[$a]->stop;
}
}
if($#$txs != -1){
# lowest start is start of first tx
$gene->{Start} = $$txs[0]->start;
}
}
###############################################################################
# GTF::Transcript
###############################################################################
# The GTF::Transcript object stores all data for a single transcript of a gtf
# file.
package GTF::Transcript;
# GTF::Transcript::new(transcript_id)
# This is the constructor for GTF::Feature objects. It takes 1 arguments.
# transciprt_id - the id for this transcript
sub new{
my $tx = bless {};
($tx->{Id}) = @_;
$tx->{Exons} = [];
$tx->{CDS} = [];
$tx->{UTR3} = [];
$tx->{UTR5} = [];
$tx->{Introns} = [];
$tx->{Starts} = [];
$tx->{Stops} = [];
$tx->{SEC} = [];
$tx->{Intron_CNS} = [];
$tx->{Inter_CNS} = [];
$tx->{Inter} = [];
$tx->{Coding_Start} = -1;
$tx->{Coding_Stop} = -1;
$tx->{Coding_Length} = -1;
$tx->{Gene} = -1;
$tx->{Start} = -1;
$tx->{Stop} = -1;
$tx->{Modified} = 1;
return $tx;
}
sub _set_gene{
my ($tx,$gene) = @_;
$tx->{Gene} = $gene;
}
sub add_feature{
my ($tx,$feature) = @_;
my $type = $feature->type;
if($type eq 'CDS'){
push @{$tx->{CDS}}, $feature;
$tx->{CDS} = [sort {$a->start <=> $b->start} @{$tx->{CDS}}];
}
elsif($type eq '3UTR'){
push @{$tx->{UTR3}}, $feature;
$tx->{UTR3} = [sort {$a->start <=> $b->start} @{$tx->{UTR3}}];
}
elsif($type eq '5UTR'){
push @{$tx->{UTR5}}, $feature;
$tx->{UTR5} = [sort {$a->start <=> $b->start} @{$tx->{UTR5}}];
}
elsif($type eq 'start_codon'){
push @{$tx->{Starts}}, $feature;
$tx->{Starts} = [sort {$a->start <=> $b->start} @{$tx->{Starts}}];
}
elsif($type eq 'stop_codon'){
push @{$tx->{Stops}}, $feature;
$tx->{Stops} = [sort {$a->start <=> $b->start} @{$tx->{Stops}}];
}
elsif($type eq 'exon'){
push @{$tx->{Exons}}, $feature;
$tx->{Exons} = [sort {$a->start <=> $b->start} @{$tx->{Exons}}];
}
elsif($type eq 'SEC'){
push @{$tx->{SEC}}, $feature;
$tx->{SEC} = [sort {$a->start <=> $b->start} @{$tx->{SEC}}];
}
elsif($type eq 'intron_CNS'){
push @{$tx->{Intron_CNS}}, $feature;
$tx->{Intron_CNS} = [sort {$a->start <=> $b->start} @{$tx->{Intron_CNS}}];
}
elsif($type eq 'inter_CNS'){
push @{$tx->{Inter_CNS}}, $feature;
$tx->{Inter_CNS} = [sort {$a->start <=> $b->start} @{$tx->{Inter_CNS}}];
}
elsif($type eq 'inter'){
push @{$tx->{Inter}}, $feature;
$tx->{Inter} = [sort {$a->start <=> $b->start} @{$tx->{Inter}}];
}
$feature->set_transcript($tx);
$tx->{Modified} = 1;
if($tx->{Gene} != -1){ $tx->{Gene}->{Modified} = 1;}
}
sub transfer_missing_utr{
my($dest, $src, $mode) = @_;
my $dest_utr3 = $dest->utr3;
my $dest_utr5 = $dest->utr5;
my $src_utr3 = $src->utr3;
my $src_utr5 = $src->utr5;
$mode = "Full" if ($#_ == 1); # default
if (($mode eq "3UTR") || ($mode eq "Full")) {
if (($#$dest_utr3 == -1) && ($#$src_utr3 > -1)){
print "Transfering 3UTR from ".$src->id." to ".$dest->id."\n";
foreach my $src_exon (@$src_utr3)
{
my $utr_exon = $src_exon->copy;
$utr_exon->set_transcript($dest);
$dest->add_feature($utr_exon);
}
}
}
if (($mode eq "5UTR") || ($mode eq "Full")) {
if (($#$dest_utr5 == -1) && ($#$src_utr5 > -1)){
print "Transfering 5UTR from ".$src->id." to ".$dest->id."\n";
foreach my $src_exon (@$src_utr5)
{
my $utr_exon = $src_exon->copy;
$utr_exon->set_transcript($dest);
$dest->add_feature($utr_exon);
}
}
}
}
# copies the exon obejcts info into utr5 and utr3 objects
# WARNING: don't call this if you already have utr5 and utr3 info
sub create_utr_objects_from_exons{
my ($tx) = @_;
my $exons = $tx->exons;
my $start = $tx->coding_start;
my $stop = $tx->coding_stop;
if($start != -1){
my $type = "5UTR";
if($tx->strand eq "-"){
$type = "3UTR";
}
for(my $i = 0; $i <= $#$exons; $i++){
if($$exons[$i]->start < $start){
my $utr = $$exons[$i]->copy;
$utr->set_type($type);
if($start < $utr->stop){
$utr->set_stop($start);
}
$tx->add_feature($utr);
}
else{
last;
}
}
}
if($stop != -1){
my $type = "3UTR";
if($tx->strand eq "-"){
$type = "5UTR";
}
for(my $i = $#$exons; $i >= 0; $i--){
if($$exons[$i]->stop > $stop){
my $utr = $$exons[$i]->copy;
$utr->set_type($type);
if($stop > $utr->start){
$utr->set_start($stop);
}
$tx->add_feature($utr);
}
else{
last;
}
}
}
}
sub exons{
my ($tx) = @_;
$tx->_update;
return $tx->{Exons};
}
sub cds{
my ($tx) = @_;
$tx->_update;
return $tx->{CDS};
}
sub utr3{
my ($tx) = @_;
$tx->_update;
return $tx->{UTR3};
}
sub utr5{
my ($tx) = @_;
$tx->_update;
return $tx->{UTR5};
}
sub introns{
my ($tx) = @_;
$tx->_update;
return $tx->{Introns};
}
sub start_codons{
my ($tx) = @_;
$tx->_update;
return $tx->{Starts};
}
sub stop_codons{
my ($tx) = @_;
$tx->_update;
return $tx->{Stops};
}
sub sec{
my ($tx) = @_;
$tx->_update;
return $tx->{SEC};
}
sub intron_cns{
my ($tx) = @_;
$tx->_update;
return $tx->{Intron_CNS};
}
sub inter_cns{
my ($tx) = @_;
$tx->_update;
return $tx->{Inter_CNS};
}
sub inter{
my ($tx) = @_;
$tx->_update;
return $tx->{Inter};
}
sub seqname{
my ($tx) = @_;
my $gene = $tx->gene;
return $gene->seqname;
}
sub source{
my ($tx) = @_;
my $gene = $tx->gene;
return $gene->source;
}
sub strand{
my ($tx) = @_;
my $gene = $tx->gene;
return $gene->strand;
}
sub set_id{
my ($tx,$id) = @_;
$tx->{Id} = $id;
}
sub id{
my ($tx) = @_;
return $tx->{Id};
}
sub gene_id{
my ($tx) = @_;
my $gene = $tx->gene;
return $gene->{Id};
}
sub gene{
my ($tx) = @_;
return $tx->{Gene};
}
# GTF::Transcript::remove_exon(exon start pos)
# Allows you to remove an exon from the list
# Removes the all exons at the specified position
sub remove_exon{
my ($tx, $pos) = @_;
my $new_exons = [];
my $removed = 0;
foreach my $exon (@{$tx->{Exons}}){
if($exon->start != $pos){
push @$new_exons, $exon;
}
else{
$removed++;
}
}
$tx->{Exons} = $new_exons;
return $removed;
}
# GTF::Transcript::remove_cds(cds start pos)
# Allows you to remove a cds from the list
# Removes the all cdss at the specified position
sub remove_cds{
my ($tx, $pos) = @_;
my $new_cds = [];
my $removed = 0;
foreach my $cds (@{$tx->{CDS}}){
if($cds->start != $pos){
push @$new_cds, $cds;
}
else{
$removed++;
}
}
$tx->{CDS} = $new_cds;
return $removed;
}
# GTF::Transcript::length()
# This function return the length of the tx from the start to the stop.
sub length{
my ($tx) = @_;
return($tx->stop - $tx->start + 1);
}
# GTF::Transcript::start()
# This function returns the start of this tx (lowest coord).
sub start{
my ($tx) = @_;
$tx->_update;
return $tx->{Start};
}
# GTF::Transcript::stop()
# This function return the end of the tx (highest coord).
sub stop{
my ($tx) = @_;
$tx->_update;
return $tx->{Stop};
}
# GTF::Transcript::coding_start()
# This function returns the start position (lowest coord) of coding region of
# this tx. Same as the start codon's <start> field.
sub coding_start{
my ($tx) = @_;
$tx->_update;
return $tx->{Coding_Start};
}
# GTF::Transcript::coding_stop()
# This function return the end position (highest coord) of the coding region of
# the tx. Same as the stop codon's <stop> field.
sub coding_stop{
my ($tx) = @_;
$tx->_update;
return $tx->{Coding_Stop};
}
# GTF::Transcript::coding_length()
# This function return the coding_length of the tx from the coding_start
# to the coding_stop.
sub coding_length{
my ($tx) = @_;
$tx->_update;
return $tx->{Coding_Length};
}
sub score{
my ($tx) = @_;
my $features = $tx->_all_features;
my $score = 0;
foreach my $feature (@$features){
if($feature->score ne "."){
$score += $feature->score;
}
}
return $score;
}
# GTF::Transcript::initial_exon()
# This function returns a CDS::Feature object representing the
# initial_exon (cds) of this tx
# NOTE: Isn't exon the wrong word for this?
sub initial_exon{
my ($tx) = @_;
my $starts = $tx->start_codons;
if($#$starts == -1){
return 0;
}
$tx->_update;
my $exons = $tx->{CDS};
if($tx->strand eq "-"){
return $$exons[$#{$exons}];
}
else{
return $$exons[0];
}
}
# GTF::Transcript::terminal_exon()
# This function returns a CDS::Feature object representing the
# terminal exon (cds) of this tx
# NOTE: Isn't exon the wrong word for this?
sub terminal_exon{
my ($tx) = @_;
my $stops = $tx->stop_codons;
if($#$stops == -1){
return 0;
}
$tx->_update;
my $exons = $tx->cds;
if($tx->strand eq "-"){
return $$exons[0];
}
else{
return $$exons[$#{$exons}];
}
}
sub equals{
my ($tx,$compare, $mode) = @_;
$mode = "Full" if ($#_ == 1); # default
unless($tx && $compare){
return 0;
}
unless(($tx->start == $compare->start) &&
($tx->stop == $compare->stop) &&
($tx->strand eq $compare->strand)){
return 0;
}
if (($mode eq "CDS") || ($mode eq "Full")) {
my $tx_cds = $tx->cds;
my $comp_cds = $compare->cds;
unless($#$tx_cds == $#$comp_cds){
return 0;
}
for(my $i = 0;$i <= $#$tx_cds;$i++){
unless($$tx_cds[$i]->equals($$comp_cds[$i])){
return 0;
}
}
my $tx_starts = $tx->start_codons;
my $comp_starts = $compare->start_codons;
unless($#$tx_starts == $#$comp_starts){
return 0;
}
for(my $i = 0;$i <= $#$tx_starts;$i++){
unless($$tx_starts[$i]->equals($$comp_starts[$i])){
return 0;
}
}
my $tx_stops = $tx->stop_codons;
my $comp_stops = $compare->stop_codons;
unless($#$tx_stops == $#$comp_stops){
return 0;
}
for(my $i = 0;$i <= $#$tx_stops;$i++){
unless($$tx_stops[$i]->equals($$comp_stops[$i])){
return 0;
}
}
}
if (($mode eq "3UTR") || ($mode eq "Full")) {
my $tx_utr3 = $tx->utr3;
my $comp_utr3 = $compare->utr3;
unless($#$tx_utr3 == $#$comp_utr3){
return 0;
}
for(my $i = 0;$i <= $#$tx_utr3;$i++){
unless($$tx_utr3[$i]->equals($$comp_utr3[$i])){
return 0;
}
}
}
if (($mode eq "5UTR") || ($mode eq "Full")) {
my $tx_utr5 = $tx->utr5;
my $comp_utr5 = $compare->utr5;
unless($#$tx_utr5 == $#$comp_utr5){
return 0;
}
for(my $i = 0;$i <= $#$tx_utr5;$i++){
unless($$tx_utr5[$i]->equals($$comp_utr5[$i])){
return 0;
}
}
}
my $tx_exon = $tx->exons;
my $comp_exon = $compare->exons;
unless($#$tx_exon == $#$comp_exon){
return 0;
}
for(my $i = 0;$i <= $#$tx_exon;$i++){
unless($$tx_exon[$i]->equals($$comp_exon[$i])){
return 0;
}
}
my $tx_sec = $tx->sec;
my $comp_sec = $compare->sec;
unless($#$tx_sec == $#$comp_sec){
return 0;
}
for(my $i = 0;$i <= $#$tx_sec;$i++){
unless($$tx_sec[$i]->equals($$comp_sec[$i])){
return 0;
}
}
return 1;
}
# GTF::Transcript::offset(ammount)
# This function will offset all locations in this tx by the given ammount
sub offset{
my ($tx,$offset) = @_;
unless(defined($offset)){
print STDERR "Undefined value passed to GTF::Gene::offset.\n";
return;
}
my $features = $tx->_all_features;
foreach my $feature (@$features){
if(defined($feature)){
$feature->offset($offset);
}
}
$tx->{Modified} = 1;
if($tx->{Gene} != -1){ $tx->{Gene}->{Modified} = 1;}
}
# GTF::Transcript::output_gtf([file_handle])
# This function outputs the information for this tx in standard
# gtf2 format. If takes and optional file_handle as the only argument
# to which it outputs the data. If no argument is given it outputs
# the data to stdout.
sub output_gtf{
my ($tx,$out_handle) = @_;
unless(defined $out_handle){
$out_handle = \*STDOUT;
}
my @features =
sort {$a->start <=> $b->start || $a->stop <=> $b->stop} @{$tx->_all_features};
foreach my $feature (@features){
$feature->output_gtf($out_handle);
}
}
# GTF::Transcript::output_gff([file_handle])
# This function outputs the information for this tx in standard
# GFF format. If takes and optional file_handle as the only argument
# to which it outputs the data. If no argument is given it outputs
# the data to stdout.
sub output_gff{
my ($tx,$out_handle) = @_;
unless(defined $out_handle){
$out_handle = \*STDOUT;
}
my @features =
sort {$a->start <=> $b->start || $a->stop <=> $b->stop} @{$tx->_all_features};
foreach my $feature (@features){
$feature->output_gff($out_handle);
}
}
# GTF::Transcript::reverse_complement(seq_length)
# This function takes the length of the sequence this tx came from and
# reverse complements everything in the gene.
sub reverse_complement{
my ($tx, $seq_length) = @_;
unless(defined($seq_length)){
print STDERR
"Undefined value for seq_length passed to GTF::Gene::reverse_complement.\n";
}
my $features = $tx->_all_features;
foreach my $feature (@$features){
$feature->reverse_complement($seq_length);
}
}
sub _all_features{
my ($tx) = @_;
my @features;
my $starts = $tx->start_codons;
push @features, @$starts;
my $stops = $tx->stop_codons;
push @features, @$stops;
my $exons = $tx->exons;
push @features, @$exons;
my $cds = $tx->cds;
push @features, @$cds;
my $utr3 = $tx->utr3;
push @features, @$utr3;
my $utr5 = $tx->utr5;
push @features, @$utr5;
my $sec = $tx->sec;
push @features, @$sec;
my $cns = $tx->intron_cns;
push @features, @$cns;
return \@features;
}
sub copy{
my ($tx) = @_;
my $copy = GTF::Transcript::new($tx->id);
my $starts = $tx->start_codons;
foreach my $old (@$starts){
my $new = $old->copy;
$copy->add_feature($new);
}
my $stops = $tx->stop_codons;
foreach my $old (@$stops){
my $new = $old->copy;
$copy->add_feature($new);
}
my $exons = $tx->exons;
foreach my $old (@$exons){
my $new = $old->copy;
$copy->add_feature($new);
}
my $cds = $tx->cds;
foreach my $old (@$cds){
my $new = $old->copy;
$copy->add_feature($new);
}
my $utr3 = $tx->utr3;
foreach my $old (@$utr3){
my $new = $old->copy;
$copy->add_feature($new);
}
my $utr5 = $tx->utr5;
foreach my $old (@$utr5){
my $new = $old->copy;
$copy->add_feature($new);
}
my $sec = $tx->sec;
foreach my $old (@$sec){
my $new = $old->copy;
$copy->add_feature($new);
}
return($copy);
}
sub gc_percentage{
my ($tx) = @_;
my $features = $tx->_all_features;
my $total = 0;
my $percent = 0;
foreach my $feature (@$features){
$percent += $feature->length * $feature->gc_percentage;
$total += $feature->length;
}
$percent /= $total;
return $percent;
}
sub match_percentage{
my ($tx) = @_;
my $features = $tx->_all_features;
my $total = 0;
my $percent = 0;
foreach my $feature (@$features){
$percent += $feature->length * $feature->match_percentage;
$total += $feature->length;
}
$percent /= $total;
return $percent;
}
sub mismatch_percentage{
my ($tx) = @_;
my $features = $tx->_all_features;
my $total = 0;
my $percent = 0;
foreach my $feature (@$features){
$percent += $feature->length * $feature->mismatch_percentage;
$total += $feature->length;
}
$percent /= $total;
return $percent;
}
sub unaligned_percentage{
my ($tx) = @_;
my $features = $tx->_all_features;
my $total = 0;
my $percent = 0;
foreach my $feature (@$features){
$percent += $feature->length * $feature->unaligned_percentage;
$total += $feature->length;
}
$percent /= $total;
return $percent;
}
sub tag{
my ($tx) = @_;
return $tx->{Tag};
}
sub set_tag{
my ($tx,$tag) = @_;
$tx->{Tag} = $tag;
}
sub _update{
my ($tx) = @_;
unless($tx->{Modified}){
return;
}
$tx->{Modified} = 0;
my $starts = $tx->start_codons;
my $stops = $tx->stop_codons;
my $cds = $tx->cds;
my $utr5 = $tx->utr5;
my $utr3 = $tx->utr3;
my $exons = $tx->exons;
foreach my $utr (@$utr5){
$utr->set_subtype("UTR5");
}
foreach my $utr (@$utr3){
$utr->set_subtype("UTR3");
}
#set subtypes
if($#$cds == 0){
if(($#$starts >= 0) && ($#$stops >= 0)){
$$cds[0]->set_subtype("Single");
}
elsif($#$starts >= 0){
$$cds[0]->set_subtype("Initial");
}
elsif($#$stops >= 0){
$$cds[0]->set_subtype("Terminal");
}
else{
$$cds[0]->set_subtype("Internal");
}
}
elsif($#$cds >= 0){
my $strand = $tx->strand;
my $starts = $tx->start_codons;
my $stops = $tx->stop_codons;
my $start = 0;
my $stop = $#$cds;
if($strand eq '-'){
if($#$starts >= 0){
$$cds[$#$cds]->set_subtype("Initial");
$stop--;
}
if($#$stops >= 0){
$$cds[0]->set_subtype("Terminal");
$start++;
}
}
else{
if($#$starts >= 0){
$$cds[0]->set_subtype("Initial");
$start++;
}
if($#$stops >= 0){
$$cds[$#$cds]->set_subtype("Terminal");
$stop--;
}
}
for(my $i = $start;$i <= $stop;$i++){
$$cds[$i]->set_subtype("Internal");
}
}
#get introns and coding length
my $clen = 0;
my @introns;
for(my $i = 0;$i < $#$cds;$i++){
$clen += $$cds[$i]->stop - $$cds[$i]->start +1;
my $start = $$cds[$i]->stop + 1;
my $stop = $$cds[$i+1]->start - 1;
if($start <= $stop){
my $intron = GTF::Feature::new("Intron",$start,$stop,0,'.');
$intron->set_subtype("Intron");
$intron->set_transcript($tx);
push @introns, $intron;
}
}
if($#$cds >= 0){
$clen += $$cds[$#$cds]->stop - $$cds[$#$cds]->start +1;
}
$tx->{Coding_Length} = $clen;
$tx->{Introns} = \@introns;
#get start/coding start
my $start = -1;
if(($#{$cds} >= 0) &&
(($start == -1) || ($start > $$cds[0]->start))){
$start = $$cds[0]->start;
}
$tx->{Coding_Start} = $start;
if(($#{$starts} >= 0) &&
(($start == -1) || ($start > $$starts[0]->start))){
$start = $$starts[0]->start;
}
if(($#{$stops} >= 0) &&
(($start == -1) || ($start > $$stops[0]->start))){
$start = $$stops[0]->start;
}
if(($#{$exons} >= 0) &&
(($start == -1) || ($start > $$exons[0]->start))){
$start = $$exons[0]->start;
}
if(($#{$utr5} >= 0) &&
(($start == -1) || ($start > $$utr5[0]->start))){
$start = $$utr5[0]->start;
}
if(($#{$utr3} >= 0) &&
(($start == -1) || ($start > $$utr3[0]->start))){
$start = $$utr3[0]->start;
}
$tx->{Start} = $start;
#get stop/coding stop
my $stop = -1;
if(($#{$cds} >= 0) &&
(($stop == -1) || ($stop < $$cds[$#$cds]->stop))){
$stop = $$cds[$#$cds]->stop;
}
$tx->{Coding_Stop} = $stop;
if(($#{$starts} >= 0) &&
(($stop == -1) || ($stop < $$starts[$#$starts]->stop))){
$stop = $$starts[$#$starts]->stop;
}
if(($#{$stops} >= 0) &&
(($stop == -1) || ($stop < $$stops[$#$stops]->stop))){
$stop = $$stops[$#$stops]->stop;
}
if(($#{$exons} >= 0) &&
(($stop == -1) || ($stop < $$exons[$#$exons]->stop))){
$stop = $$exons[$#$exons]->stop;
}
if(($#{$utr5} >= 0) &&
(($stop == -1) || ($stop < $$utr5[$#$utr5]->stop))){
$stop = $$utr5[$#$utr5]->stop;
}
if(($#{$utr3} >= 0) &&
(($stop == -1) || ($stop < $$utr3[$#$utr3]->stop))){
$stop = $$utr3[$#$utr3]->stop;
}
$tx->{Stop} = $stop;
}
###############################################################################
# GTF::Feature
###############################################################################
# The GTF::Feature object stores all data for a single feature of a gtf
# file. This object basically stores all data for a single non-comment
# line of a gtf file.
package GTF::Feature;
# GTF::Feature::new(feature, start, end, score, frame)
# This is the constructor for GTF::Feature objects. It takes 5 required
# arguments. They are:
# type - The <feature> field of the line. Should be either
# 'cds', 'exon', 'start_codon', or 'stop_codon'.
# start - The <start> field of the line. Should be a number.
# end - The <end> field of the line. Should be a number.
# score - The <score> field of the line. Should be a number or '.'.
# frame - The <frame> field of the line. Should be either
# '0', '1', '2', or '.'.
sub new {
my $feature = bless {};
($feature->{Type}, $feature->{Start}, $feature->{End}, $feature->{Score},
$feature->{Frame}) = @_;
my $start = $feature->{Start};
my $stop = $feature->{Stop};
if(defined($start) && defined($stop) && ($start > $stop)){
$feature->{Start} = $stop;
$feature->{Stop} = $start;
}
$feature->{Transcript} = 0;
$feature->{Seq} = 0;
$feature->{Conseq} = 0;
$feature->{Subtype} = 0;
$feature->{ASE} = 0; # rpz - Alternative Splicing Event
$feature->{Match} = -1;
$feature->{Mismatch} = -1;
$feature->{Unaligned} = -1;
$feature->{GC} = -1;
$feature->{A} = -1;
$feature->{C} = -1;
$feature->{G} = -1;
$feature->{T} = -1;
$feature->{N} = -1;
$feature->{0} = -1;
$feature->{1} = -1;
$feature->{2} = -1;
return $feature;
}
# GTF::Feature::copy()
# This function returns a new object which is a copy of this object
sub copy{
my ($feature) = @_;
my $copy = GTF::Feature::new($feature->type,$feature->start,$feature->stop,
$feature->score,$feature->frame);
$copy->set_bases($feature->get_a_count,$feature->get_c_count,$feature->get_g_count,
$feature->get_t_count,$feature->get_n_count);
$copy->set_conseq($feature->get_match_count,$feature->get_mismatch_count,
$feature->get_unaligned_count);
return($copy);
}
# GTF::Feature::offset(ammount)
# This function will offset all locations in this Feature by the given ammount
sub offset{
my ($feature,$offset) = @_;
unless(defined($offset)){
print STDERR "Undefined value passed to GTF::Feature::offset.\n";
return;
}
$feature->{Start} += $offset;
$feature->{End} += $offset;
}
# GTF::Feature::reverse_complement(seq_length)
# This function takes the length of the sequence this feature came from and
# reverse complements it.
sub reverse_complement{
my ($feature, $seq_length) = @_;
unless(defined($seq_length)){
print STDERR
"Undefined value for seq_length passed to ",
"GTF::Feature::reverse_complement.\n";
}
my $start = $seq_length - $feature->{Start} + 1;
my $stop = $seq_length - $feature->{End} + 1;
if($start < $stop){
$feature->{Start} = $start;
$feature->{End} = $stop;
}
else{
$feature->{Start} = $stop;
$feature->{End} = $start;
}
}
# GTF::Feature::type()
# This function returns the feature type, <feature> field of this line as
# a string. Should be either 'CDS', 'exon', 'start_codon', or 'stop_codon'.
sub type {shift->{Type}}
sub set_type{
my ($feature,$type) = @_;
$feature->{Type} = $type;
}
# GTF::Feature::type()
# This function returns the feature sub type, currently inly implemented for
# cds, should be 'Initial', 'Internal', 'Terminal', or 'Single'
sub subtype {shift->{Subtype}}
sub set_subtype{
my ($feature,$subtype) = @_;
$feature->{Subtype} = $subtype;
}
# GTF::Feature::ase()
# This function returns the feature's alternative splicing event,
# if there is one. Currently the only implemented ASE is 'Optional'
# rpz
sub ase {shift->{ASE}}
sub set_ase {
my ($feature, $ASE) = @_;
$feature->{ASE} = $ASE;
}
# GTF::Feature::length()
# This function returns the length of this feature in base pairs
# as a number.
sub length{
my ($feature) = @_;
return($feature->stop - $feature->start + 1);
}
# GTF::Feature::start()
# This function returns the start location ,<start> field, of this feature
# as a number.
sub start {shift->{Start}}
# GTF::Feature::set_start()
# This function sets the start location ,<start> field, of this feature
# as a number.
sub set_start{
my ($feature,$start) = @_;
$feature->{Start} = $start;
}
# GTF::Feature::end()
# This function returns the end location, <end> field, of this feature
# as a number.
sub end {shift->{End}}
# GTF::Feature::stop()
# This function returns the end location, <end> field, of this feature
# as a number.
# Same as GTF::Feature::end();
sub stop {shift->{End}}
# GTF::Feature::set_stop()
# This function sets the stop location ,<stop> field, of this feature
# as a number.
sub set_stop{
my ($feature,$stop) = @_;
$feature->{End} = $stop;
}
# GTF::Feature::score()
# This function returns the score, <score> field, of this line. This will
# be either a number or '.'.
sub score {shift->{Score}}
# GTF::Feature::frame()
# This function returns the frame, <frame> field, of this line. Should be
# '0', '1', '2', or '.'.
sub frame {shift->{Frame}}
# GTF::Feature::set_frame()
# This function sets the frame, <frame> field, of this line. Should be
# '0', '1', '2', or '.'.
sub set_frame{
my ($feature,$frame) = @_;
if(($frame eq '0') || ($frame eq '1') ||
($frame eq '2') || ($frame eq '.')){
$feature->{Frame} = $frame;
}
else{
print STDERR "Bad Frame Value \'$frame\' should be \'0\', \'1\',".
"\'2\', or \'.\'.\n";
}
}
# GTF::Feature::transcript_id()
# This function returns the transcript_id of this exon's transcript
sub transcript_id{
my ($feature) = @_;
my $tx = $feature->{Transcript};
return $tx->id;
}
# GTF::Feature::transcript()
# This function returns the transcript of this exon's transcript
sub transcript{
my ($feature) = @_;
return $feature->{Transcript};
}
# GTF::Feature::transcript()
# This function returns the transcript of this exon's transcript
sub set_transcript{
my ($feature,$tx) = @_;
$feature->{Transcript} = $tx;
}
# GTF::Feature::transcript_id()
# This function returns the transcript_id of this exon's transcript
sub gene_id{
my ($feature) = @_;
my $tx = $feature->transcript;
my $gene = $tx->gene;
return $gene->id;
}
# GTF::Feature::transcript()
# This function returns the transcript of this exon's transcript
sub gene{
my ($feature) = @_;
my $tx = $feature->{Transcript};
return $tx->gene;
}
# GTF::Feature::seqname()
# This function returns the sequnce name, <seqname> field, that this
# feature came from as a string.
sub seqname{
my ($feature) = @_;
my $tx = $feature->{Transcript};
return $tx->seqname;
}
# GTF::Feature::source()
# This function returns the source, <source> field, of this line as a
# string.
sub source{
my ($feature) = @_;
my $tx = $feature->{Transcript};
return $tx->source;
}
# GTF::Feature::strand()
# This function returns the strand, <strand> field, of this line as a
# string. Should be '+', '-', or '.'.
sub strand{
my ($feature) = @_;
my $tx = $feature->{Transcript};
return $tx->strand;
}
sub equals{
my ($feature,$compare) = @_;
unless($feature && $compare){
return 0;
}
if(($feature->start == $compare->start) &&
($feature->stop == $compare->stop) &&
($feature->strand eq $compare->strand) &&
($feature->type eq $compare->type)){
return 1;
}
return 0;
}
# GTF::Feature::output_gtf([file_handle])
# This function outputs the information for this feature in standard
# gtf2 format. If takes and optional file_handle as the only argument
# to which it outputs the data. If no argument is given it outputs
# the data to stdout.
sub output_gtf{
my ($feature,$out_handle) = @_;
unless(defined $out_handle){
$out_handle = \*STDOUT;
}
print $out_handle
"".$feature->seqname."\t".$feature->source."\t$feature->{Type}\t",
"$feature->{Start}\t$feature->{End}\t$feature->{Score}\t".$feature->strand."\t",
"$feature->{Frame}\tgene_id \"".$feature->gene_id."\"; transcript_id \"",
$feature->transcript_id."\";\n";
}
# GTF::Feature::output_gff([file_handle])
# This function outputs the information for this feature in standard
# GFF format. If takes and optional file_handle as the only argument
# to which it outputs the data. If no argument is given it outputs
# the data to stdout.
sub output_gff{
my ($feature,$out_handle) = @_;
unless(defined $out_handle){
$out_handle = \*STDOUT;
}
print $out_handle
"".$feature->seqname."\t".$feature->source."\t$feature->{Type}\t",
"$feature->{Start}\t$feature->{End}\t$feature->{Score}\t".$feature->strand."\t",
"$feature->{Frame}\t".$feature->transcript_id."\n";
}
# GTF::Feature::set_bases(A,C,G,T,N)
# This funtion allows you to set the number of A,C,G,T nucleotides
# in this feature
sub set_bases{
my ($feature,$a,$c,$g,$t,$n) = @_;
$feature->{A} = $a;
$feature->{C} = $c;
$feature->{G} = $g;
$feature->{T} = $t;
$feature->{N} = $n;
$feature->{Seq} = 1;
}
# GTF::Feature::set_conseq(n0,n1,n2)
# This funtion allows you to set the number of 0,1,2 in the
# conservation sequnce for this feature
sub set_conseq{
my ($feature,$n0,$n1,$n2) = @_;
$feature->{0} = $n0;
$feature->{1} = $n1;
$feature->{2} = $n2;
$feature->{Conseq} = 1;
}
# GTF::Feature::gc_percentage()
# This funciton returns the gc_percentage for this feature if the
# sequence was loaded otherwise it returns -1.
sub gc_percentage{
my ($feature) = @_;
if($feature->{GC} != -1){
return $feature->{GC};
}
unless($feature->{Seq}){
return -1;
}
my $a = $feature->{A};
my $c = $feature->{C};
my $g = $feature->{G};
my $t = $feature->{T};
my $n = $feature->{N};
my $total = $a + $c + $g + $t + $n;
my $gc = $g + $c;
my $percent = $gc/$total;
$feature->{GC} = $percent;
return $percent;
}
# GTF::Feature::match_percentage()
# This funciton returns the percentage of matches in the conservation sequence
# for this feature if it was loaded otherwise it returns -1.
sub match_percentage{
my ($feature) = @_;
if($feature->{Match} != -1){
return $feature->{Match};
}
unless($feature->{Conseq}){
return -1;
}
my $match = $feature->{1};
my $mismatch = $feature->{0};
my $unaligned = $feature->{2};
my $total = $match + $mismatch + $unaligned;
my $percent = $match/$total;
$feature->{Match} = $percent;
return $percent;
}
# GTF::Feature::mismatch_percentage()
# This funciton returns the percentage of mismatches in the conservation sequence
# for this feature if it was loaded otherwise it returns -1.
sub mismatch_percentage{
my ($feature) = @_;
if($feature->{Mismatch} != -1){
return $feature->{Mismatch};
}
unless($feature->{Conseq}){
return -1;
}
my $match = $feature->{1};
my $mismatch = $feature->{0};
my $unaligned = $feature->{2};
my $total = $match + $mismatch + $unaligned;
my $percent = $mismatch/$total;
$feature->{Mismatch} = $percent;
return $percent;
}
# GTF::Feature::unaligned_percentage()
# This funciton returns the percentage of unaligned nucs in the conservation sequence
# for this feature if it was loaded otherwise it returns -1.
sub unaligned_percentage{
my ($feature) = @_;
if($feature->{Unaligned} != -1){
return $feature->{Unaligned};
}
unless($feature->{Conseq}){
return -1;
}
my $match = $feature->{1};
my $mismatch = $feature->{0};
my $unaligned = $feature->{2};
my $total = $match + $mismatch + $unaligned;
my $percent = $unaligned/$total;
$feature->{Unaligned} = $percent;
return $percent;
}
# GTF::Feature::get_a_count()
# This funciton returns the number of A nucleotides in this features sequence
# if the sequence was loaded, otherwise it returns -1.
sub get_a_count{
my ($feature) = @_;
unless($feature->{Seq}){
return -1;
}
return $feature->{A};
}
# GTF::Feature::get_c_count()
# This funciton returns the number of C nucleotides in this features sequence
# if the sequence was loaded, otherwise it returns -1.
sub get_c_count{
my ($feature) = @_;
unless($feature->{Seq}){
return -1;
}
return $feature->{C};
}
# GTF::Feature::get_g_count()
# This funciton returns the number of G nucleotides in this features sequence
# if the sequence was loaded, otherwise it returns -1.
sub get_g_count{
my ($feature) = @_;
unless($feature->{Seq}){
return -1;
}
return $feature->{G};
}
# GTF::Feature::get_t_count()
# This funciton returns the number of T nucleotides in this features sequence
# if the sequence was loaded, otherwise it returns -1.
sub get_t_count{
my ($feature) = @_;
unless($feature->{Seq}){
return -1;
}
return $feature->{T};
}
# GTF::Feature::get_n_count()
# This funciton returns the number of N nucleotides in this features sequence
# if the sequence was loaded, otherwise it returns -1.
sub get_n_count{
my ($feature) = @_;
unless($feature->{Seq}){
return -1;
}
return $feature->{N};
}
# GTF::Feature::get_match_count()
# This funciton returns the number of matched nucleotides in this features
# conservation sequence if it was loaded, otherwise it returns -1.
sub get_match_count{
my ($feature) = @_;
unless($feature->{Conseq}){
return -1;
}
return $feature->{1};
}
# GTF::Feature::get_mismatch_count()
# This funciton returns the number of mismatched nucleotides in this features
# conservation sequence if it was loaded, otherwise it returns -1.
sub get_mismatch_count{
my ($feature) = @_;
unless($feature->{Conseq}){
return -1;
}
return $feature->{0};
}
# GTF::Feature::get_unaligned_count()
# This funciton returns the number of unaligned nucleotides in this features
# conservation sequence if it was loaded, otherwise it returns -1.
sub get_unaligned_count{
my ($feature) = @_;
unless($feature->{Conseq}){
return -1;
}
return $feature->{2};
}
sub tag{
my ($feature) = @_;
return $feature->{Tag};
}
sub set_tag{
my ($feature,$tag) = @_;
$feature->{Tag} = $tag;
}
1;
__END__