#!/usr/bin/env perl use strict; use warnings; use FindBin; use lib ("$FindBin::RealBin/../../PerlLib/"); use Ascii_genome_illustrator; use Cwd; my $usage = "usage: $0 refseqs.fa target.fa min_per_id [min_pct_aligned]\n\n"; my $REFSEQS_FA = $ARGV[0] or die $usage; my $TARGET_FA = $ARGV[1] or die $usage; my $MIN_PER_ID = $ARGV[2] or die $usage; my $MIN_PCT_ALIGNED = $ARGV[3] || 0; main: { my $cmd = "blat $TARGET_FA $REFSEQS_FA -t=dna -q=dna -out=blast9 $TARGET_FA.blat.out"; &process_cmd($cmd); # tried blastn, but it doesnt like the Trinity.fasta header format... #my $cmd = "makeblastdb -in trinity_out_dir/Trinity.fasta -dbtype nucl"; #$cmd = " blastn -db trinity_out_dir/Trinity.fasta -query refseqs.fa -outfmt 6 > blast.blastn.out"; #&process_cmd($cmd); &generate_ascii_illustration("$TARGET_FA.blat.out"); exit(0); } #### sub generate_ascii_illustration { my ($align_out_file) = @_; my %ref_seq_lengths = &get_seq_lengths($REFSEQS_FA); my %target_seq_lengths = &get_seq_lengths($TARGET_FA); my @hits = &parse_align_out($align_out_file); ## Generate a reference view. foreach my $ref_acc (keys %ref_seq_lengths) { my $length = $ref_seq_lengths{$ref_acc}; my $ascii_illustration = new Ascii_genome_illustrator($ref_acc, 60); #$ascii_illustration->add_feature($ref_acc, 1, $length, "="); my @matches = grep { $_->{ref_acc} eq $ref_acc } @hits; #@matches = reverse sort {$a->{bitscore}<=>$b->{bitscore}} @matches; @matches = sort {$a->{ref_start}<=>$b->{ref_start}} @matches; foreach my $match (@matches) { my $target_acc = $match->{target_acc}; my $target_start = $match->{target_start}; my $target_end = $match->{target_end}; my $per_id = $match->{per_id}; if ($per_id < $MIN_PER_ID) { next; } my $ref_start = $match->{ref_start}; my $ref_end = $match->{ref_end}; my $target_length = $target_seq_lengths{$target_acc}; my $pct_of_target_aligned = sprintf("%.2f", (abs($target_end-$target_start) + 1) / $target_length * 100); if ($pct_of_target_aligned < $MIN_PCT_ALIGNED) { next; } my $target_feature_name = "$target_acc $target_start-$target_end:$target_length ($pct_of_target_aligned\% aln, $per_id\% ID)"; if ($target_start > $target_end) { ($ref_start, $ref_end) = ($ref_end, $ref_start); } $ascii_illustration->add_feature($target_feature_name, $ref_start, $ref_end, "-"); } print $ascii_illustration->illustrate(1, $length); print "\n"; # spacer } return; } #### sub parse_align_out { my ($align_out_file) = @_; my @alignments; open (my $fh, $align_out_file) or die $!; while (<$fh>) { if (/^\#/) { next; } chomp; =blast_outfmt6 # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score 0 NM_012009_Sh2d1b1 1 comp0_c0_seq1 2 100.00 3 1433 4 0 5 0 6 1 7 1433 8 1 9 1433 10 0.0 11 2647 =cut my @x = split(/\t/); my ($ref_acc, $target_acc, $per_id, $align_len, $mismatches, $gaps, $ref_start, $ref_end, $target_start, $target_end, $evalue, $bitscore) = @x; my $struct = { target_acc => $target_acc, ref_acc => $ref_acc, per_id => $per_id, mismatches => $mismatches, gaps => $gaps, target_start => $target_start, target_end => $target_end, ref_start => $ref_start, ref_end => $ref_end, evalue => $evalue, bitscore => $bitscore, }; push (@alignments, $struct); } return(@alignments); } #### sub get_seq_lengths { my ($refseqs_fa) = @_; my %lengths; my $acc; open (my $fh, $refseqs_fa) or die $!; while (<$fh>) { if (/>(\S+)/) { $acc = $1; } else { my $seq = $_; chomp $seq; my $len = length($seq); $lengths{$acc} += $len; } } close $fh; return(%lengths); } #### sub process_cmd { my ($cmd) = @_; print STDERR "CMD: $cmd\n"; my $ret = system($cmd); if ($ret) { die "Error, cmd: $cmd died with ret $ret"; } return; }