biyelunwen/99.scripts/trinity_utils/util/misc/process_minimap2_alignments.pl

184 lines
4.1 KiB
Perl

#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
use File::Basename;
use Cwd;
use Carp;
use Getopt::Long qw(:config no_ignore_case bundling pass_through);
my $usage = <<__EOUSAGE__;
######################################################################
#
# Required:
# --genome <string> target genome to align to
# --transcripts <string> cdna sequences to align
#
# Optional:
# --gtf <string> gene structure annotations in gtf format
# --CPU <int> number of threads (default: 2)
# -o|--output <string> bam output filename (default: basename(transcripts).mm2.bam)
#
# -I|--max_intron_length <int> maximum intron length (default: 100000)
#
# --incl_out_gff3 include gff3 formatted output file for alignments.
# --allow_secondary allow secondary alignments (default secondary=no)
#
# --eqx include --eqx flag
# --cs include long format via --cs w/ minimap2
# --hq pacbio CCS reads (--splice:hq for mm2)
#
#######################################################################
__EOUSAGE__
;
my $genome;
my $transcripts;
my $gtf;
my $CPU = 2;
my $help_flag;
my $output;
my $max_intron_length = 100000;
my $incl_out_gff3;
my $allow_secondary = 0;
my $include_cs_flag = 0;
my $include_eqx_flag = 0;
my $include_hq_flag = 0;
&GetOptions( 'h' => \$help_flag,
'genome=s' => \$genome,
'transcripts=s' => \$transcripts,
'gtf=s' => \$gtf,
'CPU=i' => \$CPU,
'o|output=s' => \$output,
'I|max_intron_length=i' => \$max_intron_length,
'incl_out_gff3' => \$incl_out_gff3,
'allow_secondary' => \$allow_secondary,
'cs' => \$include_cs_flag,
'hq' => \$include_hq_flag,
);
if ($help_flag) {
die $usage;
}
unless ($genome && $transcripts) {
die $usage;
}
my $include_cs_param = "";
my $include_cs_token = "";
if ($include_cs_flag) {
$include_cs_param = "--cs";
$include_cs_token = ".cs";
}
my $include_eqx_param = "";
my $include_eqx_token = "";
if ($include_eqx_flag) {
$include_eqx_param = "--eqx";
$include_eqx_token = ".eqx";
}
my $include_hq_param = "";
my $include_hq_token = "";
if ($include_hq_flag) {
$include_hq_param = ":hq";
$include_hq_token = ".hq";
}
unless ($output) {
$output = basename($transcripts) . ".mm2${include_cs_token}${include_eqx_token}${include_hq_token}.bam";
}
main: {
my $genomeBaseDir = dirname($genome);
my $genomeName = basename($genome);
my $mm2_idx = "$genomeBaseDir/$genomeName" . ".mm2";
my $cwd = cwd();
my $splice_file = "$mm2_idx.splice.bed";
unless (-e $mm2_idx) {
my $cmd = "minimap2 -d $mm2_idx $genome";
&process_cmd($cmd);
}
if ($gtf && ! -s $splice_file) {
my $cmd = "paftools.js gff2bed $gtf > $splice_file";
&process_cmd($cmd);
}
## run minimap2
my $splice_param = "";
if ($splice_file) {
$splice_param = "--junc-bed $splice_file";
}
my $secondary = ($allow_secondary) ? "" : "--secondary=no";
my $cmd = "minimap2 --sam-hit-only -ax splice$include_hq_param $splice_param $secondary -t $CPU -u b -G $max_intron_length $include_cs_param $include_eqx_param $mm2_idx $transcripts > $output.tmp.sam";
&process_cmd($cmd);
$cmd = "samtools view -Sb -T $genome $output.tmp.sam -o $output.tmp.unsorted.bam";
&process_cmd($cmd);
$cmd = "samtools sort $output.tmp.unsorted.bam -o $output";
&process_cmd($cmd);
$cmd = "samtools index $output";
#&process_cmd($cmd);
`$cmd`; # ignore error that occurs if file is too big.
if ($incl_out_gff3) {
$cmd = "$FindBin::Bin/SAM_to_gff3.minimap2.pl $output > $output.gff3";
&process_cmd($cmd);
}
unlink("$output.tmp.sam", "$output.tmp.unsorted.bam");
exit(0);
}
####
sub process_cmd {
my ($cmd) = @_;
print STDERR "CMD: $cmd\n";
#return;
my $ret = system($cmd);
if ($ret) {
die "Error, cmd: $cmd died with ret ($ret)";
}
return;
}