222 lines
4.8 KiB
Perl
222 lines
4.8 KiB
Perl
#!/usr/bin/env perl
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
use FindBin;
|
|
use lib("$FindBin::RealBin/../../PerlLib");
|
|
use Pipeliner;
|
|
use File::Basename;
|
|
use Cwd;
|
|
|
|
use Carp;
|
|
use Getopt::Long qw(:config no_ignore_case bundling pass_through);
|
|
|
|
our $HISAT_HOME;
|
|
|
|
BEGIN {
|
|
|
|
if ($ENV{HISAT_HOME}) {
|
|
$HISAT_HOME = $ENV{HISAT_HOME};
|
|
}
|
|
else {
|
|
my $hisat_prog = `sh -c "command -v hisat"`;
|
|
if ($hisat_prog) {
|
|
chomp $hisat_prog;
|
|
$HISAT_HOME = dirname($hisat_prog);
|
|
}
|
|
else {
|
|
die "Error, cannot find hisat in PATH setting";
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
my $usage = <<__EOUSAGE__;
|
|
|
|
######################################################################
|
|
#
|
|
# Required:
|
|
# --genome <string> target genome to align to
|
|
# --reads <string> fastq files. If pairs, indicate both in quotes, ie. "left.fq right.fq"
|
|
#
|
|
# Optional:
|
|
# -N <int> max number of alignments to report. (default: 1)
|
|
# -G <string> GTF file for incorporating reference splice site info.
|
|
# --CPU <int> number of threads (default: 2)
|
|
# --out_prefix <string> output prefix (default: hisat)
|
|
# --run_as_single_reads if paired, run as single reads
|
|
#
|
|
#######################################################################
|
|
|
|
Include additional hisat arguments as additional command line parameters.
|
|
|
|
########################################################################
|
|
|
|
__EOUSAGE__
|
|
|
|
;
|
|
|
|
|
|
my ($genome, $reads);
|
|
|
|
my $CPU = 2;
|
|
|
|
my $help_flag;
|
|
|
|
my $out_prefix = "hisat";
|
|
my $gtf_file;
|
|
my $run_as_single_flag = 0;
|
|
my $num_top_hits = 1;
|
|
|
|
&GetOptions( 'h' => \$help_flag,
|
|
'genome=s' => \$genome,
|
|
'reads=s' => \$reads,
|
|
'CPU=i' => \$CPU,
|
|
'out_prefix=s' => \$out_prefix,
|
|
'G=s' => \$gtf_file,
|
|
'run_as_single_reads' => \$run_as_single_flag,
|
|
'N=i' => \$num_top_hits,
|
|
);
|
|
|
|
|
|
|
|
if ($help_flag) {
|
|
die $usage;
|
|
}
|
|
|
|
unless ($genome && $reads) {
|
|
die $usage;
|
|
}
|
|
|
|
|
|
main: {
|
|
|
|
my $hisat_index = "$genome.hisat.idx";
|
|
if (! -s "$hisat_index.1.bt2") {
|
|
## build hisat index
|
|
|
|
my $cmd = "$HISAT_HOME/hisat-build $genome $hisat_index";
|
|
&process_cmd($cmd);
|
|
}
|
|
|
|
|
|
my $splice_incl = "";
|
|
|
|
if ($gtf_file) {
|
|
|
|
my $gtf_splice = "$gtf_file.hisat.splice";
|
|
|
|
unless (-s $gtf_splice) {
|
|
my $cmd = "$HISAT_HOME/extract_splice_sites.py $gtf_file > $gtf_file.hisat.splice";
|
|
&process_cmd($cmd);
|
|
}
|
|
|
|
$splice_incl = " --known-splicesite-infile $gtf_splice ";
|
|
}
|
|
|
|
## run HISAT
|
|
|
|
$reads = &add_zcat_fifo_and_add_hisat_params($reads);
|
|
|
|
my $top_hits_count = "";
|
|
if ($num_top_hits > 1) {
|
|
$top_hits_count = " -k $num_top_hits ";
|
|
}
|
|
|
|
my @tmpfiles;
|
|
|
|
my $pipeliner = new Pipeliner(-verbose => 1);
|
|
|
|
my $cmd = "bash -c \"set -o pipefail && $HISAT_HOME/hisat -x $hisat_index -q $reads $splice_incl -p $CPU $top_hits_count @ARGV | gzip -c > $out_prefix.sam.gz\" ";
|
|
|
|
$pipeliner->add_commands( new Command($cmd, "$out_prefix.sam.gz.ok") );
|
|
push (@tmpfiles, "$out_prefix.sam.gz");
|
|
|
|
$cmd = "bash -c \"set -o pipefail && gunzip -c $out_prefix.sam.gz | samtools view -@ $CPU -F 4 -Sb -o $out_prefix.bam \"";
|
|
$pipeliner->add_commands( new Command($cmd, "$out_prefix.bam.ok") );
|
|
push (@tmpfiles, "$out_prefix.bam");
|
|
|
|
|
|
$cmd = "samtools sort -@ $CPU $out_prefix.bam -o $out_prefix.cSorted.bam";
|
|
$pipeliner->add_commands( new Command($cmd, "$out_prefix.cSorted.bam.ok") );
|
|
|
|
|
|
$pipeliner->run();
|
|
|
|
|
|
if (-s "$out_prefix.cSorted.bam") {
|
|
$cmd = "samtools index $out_prefix.cSorted.bam";
|
|
&process_cmd($cmd);
|
|
}
|
|
|
|
unlink(@tmpfiles);
|
|
|
|
exit(0);
|
|
}
|
|
|
|
|
|
####
|
|
sub add_zcat_fifo_and_add_hisat_params {
|
|
my ($reads) = @_;
|
|
|
|
$reads =~ s/^\s+|\s+$//g;
|
|
|
|
my @adj_reads_list;
|
|
|
|
my $counter = 0;
|
|
my @read_files = split(/\s+/, $reads);
|
|
|
|
my @updated_read_filenames;
|
|
|
|
foreach my $reads_file (@read_files) {
|
|
|
|
$counter++;
|
|
|
|
if ($reads_file =~ /\.gz$/) {
|
|
$reads_file = "<(zcat $reads_file)";
|
|
}
|
|
|
|
push (@updated_read_filenames, $reads_file);
|
|
|
|
# add decoration
|
|
$reads_file = (scalar(@read_files) == 2) ? "-$counter $reads_file" : "-U $reads_file";
|
|
|
|
push (@adj_reads_list, $reads_file);
|
|
}
|
|
|
|
if ($run_as_single_flag) {
|
|
return("-U " . join(",", @updated_read_filenames));
|
|
}
|
|
else {
|
|
|
|
|
|
my $adj_reads = join(" ", @adj_reads_list);
|
|
|
|
return($adj_reads);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
####
|
|
sub process_cmd {
|
|
my ($cmd) = @_;
|
|
|
|
print STDERR "CMD: $cmd\n";
|
|
#return;
|
|
|
|
my $ret = system($cmd);
|
|
if ($ret) {
|
|
die "Error, cmd: $cmd died with ret ($ret)";
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
|