biyelunwen/99.scripts/trinity_utils/util/support_scripts/bowtie2_wrapper.pl

339 lines
8.4 KiB
Perl

#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
use Cwd;
use File::Basename;
use Carp;
use Data::Dumper;
use Getopt::Long qw(:config no_ignore_case bundling);
$ENV{PATH} .= "\:$FindBin::RealBin/../trinity-plugins/rsem/sam/"; # include samtools in path, already included in rsem build.
$ENV{LC_ALL} = 'C'; # critical for proper sorting using [system "sort -k1,1 ..."] within the perl script
my $output_prefix = "bowtie2";
my $usage = <<_EOUSAGE_;
################################################################################################################
#
# --left and --right <string> reads
#
# or
#
# --single <string> reads
#
# Required inputs:
#
# --target <string> multi-fasta file containing the target sequences (should be named {refName}.fa )
#
# --seqType <string> fa | fq (fastA or fastQ format)
#
#
# Optional:
#
# --SS_lib_type <string> strand-specific library type: single: F or R paired: FR or RF
# examples: single RNA-Ligation method: F
# single dUTP method: R
# paired dUTP method: RF
#
# --num_top_hits <int> (default: 20)
#
# --retain_intermediate_files retain all the intermediate sam files produced (they take up lots of space! and there's lots of them)
#
# --max_dist_between_pairs default (2000)
#
# --just_prep_build just prepare the bowtie-build and stop.
#
# --output_prefix|o <string> prefix for output filename (default: $output_prefix)
#
# --CPU <int> number of threads
#
# ## General options
#
# Any options after '--' are passed onward to the alignments programs (except BLAT -which has certain options exposed above).
# For example, to set the number of processors used by Bowtie to 16 then use:
# -- -p 16
#
####################################################################################################################
#
# Example commands:
#
# $0 --seqType fq --left left.fq --right right.fq --target Trinity.fasta
#
#
####################################################################################################################
_EOUSAGE_
;
my $help_flag;
my $target_db;
my $left_file;
my $right_file;
my $single_file;
my $num_top_hits = 20;
my $max_dist_between_pairs = 2000;
my $seqType;
my $SS_lib_type;
my $retain_intermediate_files_flag = 0;
my $trinity_mode;
my $CPU = 1;
my $JUST_PREP_BUILD = 0;
unless (@ARGV) {
die $usage;
}
&GetOptions ( 'h' => \$help_flag,
## required inputs
'left=s' => \$left_file,
'right=s' => \$right_file,
'single=s' => \$single_file,
"target=s" => \$target_db,
"seqType=s" => \$seqType,
"CPU=i" => \$CPU,
## Optional:
"SS_lib_type=s" => \$SS_lib_type,
'output_prefix|o=s' => \$output_prefix,
'num_top_hits=i' => \$num_top_hits,
'max_dist_between_pairs=i' => \$max_dist_between_pairs,
'retain_intermediate_files' => \$retain_intermediate_files_flag,
'just_prep_build' => \$JUST_PREP_BUILD,
);
if ($help_flag) { die $usage; }
unless ($target_db && -s $target_db) {
die $usage . "Must specify target_db and it must exist at that location";
}
unless ($JUST_PREP_BUILD || ($seqType && $seqType =~ /^(fq|fa)$/)) {
die $usage . ", sorry do not understand seqType $seqType";
}
unless ($JUST_PREP_BUILD
||
($left_file && $right_file)
||
($single_file)
) {
die $usage . "sorry, cannot find files $left_file $right_file $single_file";
}
if ($SS_lib_type && $SS_lib_type !~ /^(F|R|FR|RF)$/) {
die "Error, SS_lib_type must be one of the following: (F, R, FR, RF) ";
}
## check for required programs
{
my @required_progs = qw(samtools bowtie2-build bowtie2);
foreach my $prog (@required_progs) {
my $path = `sh -c "command -v $prog"`;
unless ($path =~ /^\//) {
die "Error, path to required $prog cannot be found";
}
}
}
my $util_dir = "$FindBin::RealBin";
my ($start_dir, $work_dir, $num_hits);
main: {
$start_dir = cwd();
$left_file = &build_full_paths($left_file, $start_dir) if $left_file;
$right_file = &build_full_paths($right_file, $start_dir) if $right_file;
$target_db = &build_full_paths($target_db, $start_dir);
$single_file = &build_full_paths($single_file, $start_dir) if $single_file;
unless (-s "$target_db.fai") {
&process_cmd("samtools faidx $target_db");
}
######################################
## Prep the bowtie index of the target
######################################
my $index_ext = "bt2";
my @bowtie_build_files = <$target_db.*.$index_ext>;
print STDERR "bt2 index files: " . Dumper(\@bowtie_build_files);
my $index_file_checkpoint = "$target_db._${index_ext}_idx_.ok";
unless (@bowtie_build_files && -e $index_file_checkpoint) {
print STDERR "Note - bowtie-build indices do not yet exist. Indexing genome now.\n";
## run bowtie-build:
my $builder = "bowtie2-build";
my $cmd = "$builder -q $target_db $target_db";
&process_cmd($cmd);
&process_cmd("touch $index_file_checkpoint");
}
if ($JUST_PREP_BUILD) {
print STDERR "Just preparing build, stopping now.\n";
exit(0);
}
my @entries;
my $format = ($seqType eq "fq") ? "-q" : "-f";
my $output_file = "$output_prefix.coordSorted.bam";
my $cmd;
if ($left_file && $right_file) {
$cmd = "bash -c \"set -o pipefail; bowtie2 --local -k $num_top_hits --threads $CPU --no-unal $format -X $max_dist_between_pairs -x $target_db -1 $left_file -2 $right_file | samtools sort -@ $CPU -o - - > $output_file\" ";
}
else {
$cmd = "bash -c \"set -o pipefail; bowtie2 --local -k $num_top_hits --threads $CPU --no-unal $format -x $target_db -U $single_file | samtools sort -o - - > $output_file\" ";
}
&process_cmd($cmd) unless (-e "$output_file.ok");
&process_cmd("touch $output_file.ok") unless (-e "$output_file.ok");
my @to_delete;
if ($SS_lib_type) {
## strand-specific
## separate the sam based on strand, and create separate bam files. (for convenience sake)
$cmd = "$util_dir/SAM_strand_separator.pl $output_file $SS_lib_type";
&process_cmd($cmd);
foreach my $sam_file ("$output_file.+.sam", "$output_file.-.sam") {
my $bam_file = $sam_file;
$bam_file =~ s/\.sam$/\.bam/; # add suffix below
unless (-s $sam_file) { next; } # empty file
push (@to_delete, $sam_file);
$cmd = "samtools view -bt $target_db.fai $sam_file | samtools sort -o - - > $bam_file"; # .bam ext added auto
&process_cmd($cmd);
$cmd = "samtools index $bam_file";
&process_cmd($cmd);
}
}
## do final cleanpup of intermediate files
unless ($retain_intermediate_files_flag) {
&purge_files(@to_delete);
}
@to_delete = ();
exit(0);
}
####
sub process_cmd {
my ($cmd) = @_;
print STDERR "CMD: $cmd\n";
my $ret = system($cmd);
if ($ret) {
confess "Error, cmd: $cmd died with ret $ret";
}
return;
}
####
sub build_full_paths {
my ($path, $start_dir) = @_;
my @paths;
foreach my $p (split(/,/, $path)) {
if ($p !~ /^\//) {
$p = "$start_dir/$p";
}
push (@paths, $p);
}
$path = join(",", @paths);
return($path);
}
####
sub purge_files {
my @to_delete = @_;
foreach my $file (@to_delete) {
if (-e $file || -l $file) {
print STDERR "-cleaning up and removing intermediate file: $file\n";
unlink($file);
}
else {
#print STDERR "-warning: cannot locate file $file targeted for deletion.\n";
}
}
return;
}