#!/usr/bin/env perl use strict; use warnings; use FindBin; use Cwd; use File::Basename; use Carp; use Data::Dumper; use Getopt::Long qw(:config no_ignore_case bundling); $ENV{PATH} .= "\:$FindBin::RealBin/../trinity-plugins/rsem/sam/"; # include samtools in path, already included in rsem build. $ENV{LC_ALL} = 'C'; # critical for proper sorting using [system "sort -k1,1 ..."] within the perl script my $output_prefix = "bowtie2"; my $usage = <<_EOUSAGE_; ################################################################################################################ # # --left and --right reads # # or # # --single reads # # Required inputs: # # --target multi-fasta file containing the target sequences (should be named {refName}.fa ) # # --seqType fa | fq (fastA or fastQ format) # # # Optional: # # --SS_lib_type strand-specific library type: single: F or R paired: FR or RF # examples: single RNA-Ligation method: F # single dUTP method: R # paired dUTP method: RF # # --num_top_hits (default: 20) # # --retain_intermediate_files retain all the intermediate sam files produced (they take up lots of space! and there's lots of them) # # --max_dist_between_pairs default (2000) # # --just_prep_build just prepare the bowtie-build and stop. # # --output_prefix|o prefix for output filename (default: $output_prefix) # # --CPU number of threads # # ## General options # # Any options after '--' are passed onward to the alignments programs (except BLAT -which has certain options exposed above). # For example, to set the number of processors used by Bowtie to 16 then use: # -- -p 16 # #################################################################################################################### # # Example commands: # # $0 --seqType fq --left left.fq --right right.fq --target Trinity.fasta # # #################################################################################################################### _EOUSAGE_ ; my $help_flag; my $target_db; my $left_file; my $right_file; my $single_file; my $num_top_hits = 20; my $max_dist_between_pairs = 2000; my $seqType; my $SS_lib_type; my $retain_intermediate_files_flag = 0; my $trinity_mode; my $CPU = 1; my $JUST_PREP_BUILD = 0; unless (@ARGV) { die $usage; } &GetOptions ( 'h' => \$help_flag, ## required inputs 'left=s' => \$left_file, 'right=s' => \$right_file, 'single=s' => \$single_file, "target=s" => \$target_db, "seqType=s" => \$seqType, "CPU=i" => \$CPU, ## Optional: "SS_lib_type=s" => \$SS_lib_type, 'output_prefix|o=s' => \$output_prefix, 'num_top_hits=i' => \$num_top_hits, 'max_dist_between_pairs=i' => \$max_dist_between_pairs, 'retain_intermediate_files' => \$retain_intermediate_files_flag, 'just_prep_build' => \$JUST_PREP_BUILD, ); if ($help_flag) { die $usage; } unless ($target_db && -s $target_db) { die $usage . "Must specify target_db and it must exist at that location"; } unless ($JUST_PREP_BUILD || ($seqType && $seqType =~ /^(fq|fa)$/)) { die $usage . ", sorry do not understand seqType $seqType"; } unless ($JUST_PREP_BUILD || ($left_file && $right_file) || ($single_file) ) { die $usage . "sorry, cannot find files $left_file $right_file $single_file"; } if ($SS_lib_type && $SS_lib_type !~ /^(F|R|FR|RF)$/) { die "Error, SS_lib_type must be one of the following: (F, R, FR, RF) "; } ## check for required programs { my @required_progs = qw(samtools bowtie2-build bowtie2); foreach my $prog (@required_progs) { my $path = `sh -c "command -v $prog"`; unless ($path =~ /^\//) { die "Error, path to required $prog cannot be found"; } } } my $util_dir = "$FindBin::RealBin"; my ($start_dir, $work_dir, $num_hits); main: { $start_dir = cwd(); $left_file = &build_full_paths($left_file, $start_dir) if $left_file; $right_file = &build_full_paths($right_file, $start_dir) if $right_file; $target_db = &build_full_paths($target_db, $start_dir); $single_file = &build_full_paths($single_file, $start_dir) if $single_file; unless (-s "$target_db.fai") { &process_cmd("samtools faidx $target_db"); } ###################################### ## Prep the bowtie index of the target ###################################### my $index_ext = "bt2"; my @bowtie_build_files = <$target_db.*.$index_ext>; print STDERR "bt2 index files: " . Dumper(\@bowtie_build_files); my $index_file_checkpoint = "$target_db._${index_ext}_idx_.ok"; unless (@bowtie_build_files && -e $index_file_checkpoint) { print STDERR "Note - bowtie-build indices do not yet exist. Indexing genome now.\n"; ## run bowtie-build: my $builder = "bowtie2-build"; my $cmd = "$builder -q $target_db $target_db"; &process_cmd($cmd); &process_cmd("touch $index_file_checkpoint"); } if ($JUST_PREP_BUILD) { print STDERR "Just preparing build, stopping now.\n"; exit(0); } my @entries; my $format = ($seqType eq "fq") ? "-q" : "-f"; my $output_file = "$output_prefix.coordSorted.bam"; my $cmd; if ($left_file && $right_file) { $cmd = "bash -c \"set -o pipefail; bowtie2 --local -k $num_top_hits --threads $CPU --no-unal $format -X $max_dist_between_pairs -x $target_db -1 $left_file -2 $right_file | samtools sort -@ $CPU -o - - > $output_file\" "; } else { $cmd = "bash -c \"set -o pipefail; bowtie2 --local -k $num_top_hits --threads $CPU --no-unal $format -x $target_db -U $single_file | samtools sort -o - - > $output_file\" "; } &process_cmd($cmd) unless (-e "$output_file.ok"); &process_cmd("touch $output_file.ok") unless (-e "$output_file.ok"); my @to_delete; if ($SS_lib_type) { ## strand-specific ## separate the sam based on strand, and create separate bam files. (for convenience sake) $cmd = "$util_dir/SAM_strand_separator.pl $output_file $SS_lib_type"; &process_cmd($cmd); foreach my $sam_file ("$output_file.+.sam", "$output_file.-.sam") { my $bam_file = $sam_file; $bam_file =~ s/\.sam$/\.bam/; # add suffix below unless (-s $sam_file) { next; } # empty file push (@to_delete, $sam_file); $cmd = "samtools view -bt $target_db.fai $sam_file | samtools sort -o - - > $bam_file"; # .bam ext added auto &process_cmd($cmd); $cmd = "samtools index $bam_file"; &process_cmd($cmd); } } ## do final cleanpup of intermediate files unless ($retain_intermediate_files_flag) { &purge_files(@to_delete); } @to_delete = (); exit(0); } #### sub process_cmd { my ($cmd) = @_; print STDERR "CMD: $cmd\n"; my $ret = system($cmd); if ($ret) { confess "Error, cmd: $cmd died with ret $ret"; } return; } #### sub build_full_paths { my ($path, $start_dir) = @_; my @paths; foreach my $p (split(/,/, $path)) { if ($p !~ /^\//) { $p = "$start_dir/$p"; } push (@paths, $p); } $path = join(",", @paths); return($path); } #### sub purge_files { my @to_delete = @_; foreach my $file (@to_delete) { if (-e $file || -l $file) { print STDERR "-cleaning up and removing intermediate file: $file\n"; unlink($file); } else { #print STDERR "-warning: cannot locate file $file targeted for deletion.\n"; } } return; }