biyelunwen/99.scripts/trinity_utils/util/align_and_estimate_abundanc...

1016 lines
29 KiB
Perl

#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
use Cwd;
use Carp;
use Getopt::Long qw(:config no_ignore_case bundling pass_through);
use Data::Dumper;
# Default aligner command-line parameters, keyed by "<aligner>_<est_method>".
# Every bowtie (v1) entry shares one default string and every bowtie2 entry
# shares another, so the table is filled from the two strings below.
my %aligner_params;
{
    ############
    ## Bowtie-1
    ############
    # params used by RSEM itself:
    #    -a -m 200
    # eXpress alternative:
    #    bowtie -aS -X 800 --offrate 1 (requires: bowtie-build --offrate 1)
    my $bowtie1_defaults = '--all --best --strata -m 300 --chunkmbs 512';

    #############
    ## Bowtie-2
    #############
    ## params used by RSEM itself:
    #    --dpad 0 --gbar 99999999 --mp 1,1 --np 1 --score-min L,0,-0.1 -I 1 -X 1000 --no-mixed --no-discordant -k 200
    # recommended eXpress params: http://bio.math.berkeley.edu/eXpress/faq.html
    #    -a -X 600 --rdg 6,5 --rfg 6,5 --score-min L,-.6,-.4 --no-discordant --no-mixed
    # (the trailing space is part of the default value)
    my $bowtie2_defaults = '--no-mixed --no-discordant --gbar 1000 --end-to-end -k 200 ';

    foreach my $method (qw(RSEM eXpress none)) {
        $aligner_params{"bowtie_$method"}  = $bowtie1_defaults;
        $aligner_params{"bowtie2_$method"} = $bowtie2_defaults;
    }
}

# Extra, user-supplied options passed straight through to each quantifier.
my ($rsem_add_opts, $kallisto_add_opts, $salmon_add_opts) = ("", "", "");

# k-mer length handed to 'salmon index -k'.
my $salmon_kmer_length = 31;
# Usage text printed (via die) for --help or when required parameters are
# missing.  The heredoc interpolates: it embeds the current aligner defaults,
# the default add-opts strings, the salmon k-mer length and the script name ($0).
my $usage = <<__EOUSAGE__;
#########################################################################
#
########################
# Essential parameters:
########################
#
# --transcripts <string> transcript fasta file
#
# --seqType <string> fq|fa
#
# If Paired-end:
#
# --left <string>
# --right <string>
#
# or Single-end:
#
# --single <string>
#
# or (preferred):
#
# --samples_file <string> tab-delimited text file indicating biological replicate relationships.
# ex.
# cond_A cond_A_rep1 A_rep1_left.fq A_rep1_right.fq
# cond_A cond_A_rep2 A_rep2_left.fq A_rep2_right.fq
# cond_B cond_B_rep1 B_rep1_left.fq B_rep1_right.fq
# cond_B cond_B_rep2 B_rep2_left.fq B_rep2_right.fq
#
# # if single-end instead of paired-end, then leave the 4th column above empty.
#
#
#
# --est_method <string> abundance estimation method.
# alignment_based: RSEM
# alignment_free: kallisto|salmon
#
###################################
# Potentially optional parameters:
###################################
#
# --output_dir <string> write all files to output directory
# (note, if using --samples_file, output_dir will be set automatically according to replicate name)
#
#
# if alignment_based est_method:
# --aln_method <string> bowtie|bowtie2 alignment method. (note: RSEM requires either bowtie or bowtie2)
#
###########
# Optional:
# #########
#
# --SS_lib_type <string> strand-specific library type: paired('RF' or 'FR'), single('F' or 'R').
#
# --samples_idx <int> restrict processing to sample entry (index starts at one)
#
#
# --thread_count number of threads to use (default = 4)
#
# --debug retain intermediate files
#
# --gene_trans_map <string> file containing 'gene(tab)transcript' identifiers per line.
# or
# --trinity_mode Setting --trinity_mode will automatically generate the gene_trans_map and use it.
#
#
# --prep_reference prep reference (builds target index)
#
#
########################################
#
# Parameters for single-end reads:
#
# --fragment_length <int> specify RNA-Seq fragment length (default: 200)
# --fragment_std <int> fragment length standard deviation (default: 80)
#
########################################
#
# bowtie-related parameters: (note, tool-specific settings are further below)
#
# --max_ins_size <int> maximum insert size (bowtie -X parameter, default: 800)
# --coordsort_bam provide coord-sorted bam in addition to the default (unsorted) bam.
#
########################################
# RSEM opts:
#
# --bowtie_RSEM <string> if using 'bowtie', default: \"$aligner_params{bowtie_RSEM}\"
# --bowtie2_RSEM <string> if using 'bowtie2', default: \"$aligner_params{bowtie2_RSEM}\"
# ** if you change the defaults, specify the full set of parameters to use! **
#
# --include_rsem_bam provide the RSEM enhanced bam file including posterior probabilities of read assignments.
# --rsem_add_opts <string> additional parameters to pass on to rsem-calculate-expression
#
##########################################################################
# kallisto opts:
#
# --kallisto_add_opts <string> default: $kallisto_add_opts
#
##########################################################################
#
# salmon opts:
#
# --salmon_add_opts <string> default: $salmon_add_opts
# --salmon_kmer_length <int> k-mer length used when building the salmon index (default: $salmon_kmer_length)
#
#
# Example usage
#
# ## Just prepare the reference for alignment and abundance estimation
#
# $0 --transcripts Trinity.fasta --est_method salmon --trinity_mode --prep_reference
#
# ## Run the alignment and abundance estimation (assumes reference has already been prepped, errors-out if prepped reference not located.)
#
# $0 --transcripts Trinity.fasta --seqType fq --left reads_1.fq --right reads_2.fq --est_method salmon --trinity_mode --output_dir salmon_quant
#
# ## prep the reference and run the alignment/estimation
#
# $0 --transcripts Trinity.fasta --seqType fq --left reads_1.fq --right reads_2.fq --est_method salmon --trinity_mode --prep_reference --output_dir salmon_quant
#
# ## Use a samples.txt file:
#
# $0 --transcripts Trinity.fasta --est_method salmon --prep_reference --trinity_mode --samples_file samples.txt --seqType fq
#
#########################################################################
__EOUSAGE__
# ---------------------------------------------------------------------------
# Storage for command-line options (with their defaults), grouped by purpose,
# followed by the option parsing itself.
# ---------------------------------------------------------------------------

# general behaviour
my $help_flag;
my $DEBUG_flag   = 0;
my $thread_count = 4;

# reference / inputs
my $transcripts;
my $gene_trans_map_file;
my $trinity_mode;
my $prep_reference = 0;
my $seqType;
my $left;
my $right;
my $single;
my $samples_file = "";
my $samples_idx  = 0;
my $SS_lib_type;

# outputs
my $output_dir;
my $output_prefix = "";

# method selection
my $est_method;
my $aln_method = "";

# alignment / bam handling
my $bam_file;
my $max_ins_size           = 800;
my $retain_sorted_bam_file = 0;
my $coordsort_bam_flag     = 0;
my $include_rsem_bam;

# single-end fragment model
my $fragment_length = 200;
my $fragment_std    = 80;

GetOptions(
    # general behaviour
    'help|h'               => \$help_flag,
    'debug'                => \$DEBUG_flag,
    'thread_count=i'       => \$thread_count,

    # reference / inputs
    'transcripts=s'        => \$transcripts,
    'gene_trans_map=s'     => \$gene_trans_map_file,
    'trinity_mode'         => \$trinity_mode,
    'prep_reference'       => \$prep_reference,
    'seqType=s'            => \$seqType,
    'left=s'               => \$left,
    'right=s'              => \$right,
    'single=s'             => \$single,
    'samples_file=s'       => \$samples_file,
    'samples_idx=i'        => \$samples_idx,
    'SS_lib_type=s'        => \$SS_lib_type,

    # outputs
    'output_dir=s'         => \$output_dir,
    #'output_prefix=s'     => \$output_prefix,

    # method selection
    'est_method=s'         => \$est_method,
    'aln_method=s'         => \$aln_method,

    # alignment / bam handling
    'name_sorted_bam=s'    => \$bam_file,
    'max_ins_size=i'       => \$max_ins_size,
    'coordsort_bam'        => \$coordsort_bam_flag,
    'include_rsem_bam'     => \$include_rsem_bam,

    # opts for single-end reads
    'fragment_length=i'    => \$fragment_length,
    'fragment_std=i'       => \$fragment_std,

    # tool-specific parameter overrides
    'bowtie_RSEM=s'        => \$aligner_params{'bowtie_RSEM'},
    'bowtie2_RSEM=s'       => \$aligner_params{'bowtie2_RSEM'},
    'rsem_add_opts=s'      => \$rsem_add_opts,
    'kallisto_add_opts=s'  => \$kallisto_add_opts,
    'salmon_add_opts=s'    => \$salmon_add_opts,
    'salmon_kmer_length=i' => \$salmon_kmer_length,
);
# ---------------------------------------------------------------------------
# Command-line validation and sample setup.
#
# Order of checks:
#   1. leftover / help / missing --est_method
#   2. required-parameter combinations (prep-only vs. full run)
#   3. --est_method value (case-insensitive, normalised to canonical spelling)
#   4. --aln_method value (exactly 'bowtie' or 'bowtie2' for alignment-based runs)
#   5. sample definitions, paired/single mode, paths, --SS_lib_type, threads
# ---------------------------------------------------------------------------
if (@ARGV) {
    # Getopt::Long runs with pass_through, so anything it could not
    # interpret is left behind in @ARGV.
    die "Error, don't understand arguments: @ARGV ";
}
if ($help_flag) {
    die $usage;
}
unless ($est_method) {
    die $usage;
}

my @EST_METHODS = qw(RSEM kallisto salmon);
my %ALIGNMENT_BASED_EST_METHODS = map { ($_ => 1) } qw(RSEM);
my %ALIGNMENT_FREE_EST_METHODS  = map { ($_ => 1) } qw(kallisto salmon);

unless (
    ($est_method && $prep_reference && $transcripts && (! ($single||$left||$right||$samples_file)) ) ## just prep reference
    ||
    ($transcripts && $est_method && $seqType && ($single || ($left && $right) || $samples_file)) # do alignment
    ) {
    die "Error, missing parameter. See example usage options below.\n" . $usage;
}

{
    # Accept any capitalisation of the method name, then store the canonical
    # spelling: every later comparison (hash lookups, 'eq' tests) is
    # case-sensitive.  Validated before --aln_method so that a bad
    # --est_method is reported as such.
    my ($canonical_method) = grep { lc($_) eq lc($est_method) } (@EST_METHODS, 'none');
    unless (defined $canonical_method) {
        die "Error, --est_method @EST_METHODS only\n";
    }
    $est_method = $canonical_method;
}

if ($ALIGNMENT_FREE_EST_METHODS{$est_method}) {
    $aln_method = "none";
}
elsif ($aln_method !~ /^bowtie2?$/) {
    # anchored: only the exact names 'bowtie' and 'bowtie2' are handled downstream
    die "Error, --aln_method must be either 'bowtie' or 'bowtie2' ";
}

my @samples_to_process;
if ($samples_file) {
    @samples_to_process = &parse_samples_file($samples_file);
    if ($samples_idx > 0) {
        my $num_samples = scalar(@samples_to_process);
        if ($samples_idx > $num_samples) {
            die "Error, sample index $samples_idx > $num_samples num samples ";
        }
        @samples_to_process = ($samples_to_process[$samples_idx-1]); # run only that sample
    }
}
elsif ( ($left && $right) || $single) {
    unless ($output_dir) {
        die "Error, must specify output directory name via: --output_dir ";
    }
    @samples_to_process = &create_sample_definition($output_dir, $left, $right, $single);
}

# Paired-end unless --single was given or the first sample is single-end.
my $PE_mode = 1;
if ($single || (@samples_to_process && $samples_to_process[0]->{single})) {
    unless ($fragment_length) {
        die "Error, specify --fragment_length for single-end reads (note, not the length of the read but the mean fragment length)\n\n";
    }
    $PE_mode = 0;
}

$transcripts = &create_full_path($transcripts);
$gene_trans_map_file = &create_full_path($gene_trans_map_file) if $gene_trans_map_file;
if ($gene_trans_map_file && ! -s $gene_trans_map_file) {
    die "Error, $gene_trans_map_file doesn't exist or is empty";
}

if ($SS_lib_type) {
    unless ($SS_lib_type =~ /^(RF|FR|R|F)$/) {
        die "Error, do not recognize SS_lib_type: [$SS_lib_type]\n";
    }
    if ($PE_mode && length($SS_lib_type) != 2 ) {
        die "Error, SS_lib_type [$SS_lib_type] is not compatible with paired reads";
    }
}

if ( $thread_count !~ /^\d+$/ ) {
    die "Error, --thread_count value must be an integer";
}
{   # check for required tools in PATH
    #
    # Collects every external program this run will invoke and verifies each
    # one can be resolved via 'command -v'.  All missing tools are reported
    # before the script dies, so the user can fix the PATH in one pass.
    my @tools = ('samtools');

    if ($aln_method eq 'bowtie') {
        push (@tools, 'bowtie-build', 'bowtie');
    }
    elsif ($aln_method eq 'bowtie2') {
        push (@tools, 'bowtie2', 'bowtie2-build');
    }

    if ($est_method =~ /^RSEM$/i) {
        # convert-sam-for-rsem is run on every alignment before quantification,
        # and rsem-prepare-reference when prepping; check them up front rather
        # than failing after the alignment has already been computed.
        push (@tools, 'rsem-calculate-expression', 'convert-sam-for-rsem');
        push (@tools, 'rsem-prepare-reference') if $prep_reference;
    }
    elsif ($est_method eq 'kallisto') {
        push (@tools, 'kallisto');
    }
    elsif ($est_method eq 'salmon') {
        push (@tools, 'salmon');
    }

    my @missing_tools;
    foreach my $tool (@tools) {
        my $p = `sh -c "command -v $tool"`;
        unless ($p =~ /\w/) {
            warn("ERROR, cannot find $tool in PATH setting: $ENV{PATH}\n\n");
            push (@missing_tools, $tool);
        }
    }
    if (@missing_tools) {
        die "Please be sure the utilities @tools are available via your PATH setting.\n";
    }
}
# Entry point: optionally derive the gene/transcript map from the Trinity
# fasta, then hand the samples to the alignment-based or alignment-free driver.
main: {
    if ($trinity_mode && ! $gene_trans_map_file) {
        $gene_trans_map_file = "$transcripts.gene_trans_map";

        # Regenerate when the map is missing OR empty: the generator writes
        # through a shell redirect, so a failed run leaves a zero-byte file.
        unless (-s $gene_trans_map_file) {
            # Write to a per-process temp file and rename into place only on
            # success, so neither a failed run nor a concurrently running job
            # (e.g. one per --samples_idx) ever sees a partial map.
            my $tmp_map_file = "$gene_trans_map_file.tmp.$$";
            my $cmd = "$FindBin::RealBin/support_scripts/get_Trinity_gene_to_trans_map.pl $transcripts > $tmp_map_file";
            eval { &process_cmd($cmd); };
            if ($@) {
                my $err = $@;
                unlink($tmp_map_file);
                die $err;
            }
            rename($tmp_map_file, $gene_trans_map_file)
                or die "Error, cannot rename $tmp_map_file to $gene_trans_map_file: $!";
        }
    }

    if ($ALIGNMENT_BASED_EST_METHODS{$est_method}) {
        &run_alignment_BASED_estimation(@samples_to_process);
    }
    else {
        &run_alignment_FREE_estimation(@samples_to_process);
    }

    exit(0);
}
####
# Dispatch the given sample definitions to the alignment-free quantifier
# selected by the global $est_method (kallisto or salmon).
# Dies if $est_method names neither of them.
sub run_alignment_FREE_estimation {
    my @samples = @_;

    my %runner_for = (
        kallisto => \&run_kallisto,
        salmon   => \&run_salmon,
    );

    # shouldn't fail here: est_method is validated at startup
    my $runner = $runner_for{$est_method}
        or die "Error, not recognizing est_method: $est_method";

    $runner->(@samples);
}
####
# Alignment-based workflow driver.
#
# With --prep_reference: builds the bowtie/bowtie2 index for $transcripts and,
# for RSEM, the RSEM reference.  Completion of each step is recorded with a
# "<name>.ok" marker file (a "<name>.started" marker is renamed on success).
#
# Then, for every sample hashref in @_, runs alignment + quantification via
# run_alignment_do_quant().  With no samples it exits after the prep step.
#
# Dies if the aligner index or (for RSEM) the RSEM reference has no ".ok"
# marker, whether or not --prep_reference was given.
sub run_alignment_BASED_estimation {
    my @samples = @_;

    my $db_index_name = "$transcripts.${aln_method}";

    ###############################################
    ## Prepare transcript database for alignments
    ###############################################
    if ($prep_reference) {
        my $cmd = "${aln_method}-build $transcripts $db_index_name";
        unless (-e "$db_index_name.ok") {
            if (-e "$db_index_name.started") {
                print STDERR "WARNING - looks like the prep for $db_index_name was already started by another process. Proceeding with caution.\n";
            }
            &process_cmd("touch $db_index_name.started");
            &process_cmd($cmd);
            rename("$db_index_name.started", "$db_index_name.ok")
                or die "Error, cannot rename $db_index_name.started to $db_index_name.ok: $!";
        }
    }
    if (! -e "$db_index_name.ok") {
        die "Error, index $db_index_name not prepared. Be sure to include parameter '--prep_reference' to first prepare the reference for alignment.";
    }

    my $rsem_prefix = &create_full_path("$transcripts.RSEM");

    if ($est_method eq 'RSEM') {
        if ($prep_reference) {
            if (-e "$rsem_prefix.rsem.prepped.started") {
                print STDERR "WARNING - appears that another process has started the rsem-prep step... proceeding with caution.\n";
            }
            unless (-e "$rsem_prefix.rsem.prepped.ok") {
                &process_cmd("touch $rsem_prefix.rsem.prepped.started");
                my $cmd = "rsem-prepare-reference "; #--no-bowtie"; # update for RSEM-2.15
                if ($gene_trans_map_file) {
                    $cmd .= " --transcript-to-gene-map $gene_trans_map_file";
                }
                $cmd .= " $transcripts $rsem_prefix";
                &process_cmd($cmd);
                rename("$rsem_prefix.rsem.prepped.started", "$rsem_prefix.rsem.prepped.ok")
                    or die "Error, cannot rename $rsem_prefix.rsem.prepped.started to $rsem_prefix.rsem.prepped.ok: $!";
            }
        }

        # Must be checked even when --prep_reference was NOT given: that is
        # exactly the situation the message below tells the user how to fix.
        unless (-e "$rsem_prefix.rsem.prepped.ok") {
            die "Error, the RSEM data must first be prepped. Please rerun with '--prep_reference' parameter.\n";
        }
    }

    unless (@samples) {
        print STDERR "Only prepping reference. Stopping now.\n";
        exit(0);
    }

    print STDERR Dumper(\@samples);

    my $curr_workdir = cwd();
    foreach my $sample_href (@samples) {
        # each sample run chdirs into its own output dir; start from the
        # original directory every time so relative output dirs resolve there
        chdir $curr_workdir or die "Error, cannot cd to $curr_workdir";
        &run_alignment_do_quant($sample_href, $db_index_name, $rsem_prefix);
    }
}
####
# Align one sample's reads with bowtie/bowtie2 and quantify the result.
#
#   $sample_href   - hashref with 'output_dir' plus either 'left'/'right'
#                    (paired) or 'single' read files
#   $db_index_name - aligner index prefix
#   $rsem_prefix   - RSEM reference prefix
#
# Creates and chdirs into the sample's output directory, writes a name-sorted
# "<aln_method>.bam" (skipped when the bam and its ".ok" marker already exist),
# then runs RSEM when $est_method is "RSEM".  Optionally adds a
# coordinate-sorted bam (--coordsort_bam).
sub run_alignment_do_quant {
    my ($sample_href, $db_index_name, $rsem_prefix) = @_;

    #####################
    ## Run alignments
    #####################
    my $sample_outdir = $sample_href->{output_dir};
    system("mkdir -p $sample_outdir") unless (-d $sample_outdir);
    chdir $sample_outdir or die "Error, cannot cd to output directory $sample_outdir";

    # optional filename prefix, followed by a '.' separator when set
    my $prefix = ($output_prefix) ? "$output_prefix." : $output_prefix;

    my $aln_bam    = "${prefix}${aln_method}.bam";
    my $aln_bam_ok = "$aln_bam.ok";

    my $read_type = ($seqType eq "fq") ? "-q" : "-f";

    ##############
    ## Align reads
    my $params = $aligner_params{"${aln_method}_${est_method}"};

    # shared tail of every alignment pipeline: keep mapped reads only (-F 4),
    # convert to bam and sort by read name
    my $to_name_sorted_bam = " | samtools view -@ $thread_count -F 4 -S -b | samtools sort -@ $thread_count -n -o $aln_bam ";

    my $bowtie_cmd;
    if ($aln_method eq 'bowtie') {
        my $reads_spec;
        if ($PE_mode) {
            ## PE alignment
            my ($left_file, $right_file) = ($sample_href->{left}, $sample_href->{right});
            $reads_spec = " -X $max_ins_size -S -p $thread_count $db_index_name -1 $left_file -2 $right_file";
        }
        else {
            # SE alignment
            my $single_file = $sample_href->{single};
            $reads_spec = " -S -p $thread_count $db_index_name $single_file";
        }
        $bowtie_cmd = "set -o pipefail && bowtie $read_type " . $params . $reads_spec . $to_name_sorted_bam;
    }
    elsif ($aln_method eq 'bowtie2') {
        my $reads_spec;
        if ($PE_mode) {
            ## PE alignment
            my ($left_file, $right_file) = ($sample_href->{left}, $sample_href->{right});
            $reads_spec = " $read_type -X $max_ins_size -x $db_index_name -1 $left_file -2 $right_file -p $thread_count";
        }
        else {
            # SE alignment
            my $single_file = $sample_href->{single};
            $reads_spec = " $read_type -x $db_index_name -U $single_file -p $thread_count";
        }
        $bowtie_cmd = "set -o pipefail && bowtie2 " . $params . $reads_spec . $to_name_sorted_bam;
    }

    unless (-s $aln_bam && -e $aln_bam_ok) {
        &process_cmd($bowtie_cmd);
    }
    unless (-e $aln_bam_ok) {
        &process_cmd("touch $aln_bam_ok");
    }

    if ($est_method eq "RSEM") {
        # convert bam file for use with rsem:
        &process_cmd("convert-sam-for-rsem -p $thread_count $aln_bam $aln_bam.for_rsem");
        &run_RSEM("$aln_bam.for_rsem.bam", $rsem_prefix, $output_prefix);
    }
    elsif ($est_method eq "none") {
        print STDERR "Not running abundance estimation, stopping now after alignment.\n";
    }
    else {
        die "Error, --est_method $est_method is not supported";
    }

    &sort_bam_file($aln_bam) if ($coordsort_bam_flag);

    return;
}
####
# Produce a coordinate-sorted, indexed copy of $bam_file named
# "<stem>.csorted.bam" (the trailing "bam" of the input name is replaced by
# "csorted", then ".bam" is appended).  Skipped entirely when the
# "<stem>.csorted.bam.ok" marker already exists.
sub sort_bam_file {
    my ($bam_file) = @_;

    (my $sorted_stem = $bam_file) =~ s/bam$/csorted/;
    my $sorted_bam = "$sorted_stem.bam";

    if (-e "$sorted_bam.ok") {
        return;
    }

    ## sort the bam file, index it, then record completion
    &process_cmd("samtools sort $bam_file -o $sorted_bam");
    &process_cmd("samtools index $sorted_bam");
    &process_cmd("touch $sorted_bam.ok");

    return;
}
####
# Run rsem-calculate-expression on an RSEM-ready bam.
#
#   $bam_file      - bam produced by convert-sam-for-rsem
#   $rsem_prefix   - RSEM reference prefix
#   $output_prefix - output name prefix; defaults to "RSEM" when empty
#
# The command is skipped when "<output_prefix>.isoforms.results.ok" already
# exists; the marker is (re)touched afterwards either way.
sub run_RSEM {
    my ($bam_file, $rsem_prefix, $output_prefix) = @_;

    unless ($output_prefix) {
        $output_prefix = "RSEM";
    }

    my $keep_intermediate_files_opt = ($DEBUG_flag) ? "--keep-intermediate-files" : "";

    # Fragment-length model applies to every single-end run.  Keyed on
    # $PE_mode rather than the --single option, because single-end reads
    # supplied through --samples_file leave $single empty.
    my $fraglength_info_txt = "";
    unless ($PE_mode) {
        $fraglength_info_txt = "--fragment-length-mean $fragment_length --fragment-length-sd $fragment_std";
    }

    # Strand-specific library: types starting with 'F' (F, FR) are passed as
    # forward-prob 1.0, the others (R, RF) as forward-prob 0.
    my $SS_opt = "";
    if ($SS_lib_type) {
        if ($SS_lib_type =~ /^F/) {
            $SS_opt = "--forward-prob 1.0";
        }
        else {
            $SS_opt = "--forward-prob 0";
        }
    }

    my $no_qualities_string = "";
    if ($seqType eq 'fa') {
        $no_qualities_string = "--no-qualities";
    }

    my $paired_flag_text = ($PE_mode) ? "--paired-end" : "";
    my $rsem_bam_flag = ($include_rsem_bam) ? "" : "--no-bam-output";

    my $cmd = "rsem-calculate-expression $no_qualities_string "
        . "$paired_flag_text "
        . " $rsem_add_opts "
        . "-p $thread_count "
        . "$fraglength_info_txt "
        . "$keep_intermediate_files_opt "
        . "$SS_opt $rsem_bam_flag "
        . "--bam $bam_file "
        . "$rsem_prefix "
        . "$output_prefix ";

    unless (-e "$output_prefix.isoforms.results.ok") {
        &process_cmd($cmd);
    }
    &process_cmd("touch $output_prefix.isoforms.results.ok");

    return;
}
####
# Echo a shell command to STDERR and run it through "bash -o pipefail -c".
# Confesses (with stack trace) when no command is given; dies with the raw
# system() status when the command does not return 0.
sub process_cmd {
    my ($cmd) = @_;

    confess "Error, no cmd specified" unless ($cmd);

    print STDERR "CMD: $cmd\n";

    my @bash_invocation = ("bash", "-o", "pipefail", "-c", $cmd);
    my $ret = system(@bash_invocation);

    die "Error, cmd: $cmd died with ret: $ret" if ($ret);

    return;
}
###
# Turn a comma-separated list of file names into a comma-separated list of
# absolute paths: entries not starting with '/' are prefixed with the current
# working directory; entries already starting with '/' are kept as they are.
sub create_full_path {
    my ($file_list) = shift;

    my $base_dir = cwd();

    my @absolute = map { m|^/| ? $_ : $base_dir . "/$_" }    # relative => anchor at cwd
                   split(/,/, $file_list);

    return (join(",", @absolute));
}
####
# For a comma-separated list of files, wrap every entry ending in ".gz" in a
# bash process substitution that decompresses it on the fly; other entries
# pass through untouched.  Returns the rebuilt comma-separated list.
sub add_zcat_gz {
    my ($file_listing) = @_;

    my @wrapped = map {
        /\.gz$/ ? "<(gunzip -c $_)"    # used to be zcat
                : $_
    } split(/,/, $file_listing);

    return (join(",", @wrapped));
}
####
# Quantify every sample with kallisto.
#
# Uses the index "<transcripts>.kallisto_idx", building it first when
# --prep_reference is set and it does not exist yet (confesses when it is
# missing and --prep_reference was not given).  For each sample hashref runs
# 'kallisto quant' in paired or single-end form, and, when a gene/transcript
# map is available, derives gene-level results from abundance.tsv.
sub run_kallisto {
    my @samples = @_;

    my $kallisto_index = "$transcripts.kallisto_idx";

    if ( (! $prep_reference) && (! -e $kallisto_index)) {
        confess "Error, no kallisto index file: $kallisto_index, and --prep_reference not set. Re-run with --prep_reference";
    }
    if ($prep_reference && ! -e $kallisto_index) {
        my $cmd = "kallisto index -i $kallisto_index $transcripts";
        &process_cmd($cmd);
    }

    if ($SS_lib_type) {
        # add strand-specific options for kallisto
        my $kallisto_ss_opt = ($SS_lib_type =~ /^R/) ? "--rf-stranded" : "--fr-stranded";

        # Append the strand flag itself (not the option string to itself),
        # and only when the user has not already supplied it.
        if ($kallisto_add_opts !~ /\Q$kallisto_ss_opt\E/) {
            $kallisto_add_opts .= " $kallisto_ss_opt";
        }
    }

    foreach my $sample_href (@samples) {
        my ($output_dir, $left_file, $right_file, $single_file) = ($sample_href->{output_dir},
                                                                   $sample_href->{left},
                                                                   $sample_href->{right},
                                                                   $sample_href->{single});

        if ($left_file && $right_file) {
            my $cmd = "kallisto quant -i $kallisto_index $kallisto_add_opts -o $output_dir $left_file $right_file";
            &process_cmd($cmd);
        }
        elsif ($single_file) {
            my $cmd = "kallisto quant -l $fragment_length -s $fragment_std -i $kallisto_index -o $output_dir $kallisto_add_opts --single $single_file";
            &process_cmd($cmd);
        }

        if ($gene_trans_map_file) {
            my $cmd = "$FindBin::RealBin/support_scripts/kallisto_trans_to_gene_results.pl $output_dir/abundance.tsv $gene_trans_map_file > $output_dir/abundance.tsv.genes";
            &process_cmd($cmd);
        }
    }

    return;
}
####
# Quantify every sample with salmon.
#
# Uses the index "<transcripts>.salmon.idx", building it first when
# --prep_reference is set and it does not exist yet (confesses when it is
# missing and --prep_reference was not given).  Samples whose quant.sf already
# exists and is non-empty are skipped.  A failing sample does not stop the
# loop: failures are counted and reported together at the end via die.
sub run_salmon {
    my (@samples) = @_;

    my $salmon_index = "$transcripts.salmon.idx";
    my $index_exists = -e $salmon_index;

    if (! $prep_reference && ! $index_exists) {
        confess "Error, no salmon index file: $salmon_index, and --prep_reference not set. Re-run with --prep_reference";
    }
    if ($prep_reference && ! $index_exists) {
        ## Prep salmon index
        &process_cmd("salmon index -t $transcripts --keepDuplicates -i $salmon_index -k $salmon_kmer_length -p $thread_count");
    }

    my $num_failures = 0;

    foreach my $sample_href (@samples) {
        my $outdir      = $sample_href->{output_dir};
        my $left_file   = $sample_href->{left};
        my $right_file  = $sample_href->{right};
        my $single_file = $sample_href->{single};

        if (-s "$outdir/quant.sf") {
            print STDERR "-output already exists: $outdir/quant.sf, skipping.\n";
            next;
        }

        eval {
            # strandedness part of the salmon library type: SF / SR / U
            my $strand_code = ($SS_lib_type) ? "S" . substr($SS_lib_type, 0, 1) : "U";

            my ($libtype, $reads_arg);
            if ($left_file && $right_file) {
                ## PE mode: inward-facing pairs => prefix with 'I'
                $libtype   = "I" . $strand_code;
                $reads_arg = "-1 $left_file -2 $right_file";
            }
            elsif ($single_file) {
                $libtype   = $strand_code;
                $reads_arg = "-r $single_file";
            }

            if (defined $reads_arg) {
                &process_cmd("salmon quant -i $salmon_index -l $libtype $reads_arg -o $outdir $salmon_add_opts -p $thread_count --validateMappings ");
            }

            if ($gene_trans_map_file) {
                &process_cmd("$FindBin::RealBin/support_scripts/salmon_trans_to_gene_results.pl $outdir/quant.sf $gene_trans_map_file > $outdir/quant.sf.genes");
            }
        };
        if ($@) {
            $num_failures++;
            print STDERR "Error detected: $@";
        }
    }

    die "Error, encountered $num_failures failed salmon jobs. See errors above" if ($num_failures);

    return;
}
####
# Parse the --samples_file into a list of sample definitions.
#
# Each data line holds whitespace-separated columns:
#     condition  replicate_name  left_or_single_reads  [right_reads]
# Lines starting with '#' or '-' and lines without word characters are skipped.
#
# Returns a list of hashrefs: { left, right, output_dir } for paired-end rows,
# { single, output_dir } when the 4th column is absent.  output_dir is the
# replicate name; read files are made absolute, and .gz files are wrapped for
# on-the-fly decompression when the aligner is bowtie (v1).
#
# Dies when the file cannot be opened, a replicate name repeats, a line lacks
# the replicate/reads columns, or a listed reads file is missing or empty.
sub parse_samples_file {
    my ($samples_file) = @_;

    my @samples_to_process;
    my %seen;

    open (my $fh, '<', $samples_file) or die "Error, cannot open file: [$samples_file]";
    while (my $line = <$fh>) {
        chomp $line;
        if ($line =~ /^\#/) { next; }
        unless ($line =~ /\w/) { next; }
        if ($line =~ /^\-/) { next; }

        $line =~ s/^\s+|\s+$//g; # trim leading and trailing ws

        # first column (condition name) is not needed here
        my (undef, $rep_name, $left_fq, $right_fq) = split(/\s+/, $line);

        # a usable line needs at least the replicate name and one reads file
        unless (defined $rep_name && $left_fq) {
            die "Error, cannot parse line $line of samples file: $samples_file . See usage info for samples file formatting requirements.";
        }

        if ($seen{$rep_name}) {
            die "Error, replicate names must be unique. Found $rep_name listed multiple times";
        }
        $seen{$rep_name}++;

        my $output_dir = $rep_name;

        unless (-s $left_fq) {
            die "Error, cannot locate file: $left_fq as specified in samples file: $samples_file";
        }
        $left_fq = &create_full_path($left_fq);
        if ($left_fq =~ /\.gz$/) {
            $left_fq = &add_zcat_gz($left_fq) if ($aln_method eq "bowtie");
        }

        if ($right_fq) {
            unless (-s $right_fq) {
                die "Error, cannot locate file $right_fq as specified in samples file: $samples_file";
            }
            $right_fq = &create_full_path($right_fq);
            if ($right_fq =~ /\.gz$/) {
                $right_fq = &add_zcat_gz($right_fq) if ($aln_method eq "bowtie");
            }
        }

        if ($left_fq && $right_fq) {
            push (@samples_to_process, { left => $left_fq,
                                         right => $right_fq,
                                         output_dir => $output_dir,
                                       } );
        }
        else {
            push (@samples_to_process, { single => $left_fq,
                                         output_dir => $output_dir,
                                       } );
        }
    }
    close $fh;

    return (@samples_to_process);
}
####
# Build the single sample definition used when reads are given directly on the
# command line (--left/--right or --single) instead of via --samples_file.
#
# Read files are made absolute, and .gz files are wrapped for on-the-fly
# decompression when the aligner is bowtie (v1).  Returns one hashref:
# { left, right, output_dir } when both mates are given, otherwise
# { single, output_dir }.
sub create_sample_definition {
    my ($output_dir, $left, $right, $single) = @_;

    # the loop variable aliases each of the three scalars, so assignments
    # below update $left / $right / $single in place
    foreach my $reads ($left, $right, $single) {
        next unless ($reads);

        $reads = &create_full_path($reads);

        if ($reads =~ /\.gz$/ && $aln_method eq "bowtie") {
            $reads = &add_zcat_gz($reads);
        }
    }

    my %sample = (output_dir => $output_dir);
    if ($left && $right) {
        @sample{qw(left right)} = ($left, $right);
    }
    else {
        $sample{single} = $single;
    }

    return ( \%sample );
}