biyelunwen/99.scripts/ticr/mb.pl

1103 lines
30 KiB
Perl
Executable File

#!/usr/bin/perl
use strict;
use warnings;
use POSIX;
use IO::Select;
use IO::Socket;
use Digest::MD5;
use Getopt::Long;
use Cwd qw(abs_path);
use Fcntl qw(:flock);
use File::Path qw(remove_tree);
use Time::HiRes qw(time usleep);
my $os_name = $^O;
# Turn on autoflush
$|++;
# Maximum number of threads to use
my $max_forks;
# Server port
my $port = 10002;
# Stores executing machine hostnames
my @machines;
my %machines;
# Path to text file containing computers to run on
my $machine_file_path;
# MrBayes block which will be used for each run
my $mb_block;
# Where this script is located
my $script_path = abs_path($0);
# Directory script was called from
my $init_dir = abs_path(".");
# Where the script was called from
my $initial_directory = $ENV{PWD};
# Allow for reusing info from an old run
my $input_is_dir = 0;
# How the script was called
my $invocation = "perl mb.pl @ARGV";
# Name of output directory
my $project_name = "mb-".int(time());
#my $project_name = "mb-dir";
# Read commandline settings
GetOptions(
#"no-forks" => \$no_forks,
"mb-block|m:s" => \$mb_block,
"machine-file:s" => \$machine_file_path,
"check|c:f" => \&check_nonconvergent,
"remove|r:f" => \&remove_nonconvergent,
"out-dir|o=s" => \$project_name,
"n-threads|T" => \$max_forks,
"port=i" => \$port,
"server-ip:s" => \&client, # for internal usage only
"help|h" => sub { print &help; exit(0); },
"usage" => sub { print &usage; exit(0); },
);
# Get paths to required executables
my $mb = check_path_for_exec("mb");
my $archive = shift(@ARGV);
# Some error checking
die "You must specify an archive file.\n\n", &usage if (!defined($archive));
die "Could not locate '$archive', perhaps you made a typo.\n" if (!-e $archive);
die "You specified a MrBayes run archive instead of an MDL gene archive.\n" if ($archive =~ /\.mb\.tar$/);
die "Could not locate '$machine_file_path'.\n" if (defined($machine_file_path) && !-e $machine_file_path);
die "You must specify a file containing a valid MrBayes block which will be appended to each gene.\n\n", &usage if (!defined($mb_block));
die "Could not locate '$mb_block', perhaps you made a typo.\n\n" if (!-e $mb_block);
# Input is a previous run directory, reuse information
$input_is_dir++ if (-d $archive);
# Determine which machines we will run the analyses on
if (defined($machine_file_path)) {
# Get list of machines
print "Fetching machine names listed in '$machine_file_path'...\n";
open(my $machine_file, '<', $machine_file_path);
chomp(@machines = <$machine_file>);
close($machine_file);
# Check that we can connect to specified machines
foreach my $index (0 .. $#machines) {
my $machine = $machines[$index];
print " Testing connection to: $machine...\n";
# Attempt to ssh onto machine with a five second timeout
my $ssh_test = `timeout 5 ssh -v $machine exit 2>&1`;
# Look for machine's IP in test connection
my $machine_ip;
if ($ssh_test =~ /Connecting to \S+ \[(\S+)\] port \d+\./s) {
$machine_ip = $1;
}
# Could connect but passwordless login not enabled
if ($ssh_test =~ /Are you sure you want to continue connecting \(yes\/no\)/s) {
print " Connection to $machine failed, removing from list of useable machines (passwordless login not enabled).\n";
splice(@machines, $index, 1);
}
# Successful connection
elsif (defined($machine_ip)) {
print " Connection to $machine [$machine_ip] successful.\n";
$machines{$machine} = $machine_ip;
}
# Unsuccessful connection
else {
print " Connection to $machine failed, removing from list of useable machines.\n";
splice(@machines, $index, 1);
}
}
}
print "\nScript was called as follows:\n$invocation\n";
# Load MrBayes block into memory
open(my $mb_block_file, "<", $mb_block) or die "Could not open '$mb_block': $!.\n";
my @mb_block = <$mb_block_file>;
close($mb_block_file);
my $archive_root;
my $archive_root_no_ext;
if (!$input_is_dir) {
# Clean run with no prior output
# Extract name information from input file
($archive_root = $archive) =~ s/.*\/(.*)/$1/;
($archive_root_no_ext = $archive) =~ s/(.*\/)?(.*)(\.tar\.gz)|(\.tgz)/$2/;
# Initialize working directory
# Remove conditional eventually
mkdir($project_name) || die "Could not create '$project_name'$!.\n" if (!-e $project_name);
my $archive_abs_path = abs_path($archive);
# Remove conditional eventually
run_cmd("ln -s $archive_abs_path $project_name/$archive_root") if (! -e "$project_name/$archive_root");
}
else {
# Prior output available, set relevant variables
$project_name = $archive;
my @contents = glob("$project_name/*");
# Determine the archive name by looking for a symlink
my $found_name = 0;
foreach my $file (@contents) {
if (-l $file) {
$file =~ s/\Q$project_name\E\///;
#$archive = $file;
$archive = "$project_name/$file";
$found_name = 1;
}
}
die "Could not locate archive in '$project_name'.\n" if (!$found_name);
# Extract name information from input file
($archive_root = $archive) =~ s/.*\/(.*)/$1/;
($archive_root_no_ext = $archive) =~ s/(.*\/)?(.*)(\.tar\.gz)|(\.tgz)/$2/;
}
# The name of the output archive
my $mb_archive = "$archive_root_no_ext.mb.tar";
chdir($project_name);
# Change how Ctrl+C is interpreted to allow for clean up
$SIG{'INT'} = 'INT_handler';
# Define and initialize directories
my $gene_dir = "genes/";
mkdir($gene_dir) or die "Could not create '$gene_dir': $!.\n" if (!-e $gene_dir);
# Check if completed genes from a previous run exist
my %complete_genes;
if (-e $mb_archive) {
print "\nArchive containing completed MrBayes runs found for this dataset found in '$mb_archive'.\n";
print "Completed runs contained in this archive will be removed from the job queue.\n";
# Add gene names in tarball to list of completed genes
chomp(my @complete_genes = `tar tf '$mb_archive'`);
foreach my $gene (@complete_genes) {
$gene =~ s/\.tar\.gz//;
$complete_genes{$gene}++;
}
}
# Unarchive input genes
chomp(my @genes = `tar xvf '$init_dir/$archive' -C $gene_dir 2>&1`);
@genes = map { s/x //; $_ } @genes if ($os_name eq "darwin");
chdir($gene_dir);
# Remove completed genes
if (%complete_genes) {
foreach my $index (reverse(0 .. $#genes)) {
if (exists($complete_genes{$genes[$index]})) {
unlink($genes[$index]);
splice(@genes, $index, 1);
}
}
}
die "\nAll jobs have already completed.\n\n" if (!@genes);
# Append given MrBayes block to the end of each gene
print "\nAppending MrBayes block to each gene... ";
foreach my $gene (@genes) {
open(my $gene_file, ">>", $gene) or die "Could not open '$gene': $!.\n";
print {$gene_file} "\n", @mb_block;
close($gene_file);
}
print "done.\n\n";
# Returns the external IP address of this computer
chomp(my $server_ip = `dig +short myip.opendns.com \@resolver1.opendns.com 2>&1`);
if ($server_ip !~ /(?:[0-9]{1,3}\.){3}[0-9]{1,3}/) {
print "Could not determine external IP address, only local clients will be created.\n";
$server_ip = "127.0.0.1";
}
# Initialize a server
my $sock = IO::Socket::INET->new(
LocalPort => $port,
Blocking => 0,
Reuse => 1,
Listen => SOMAXCONN,
Proto => 'tcp')
or die "Could not create server socket: $!.\n";
$sock->autoflush(1);
print "Job server successfully created.\n";
# Should probably do this earlier
# Determine server hostname and add to machines if none were specified by the user
chomp(my $server_hostname = `hostname`);
if (scalar(@machines) == 0) {
push(@machines, $server_hostname);
$machines{$server_hostname} = "127.0.0.1";
}
elsif (scalar(@machines) == 1) {
# Check if the user input only the local machine in the config
if ($machines{$machines[0]} eq $server_ip) {
$machines{$machines[0]} = "127.0.0.1";
}
}
my @pids;
foreach my $machine (@machines) {
# Fork and create a client on the given machine
my $pid = fork();
if ($pid == 0) {
close(STDIN);
close(STDOUT);
close(STDERR);
(my $script_name = $script_path) =~ s/.*\///;
# Move required datafiles to machines, initialize clients
if ($machines{$machine} ne "127.0.0.1" && $machines{$machine} ne $server_ip) {
# Send this script to the machine
system("scp", "-q", $script_path, $machine.":/tmp");
# Send MrBayes executable to the machine
system("scp", "-q", $mb, $machine.":/tmp");
# Execute this perl script on the given machine
# -tt forces pseudo-terminal allocation and lets us stop remote processes
exec("ssh", "-tt", "$machine", "perl", "/tmp/$script_name", "--server-ip=$server_ip:$port");
}
else {
# Send this script to the machine
system("cp", $script_path, "/tmp");
# Send MrBayes executable to the machine
system("cp", $mb, "/tmp");
# Execute this perl script on the given machine
exec("perl", "/tmp/$script_name", "--server-ip=127.0.0.1:$port");
}
exit(0);
}
else {
push(@pids, $pid);
}
}
#chdir($gene_dir);
my $select = IO::Select->new($sock);
# Don't create zombies
$SIG{CHLD} = 'IGNORE';
# Stores which job is next in queue
my $job_number = 0;
# Number of open connections to a client
my $total_connections;
# Number of complete jobs (necessary?)
my $complete_count = 0;
# Number of connections server has closed
my $closed_connections = 0;
# Minimum number of connections server should expect
my $starting_connections = scalar(@machines);
my $time = time();
my $num_digits = get_num_digits({'NUMBER' => scalar(@genes)});
# Begin the server's job distribution
while ((!defined($total_connections) || $closed_connections != $total_connections) || $total_connections < $starting_connections) {
# Contains handles to clients which have sent information to the server
my @clients = $select->can_read(0);
# Free up CPU by sleeping for 10 ms
usleep(10000);
# Handle each ready client individually
CLIENT: foreach my $client (@clients) {
# Client requesting new connection
if ($client == $sock) {
$total_connections++;
$select->add($sock->accept());
}
else {
# Get client's message
my $response = <$client>;
next if (not defined($response)); # a response should never actually be undefined
# Client wants to send us a file
if ($response =~ /SEND_FILE: (.*)/) {
my $file_name = $1;
receive_file({'FILE_PATH' => $file_name, 'FILE_HANDLE' => $client});
}
# Client has finished a job
if ($response =~ /DONE (.*) \|\|/) {
$complete_count++;
printf(" Analyses complete: %".$num_digits."d/%d.\r", $complete_count, scalar(@genes));
# Perform appending of new gene to tarball in a fork as this can take some time
my $pid;
until (defined($pid)) { $pid = fork(); usleep(30000); }
if ($pid == 0) {
# Check if this is the first to complete, if so we must create the directory
my $completed_gene = $1;
if (!-e "../$mb_archive") {
system("touch", "$mb_archive");
system("tar", "cf", "../$mb_archive", $completed_gene);
unlink($completed_gene);
}
else {
# Obtain a file lock on archive so another process doesn't simultaneously try to add to it
open(my $mb_archive_file, "<", "../$mb_archive");
flock($mb_archive_file, LOCK_EX) || die "Could not lock '$mb_archive_file': $!.\n";
# Add completed gene
system("tar", "rf", "../$mb_archive", $completed_gene);
unlink($completed_gene);
# Release lock
flock($mb_archive_file, LOCK_UN) || die "Could not unlock '$mb_archive_file': $!.\n";
close($mb_archive_file);
}
exit(0);
}
else {
push(@pids, $pid);
}
}
# Client wants a new job
if ($response =~ /NEW: (.*)/) {
my $client_ip = $1;
# Check if jobs remain in the queue
if ($job_number < scalar(@genes)) {
printf("\n Analyses complete: %".$num_digits."d/%d.\r", 0, scalar(@genes)) if ($job_number == 0);
my $gene = $genes[$job_number];
# Check whether the client is remote or local, send it needed files if remote
if ($client_ip ne $server_ip) {
# Fork to perform the file transfer and prevent stalling the server
my $pid;
until (defined($pid)) { $pid = fork(); usleep(30000); }
#my $pid = fork();
if ($pid == 0) {
send_file({'FILE_PATH' => $gene, 'FILE_HANDLE' => $client});
unlink($gene);
print {$client} "NEW: $gene\n";
exit(0);
}
else {
push(@pids, $pid);
}
}
else {
print {$client} "CHDIR: ".abs_path("./")."\n";
print {$client} "NEW: $gene\n";
}
$job_number++;
}
else {
# Client has asked for a job, but there are none remaining
print {$client} "HANGUP\n";
$select->remove($client);
$client->close();
$closed_connections++;
next CLIENT;
}
}
}
}
}
# Don't think this is needed
foreach my $pid (@pids) {
waitpid($pid, 0);
}
print "\n All connections closed.\n";
print "Total execution time: ", sec2human(time() - $time), ".\n\n";
# Go back to project directory, delete empty gene dir
chdir("..");
&INT_handler;
sub client {
my ($opt_name, $address) = @_;
my ($server_ip, $port) = split(":", $address);
chdir("/tmp");
my $mb = "/tmp/mb";
#my $pgrp = getpgrp();
my $pgrp = $$;
setpgrp();
# Determine this host's IP
chomp(my $ip = `dig +short myip.opendns.com \@resolver1.opendns.com`);
# Set IP to localhost if we don't have internet
if ($ip !~ /(?:[0-9]{1,3}\.){3}[0-9]{1,3}/) {
$ip = "127.0.0.1";
}
# Spawn more clients
my @pids;
# my $total_forks = get_free_cpus();
# A slightly modification in order to limit forks to 10
my $total_forks = 12;
if ($total_forks > 1) {
foreach my $fork (1 .. $total_forks - 1) {
my $pid = fork();
if ($pid == 0) {
last;
}
else {
push(@pids, $pid);
}
}
}
# The name of the gene we are working on
my $gene;
# Stores filenames of unneeded files
my @unlink;
# Change signal handling so killing the server kills these processes and cleans up
$SIG{CHLD} = 'IGNORE';
$SIG{HUP} = sub { unlink($0, $mb); kill -15, $$; };
$SIG{TERM} = sub { unlink(glob($gene."*")) if defined($gene); exit(0)};
# Connect to the server
my $sock = new IO::Socket::INET(
PeerAddr => $server_ip.":".$port,
Proto => 'tcp')
or exit(0);
$sock->autoflush(1);
print {$sock} "NEW: $ip\n";
while (chomp(my $response = <$sock>)) {
if ($response =~ /SEND_FILE: (.*)/) {
my $file_name = $1;
receive_file({'FILE_PATH' => $file_name, 'FILE_HANDLE' => $sock});
}
elsif ($response =~ /CHDIR: (.*)/) {
chdir($1);
}
elsif ($response =~ /NEW: (.*)/) {
$gene = $1;
# Redirect STDOUT to a log file
open(my $std_out, ">&", *STDOUT);
open(STDOUT, ">", $gene.".log");
system($mb, $gene);
# Put STDOUT back to normal
open(STDOUT, ">&", $std_out);
close($std_out);
unlink($gene);
# Zip and tarball the results
my @results = glob($gene."*");
my $gene_archive_name = "$gene.tar.gz";
@results = grep {!/\Q$gene_archive_name\E/} @results;
system("tar", "czf", $gene_archive_name, @results);
unlink(@results);
# Send the results back to the server if this is a remote client
if ($server_ip ne "127.0.0.1" && $server_ip ne $ip) {
send_file({'FILE_PATH' => $gene_archive_name, 'FILE_HANDLE' => $sock});
unlink($gene_archive_name);
}
# Request a new job
print {$sock} "DONE $gene_archive_name || NEW: $ip\n";
}
elsif ($response eq "HANGUP") {
last;
}
}
# Have initial client wait for all others to finish and clean up
if ($$ == $pgrp) {
foreach my $pid (@pids) {
waitpid($pid, 0);
}
unlink($0, $mb);
}
exit(0);
}
sub check_nonconvergent {
my ($opt_name, $threshold) = @_;
# We have to do weird things here to get the input name
my @ARGV = split(/\s+/, $invocation);
shift(@ARGV); shift(@ARGV);
# Look for a directory in arguments provided
my $archive;
foreach my $arg (@ARGV) {
if (-d $arg) {
$archive = $arg;
}
}
# Die if user didn't give us a directory
if (!defined($archive)) {
print "You must specify a directory previously generated by this script to check for nonconvergent genes.\n";
exit(0);
}
# Prior output available, set relevant variables
$project_name = $archive;
my @contents = glob("$project_name/*");
# Determine the archive name by looking for a symlink
my $found_name = 0;
foreach my $file (@contents) {
if (-l $file) {
$file =~ s/\Q$project_name\E\///;
$archive = $file;
$found_name = 1;
}
}
die "Could not locate archive in '$project_name'.\n" if (!$found_name);
chdir($project_name);
# Extract name information from input file
(my $archive_root = $archive) =~ s/.*\/(.*)/$1/;
(my $archive_root_no_ext = $archive) =~ s/(.*\/)?(.*)(\.tar\.gz)|(\.tgz)/$2/;
# Should have some completed genes in it
my $incomplete_archive = $archive_root_no_ext.".mb.tar";
# Check that the incomplete archive exists
if (!-e $incomplete_archive) {
print "Could not locate an archive containing completed MrBayes runs.\n";
exit(0);
}
print "\nScript was called as follows:\n$invocation\n\n";
# Create a temporary directory for our operations
my $check_dir = "tmp/";
mkdir($check_dir) if (!-e $check_dir);
$SIG{INT} = sub { remove_tree($check_dir); exit(0) };
# Open tarball in genes directory
chomp(my @genes = `tar xvf '$incomplete_archive' -C $check_dir 2>&1`);
@genes = map { s/x //; $_ } @genes if ($os_name eq "darwin");
@genes = sort { (local $a = $a) =~ s/.*-(\d+)-\d+\..*/$1/;
(local $b = $b) =~ s/.*-(\d+)-\d+\..*/$1/;
$a <=> $b } @genes;
my $longest_name_length = length($genes[$#genes]);
print "MrBayes results available for ", scalar(@genes), " total genes:\n";
chdir($check_dir);
$SIG{INT} = sub { chdir(".."); remove_tree($check_dir); exit(0) };
# Parse log of each gene to determine final standard deviation of split frequencies
my $count = 0;
foreach my $gene (@genes) {
chomp(my @contents = `tar xvf '$gene' 2>&1`);
@contents = map { s/x //; $_ } @contents if ($os_name eq "darwin");
(my $log_file_path = $gene) =~ s/\.tar\.gz$/.log/;
# Check log file exists
if (!-e $log_file_path) {
print "Could not locate log file for '$gene'.\n";
exit(0);
}
open(my $log_file, "<", $log_file_path);
chomp(my @data = <$log_file>);
close($log_file);
my @splits = grep { /Average standard deviation of split frequencies:/ } @data;
my $final_split = pop(@splits);
$final_split =~ s/.*frequencies: (.*)/$1/;
#print " $gene: $final_split\n";
printf(" %-${longest_name_length}s: %s\n", $gene, $final_split);
if (!defined($final_split) || $final_split > $threshold) {
$count++;
}
unlink(@contents);
}
printf("%d gene(s) failed to meet the threshold of %s (%.2f%%).\n", $count, $threshold, ($count / scalar(@genes) * 100));
# Clean up and exit
kill(2, $$);
}
sub remove_nonconvergent {
my ($opt_name, $threshold) = @_;
# We have to do weird things here to get the input name
my @ARGV = split(/\s+/, $invocation);
shift(@ARGV); shift(@ARGV);
# Look for a directory in arguments provided
my $archive;
foreach my $arg (@ARGV) {
if (-d $arg) {
$archive = $arg;
}
}
# Die if user didn't give us a directory
if (!defined($archive)) {
print "You must specify a directory previously generated by this script to check for nonconvergent genes.\n";
exit(0);
}
my $initial_archive = $archive;
# Prior output available, set relevant variables
$project_name = $archive;
my @contents = glob("$project_name/*");
# Determine the archive name by looking for a symlink
my $found_name = 0;
foreach my $file (@contents) {
if (-l $file) {
$file =~ s/\Q$project_name\E\///;
$archive = $file;
$found_name = 1;
}
}
die "Could not locate archive in '$project_name'.\n" if (!$found_name);
chdir($project_name);
# Extract name information from input file
(my $archive_root = $archive) =~ s/.*\/(.*)/$1/;
(my $archive_root_no_ext = $archive) =~ s/(.*\/)?(.*)(\.tar\.gz)|(\.tgz)/$2/;
# Should have some completed genes in it
my $incomplete_archive = $archive_root_no_ext.".mb.tar";
# Check that the incomplete archive exists
if (!-e $incomplete_archive) {
print "Could not locate an archive containing completed MrBayes runs.\n";
exit(0);
}
print "\nScript was called as follows:\n$invocation\n\n";
# Create a temporary directory for our operations
my $check_dir = "tmp/";
mkdir($check_dir) if (!-e $check_dir);
$SIG{INT} = sub { remove_tree($check_dir); exit(0) };
# Open tarball in genes directory
chomp(my @genes = `tar xvf '$incomplete_archive' -C $check_dir 2>&1`);
@genes = map { s/x //; $_ } @genes if ($os_name eq "darwin");
@genes = sort { (local $a = $a) =~ s/.*-(\d+)-\d+\..*/$1/;
(local $b = $b) =~ s/.*-(\d+)-\d+\..*/$1/;
$a <=> $b } @genes;
my $longest_name_length = length($genes[$#genes]);
print "MrBayes results available for ", scalar(@genes), " total genes:\n";
chdir($check_dir);
$SIG{INT} = sub { chdir(".."); remove_tree($check_dir); exit(0) };
# Parse log of each gene to determine final standard deviation of split frequencies
my $count = 0;
foreach my $gene (@genes) {
chomp(my @contents = `tar xvf '$gene' 2>&1`);
@contents = map { s/x //; $_ } @contents if ($os_name eq "darwin");
(my $log_file_path = $gene) =~ s/\.tar\.gz$/.log/;
# Check log file exists
if (!-e $log_file_path) {
print "Could not locate log file for '$gene'.\n";
exit(0);
}
open(my $log_file, "<", $log_file_path);
chomp(my @data = <$log_file>);
close($log_file);
my @splits = grep { /Average standard deviation of split frequencies:/ } @data;
my $final_split = pop(@splits);
$final_split =~ s/.*frequencies: (.*)/$1/;
#print " $gene: $final_split";
if (!defined($final_split) || $final_split > $threshold) {
unlink($gene);
#print " -- REMOVED\n";
printf(" %-${longest_name_length}s: %s -- REMOVED\n", $gene, $final_split);
$count++;
}
else {
printf(" %-${longest_name_length}s: %s\n", $gene, $final_split);
#print "\n";
}
unlink(@contents);
}
printf("%d gene(s) failed to meet the threshold of %s (%.2f%%) and have been removed.\n", $count, $threshold, ($count / scalar(@genes) * 100));
# Determine which genes met threshold and still remain
@genes = glob($archive_root_no_ext."*.nex.tar.gz");
@genes = sort { (local $a = $a) =~ s/.*-(\d+)-\d+\..*/$1/;
(local $b = $b) =~ s/.*-(\d+)-\d+\..*/$1/;
$a <=> $b } @genes;
# Recreate archive with remaining genes
if (@genes) {
system("tar", "cf", $incomplete_archive, @genes);
unlink(@genes);
system("mv", $incomplete_archive, "..");
}
else {
# Delete the working directory if no genes meet the threshold
print "No genes met the threshold, removing specified directory.\n";
chdir($initial_directory);
$SIG{INT} = sub { remove_tree($initial_archive); exit(0) };
}
# Clean up and exit
kill(2, $$);
}
sub hashsum {
my $settings = shift;
my $file_path = $settings->{'FILE_PATH'};
open(my $file, "<", $file_path) or die "Couldn't open file '$file_path': $!.\n";
my $md5 = Digest::MD5->new;
my $md5sum = $md5->addfile(*$file)->hexdigest;
close($file);
return $md5sum;
}
sub send_file {
my $settings = shift;
my $file_path = $settings->{'FILE_PATH'};
my $file_handle = $settings->{'FILE_HANDLE'};
my $hash = hashsum({'FILE_PATH' => $file_path});
print {$file_handle} "SEND_FILE: $file_path\n";
open(my $file, "<", $file_path) or die "Couldn't open file '$file_path': $!.\n";
while (<$file>) {
print {$file_handle} $_;
}
close($file);
print {$file_handle} " END_FILE: $hash\n";
# Stall until we know status of file transfer
while (defined(my $response = <$file_handle>)) {
chomp($response);
last if ($response eq "TRANSFER_SUCCESS");
die "Unsuccessful file transfer, checksums did not match.\n" if ($response eq "TRANSFER_FAILURE");
}
}
sub receive_file {
my $settings = shift;
my $file_path = $settings->{'FILE_PATH'};
my $file_handle = $settings->{'FILE_HANDLE'};
my $check_hash;
open(my $file, ">", $file_path);
while (<$file_handle>) {
if ($_ =~ /(.*) END_FILE: (\S+)/) {
print {$file} $1;
$check_hash = $2;
last;
}
else {
print {$file} $_;
}
}
close($file);
# Use md5 hashsum to make sure transfer worked
my $hash = hashsum({'FILE_PATH' => $file_path});
if ($hash ne $check_hash) {
die "Unsuccessful file transfer, checksums do not match.\n'$hash' - '$check_hash'\n"; # hopefully this never pops up
print {$file_handle} "TRANSFER_FAILURE\n"
}
else {
print {$file_handle} "TRANSFER_SUCCESS\n";
}
}
sub INT_handler {
# Kill ssh process(es) spawn by this script
foreach my $pid (@pids) {
#kill(-1, $pid);
kill(15, $pid);
}
# Move into gene directory
#chdir("$initial_directory");
chdir("$initial_directory/$project_name");
# Try to delete directory five times, if it can't be deleted print an error message
# I've found this method is necessary for analyses performed on AFS drives
my $count = 0;
until (!-e $gene_dir || $count == 5) {
$count++;
remove_tree($gene_dir, {error => \my $err});
sleep(1);
}
#logger("Could not clean all files in './$gene_dir/'.") if ($count == 5);
print "Could not clean all files in './$gene_dir/'.\n" if ($count == 5);
exit(0);
}
sub clean_up {
my $settings = shift;
my $remove_dirs = $settings->{'DIRS'};
my $current_dir = getcwd();
# chdir($alignment_root);
# unlink(glob($gene_dir."$alignment_name*"));
# #unlink($server_check_file) if (defined($server_check_file));
#
# if ($remove_dirs) {
# rmdir($gene_dir);
# }
chdir($current_dir);
}
sub get_num_digits {
my $settings = shift;
my $number = $settings->{'NUMBER'};
my $digits = 1;
while (floor($number / 10) != 0) {
$number = floor($number / 10);
$digits++;
}
return $digits;
}
sub sec2human {
my $secs = shift;
# Constants
my $secs_in_min = 60;
my $secs_in_hour = 60 * 60;
my $secs_in_day = 24 * 60 * 60;
$secs = int($secs);
return "0 seconds" if (!$secs);
# Calculate units of time
my $days = int($secs / $secs_in_day);
my $hours = ($secs / $secs_in_hour) % 24;
my $mins = ($secs / $secs_in_min) % 60;
$secs = $secs % 60;
# Format return nicely
my $time;
if ($days) {
$time .= ($days != 1) ? "$days days, " : "$days day, ";
}
if ($hours) {
$time .= ($hours != 1) ? "$hours hours, " : "$hours hour, ";
}
if ($mins) {
$time .= ($mins != 1) ? "$mins minutes, " : "$mins minute, ";
}
if ($secs) {
$time .= ($secs != 1) ? "$secs seconds " : "$secs second ";
}
else {
# Remove comma
chop($time);
}
chop($time);
return $time;
}
sub get_free_cpus {
return $max_forks if (defined($max_forks));
my $os_name = $^O;
# Returns a two-member array containing CPU usage observed by top,
# top is run twice as its first output is usually inaccurate
my @percent_free_cpu;
if ($os_name eq "darwin") {
# Mac OS
chomp(@percent_free_cpu = `top -i 1 -l 2 | grep "CPU usage"`);
}
else {
# Linux
chomp(@percent_free_cpu = `top -b -n2 -d0.05 | grep "Cpu(s)"`);
}
my $percent_free_cpu = pop(@percent_free_cpu);
if ($os_name eq "darwin") {
# Mac OS
$percent_free_cpu =~ s/.*?(\d+\.\d+)%\s+id.*/$1/;
}
else {
# linux
$percent_free_cpu =~ s/.*?(\d+\.\d)\s*%?ni,\s*(\d+\.\d)\s*%?id.*/$1 + $2/; # also includes %nice as free
$percent_free_cpu = eval($percent_free_cpu);
}
my $total_cpus;
if ($os_name eq "darwin") {
# Mac OS
$total_cpus = `sysctl -n hw.ncpu`;
}
else {
# linux
$total_cpus = `grep --count 'cpu' /proc/stat` - 1;
}
my $free_cpus = ceil($total_cpus * $percent_free_cpu / 100);
if ($free_cpus == 0 || $free_cpus !~ /^\d+$/) {
$free_cpus = 1; # assume that at least one cpu can be used
}
return $free_cpus;
}
sub run_cmd {
my $command = shift;
my $return = system($command);
if ($return) {
logger("'$command' died with error: '$return'.\n");
#kill(2, $parent_pid);
exit(0);
}
}
sub check_path_for_exec {
my $exec = shift;
my $path = $ENV{PATH}.":."; # include current directory as well
my @path_dirs = split(":", $path);
my $exec_path;
foreach my $dir (@path_dirs) {
$dir .= "/" if ($dir !~ /\/$/);
$exec_path = abs_path($dir.$exec) if (-e $dir.$exec && -x $dir.$exec && !-d $dir.$exec);
}
die "Could not find the following executable: '$exec'. This script requires this program in your path.\n" if (!defined($exec_path));
return $exec_path;
}
sub usage {
return "Usage: mb.pl ([PARTITION TARBALL] [-m MRBAYES BLOCK]) || ([MRBAYES TARBALL] [-c THRESHOLD] || [-r THRESHOLD])\n";
}
sub help {
print <<EOF;
@{[usage()]}
Parallel execution of MrBayes on a large dataset
-m, --mb-block text file containing MrBayes commands to append to each input partition (REQUIRED)
-c, --check outputs how many MrBayes runs standard deviation of split frequencies have reached the specified threshold
-r, --remove removes MrBayes runs with standard deviation of split frequencies below the specified threshold
-o, --out-dir name of the directory to store output files in (default: "mb-" + Unix time of script invocation)
-T, --n-threads the number of forks ALL hosts running analyses can use concurrently (default: current number of free CPUs)
--machine-file file name containing hosts to ssh onto and perform analyses on, passwordless login MUST be enabled
for each host specified in this file
--port specifies the port to utilize on the server (Default: 10002)
-h, --help display this help and exit
--usage display proper script invocation format
Examples:
perl mb.pl align.tgz -m bayes.txt --machine-file hosts.txt runs MrBayes on each partition stored in align.tgz using parameters
stored in bayes.txt on computers specified in hosts.txt
perl mb.pl align.mb.tar --check 0.02 prints which genes in align.mb.tar have MCMC chains which reached a
standard deviation of split frequencies below 0.02
perl mb.pl align.mb.tar --remove 0.05 removes genes in align.mb.tar that have MCMC chains which did not reach
a standard deviation of split frequencies of 0.05
Mail bug reports and suggestions to <noah.stenz.github\@gmail.com>
EOF
exit(0);
}