biyelunwen/99.scripts/miscs/bucky.pl

#!/usr/bin/perl
use strict;
use warnings;
use POSIX;
use IO::Select;
use IO::Socket;
use Digest::MD5;
use Getopt::Long;
use Cwd qw(abs_path);
use Fcntl qw(:flock SEEK_END);
use POSIX qw(ceil :sys_wait_h);
use File::Path qw(remove_tree);
use Time::HiRes qw(time usleep);

# Get OS name
my $os_name = $^O;

# Turn on autoflush
$|++;

# Max number of forks to use
my $max_forks = get_free_cpus();

# Server port
my $port = 10003;

# Stores executing machine hostnames
my @machines;
my %machines;

# Path to text file containing computers to run on
my $machine_file_path;

# MrBayes block which will be used for each run
my $mb_block;

# Where this script is located
my $script_path = abs_path($0);

# Directory script was called from
my $init_dir = abs_path(".");

# Where the script was called from
my $initial_directory = $ENV{PWD};

# General script settings
my $no_forks;

# Allow for reusing info from an old run
my $input_is_dir = 0;

# Allow user to specify mbsum output as input for the script
my $input_is_mbsum = 0;

# How the script was called
my $invocation = "perl bucky.pl @ARGV";

# Name of output directory
my $project_name = "bucky-".int(time());
#my $project_name = "bucky-dir";

# BUCKy settings
my $alpha = 1;
my $ngen = 1000000;

my @unlink;

# Read commandline settings
GetOptions(
	"no-forks"          => \$no_forks,
	"machine-file=s"    => \$machine_file_path,
	"alpha|a=s"         => \$alpha,
	"ngen|n=i"          => \$ngen,
	"port=i"            => \$port,
	"no-mbsum|s"        => \$input_is_mbsum,
	"n-threads|T=i"     => \$max_forks,
	"out-dir|o=s"       => \$project_name,
	"server-ip=s"       => \&client, # for internal usage only
	"help|h"            => sub { print &help; exit(0); },
	"usage"             => sub { print &usage; exit(0); },
);


# Get paths to required executables
my $bucky = check_path_for_exec("bucky");
my $mbsum = check_path_for_exec("mbsum") if (!$input_is_mbsum);

# Check that BUCKy version >= 1.4.4
check_bucky_version($bucky);

my $archive = shift(@ARGV);

# Some error checking
die "You must specify an archive file.\n\n", &usage if (!defined($archive));
die "Could not locate '$archive', perhaps you made a typo.\n" if (!-e $archive);
die "Could not locate '$machine_file_path'.\n" if (defined($machine_file_path) && !-e $machine_file_path);
die "Invalid alpha for BUCKy specified, input must be a float or 'infinity'.\n" if ($alpha !~ /(^inf(inity)?)|(^\d+(\.\d+)?$)/i);

# Input is a previous run directory, reuse information
$input_is_dir++ if (-d $archive);

# Determine which machines we will run the analyses on
if (defined($machine_file_path)) {

	# Get list of machines
	print "Fetching machine names listed in '$machine_file_path'...\n";
	open(my $machine_file, '<', $machine_file_path);
	chomp(@machines = <$machine_file>);
	close($machine_file);

	# Check that we can connect to specified machines
	foreach my $index (0 .. $#machines) {
		my $machine = $machines[$index];
		print "  Testing connection to: $machine...\n";

		# Attempt to ssh onto machine with a five second timeout
		my $ssh_test = `timeout 5 ssh -v $machine exit 2>&1`;

		# Look for machine's IP in test connection
		my $machine_ip;
		if ($ssh_test =~ /Connecting to \S+ \[(\S+)\] port \d+\./s) {
			$machine_ip = $1;
		}

		# Could connect but passwordless login not enabled
		if ($ssh_test =~ /Are you sure you want to continue connecting \(yes\/no\)/s) {
			print "    Connection to $machine failed, removing from list of useable machines (passwordless login not enabled).\n";
			splice(@machines, $index, 1);
		}
		# Successful connection
		elsif (defined($machine_ip)) {
			print "    Connection to $machine [$machine_ip] successful.\n";
			$machines{$machine} = $machine_ip;
		}
		# Unsuccessful connection
		else {
			print "    Connection to $machine failed, removing from list of useable machines.\n";
			splice(@machines, $index, 1);
		}
	}
}

print "\nScript was called as follows:\n$invocation\n\n";

my $archive_root;
my $archive_root_no_ext;
if (!$input_is_dir) {

	# Clean run with no prior output

	# Extract name information from input file

	if ($input_is_mbsum) {
		($archive_root = $archive) =~ s/.*\/(.*)/$1/;
		($archive_root_no_ext = $archive) =~ s/(.*\/)?(.*)(\.tar(\.gz)?$)|(\.tgz$)/$2/;
	}
	else {
		($archive_root = $archive) =~ s/.*\/(.*)/$1/;
		($archive_root_no_ext = $archive) =~ s/(.*\/)?(.*)(\.mb\.tar(\.gz)?$)|(\.mb\.tgz$)/$2/;
	}
	die "Could not determine archive root name, did you specify the proper input?\n" if (!defined($archive_root_no_ext));

	# Initialize working directory
	# Remove conditional eventually
	mkdir($project_name) || die "Could not create '$project_name'$!.\n" if (!-e $project_name);

	my $archive_abs_path = abs_path($archive);
	# Remove conditional eventually
	run_cmd("ln -s $archive_abs_path $project_name/$archive_root") if (! -e "$project_name/$archive_root");
}
else {

	# Prior output available, set relevant variables

	$project_name = $archive;
	my @contents = glob("$project_name/*");

	# Determine the archive name by looking for a symlink
	my $found_name = 0;
	foreach my $file (@contents) {
		if (-l $file) {
			$file =~ s/\Q$project_name\E\///;
			$archive = $file;
			$found_name = 1;
		}
	}
	die "Could not locate archive in '$project_name'.\n" if (!$found_name);

	# Extract name information from input file
#	($archive_root = $archive) =~ s/.*\/(.*)/$1/;
#	($archive_root_no_ext = $archive) =~ s/(.*\/)?(.*)(\.mb\.tar(\.gz)?$)|(\.mb\.tgz$)/$2/;
	if ($input_is_mbsum) {
		($archive_root = $archive) =~ s/.*\/(.*)/$1/;
		($archive_root_no_ext = $archive) =~ s/(.*\/)?(.*)(\.tar(\.gz)?$)|(\.tgz$)/$2/;
	}
	else {
		($archive_root = $archive) =~ s/.*\/(.*)/$1/;
		($archive_root_no_ext = $archive) =~ s/(.*\/)?(.*)(\.mb\.tar(\.gz)?$)|(\.mb\.tgz$)/$2/;
	}
	die "Could not determine archive root name, is your input file properly named?\n" if (!defined($archive_root_no_ext));
}

# The name of the output archive
my $mbsum_archive = "$archive_root_no_ext.mbsum.tar.gz";
my $bucky_archive = "$archive_root_no_ext.BUCKy.tar";
my $quartet_output = "$archive_root_no_ext.CFs.csv";

$mbsum_archive = $archive if ($input_is_mbsum);

chdir($project_name);

# Change how Ctrl+C is interpreted to allow for clean up
$SIG{'INT'} = 'INT_handler';

# Define and initialize directories
my $mb_out_dir = "mb-out/";
my $mb_sum_dir = "mb-sum/";

mkdir($mb_out_dir) or die "Could not create '$mb_out_dir': $!.\n" if (!-e $mb_out_dir);
mkdir($mb_sum_dir) or die "Could not create '$mb_sum_dir': $!.\n" if (!-e $mb_sum_dir);

# Check if completed genes from a previous run exist
my %complete_quartets;
if (-e $bucky_archive && -e $quartet_output) {
	print "Archive containing completed quartets found for this dataset found in '$bucky_archive'.\n";
	print "Completed quartets within in this archive will be removed from the job queue.\n\n";

	# Because it takes longer to append to a tarball than append to a text file, the tarball most
	# likely has fewer quartet entries in it, we must therefore account for this ensuring that
	# the tarball and csv have the same quartet entries

	# See which quartets in the tarball are complete
	chomp(my @complete_quartets_tarball = `tar tf '$init_dir/$project_name/$bucky_archive'`);

	# Add quartets to a hash for easier lookup
	my %complete_quartets_tarball;
	foreach my $complete_quartet (@complete_quartets_tarball) {
		$complete_quartets_tarball{$complete_quartet}++;
	}

	# Load csv into memory
	#open(my $quartet_output_file, "<", $quartet_output);
	open(my $quartet_output_file, "<", "$init_dir/$project_name/$quartet_output");
	(my @quartet_info = <$quartet_output_file>);
	close($quartet_output_file);

	# Remove header line
	my $header = shift(@quartet_info);

	# Rewrite csv to include only quartets also contained in tarball
	#open($quartet_output_file, ">", $quartet_output);
	open($quartet_output_file, ">", "$init_dir/$project_name/$quartet_output");
	print {$quartet_output_file} $header;
	foreach my $quartet (@quartet_info) {
		my ($taxon1, $taxon2, $taxon3, $taxon4) = split(",", $quartet);
		my $quartet_name = "$taxon1--$taxon2--$taxon3--$taxon4";
		my $quartet_name_tarball = $quartet_name.".tar.gz";

		if (exists($complete_quartets_tarball{$quartet_name_tarball})) {
			$complete_quartets{$quartet_name}++;
			print {$quartet_output_file} $quartet;
		}
	}
	close($quartet_output_file);
}

my @taxa;
my @genes;
if ($input_is_mbsum) {

	# Unarchive input genes
	chomp(@genes = `tar xvf '$init_dir/$project_name/$archive' -C $mb_sum_dir 2>&1`);
	@genes = map { s/x //; $_ } @genes if ($os_name eq "darwin");

	die "No genes found in '$archive'.\n" if (!@genes);

	# Move into MrBayes output directory
	chdir($mb_sum_dir);

	# Remove subdirectories that may have been a part of the tarball
	my @dirs;
	my @gene_roots;
	foreach my $gene (@genes) {
		(my $gene_root = $gene) =~ s/.*\///;
		next if ($gene eq $gene_root);

		# Move mbsum files so they are no longer in subdirectories
		if (!-d $gene) {
			system("mv '$gene' '$gene_root'");
			push(@gene_roots, $gene_root);
		}
		else {
			push(@dirs, $gene);
			#splice(@genes, $index, 1);
		}
	}
	@genes = @gene_roots if (@gene_roots);

	# Remove any subdirectories that may have been in the input
	foreach my $dir (@dirs) {
		remove_tree($dir);
	}

	# Parse taxa present in each gene, determine which are shared across all genes
	my %taxa;
	foreach my $gene (@genes) {
		my @taxa = @{parse_mbsum_taxa($gene)};

		# Count taxa present
		foreach my $taxon (@taxa) {
			$taxa{$taxon}++;
		}
	}

	# Add taxa present in all genes to analysis
	foreach my $taxon (keys %taxa) {
		if ($taxa{$taxon} == scalar(@genes)) {
			push(@taxa, $taxon);
		}
	}

	# Archive genes
	system("tar", "czf", $mbsum_archive, @genes);
}
else {

	# Unarchive input genes
	chomp(@genes = `tar xvf '$init_dir/$archive' -C '$mb_out_dir' 2>&1`);
	@genes = map { s/x //; $_ } @genes if ($os_name eq "darwin");

	# Move into MrBayes output directory
	chdir($mb_out_dir);

	# Check that each gene has a log file
	my %taxa;
	foreach my $gene (@genes) {

		# Unzip a single gene
		chomp(my @mb_files = `tar xvf '$gene' 2>&1`);
		@mb_files = map { s/x //; $_ } @mb_files if ($os_name eq "darwin");

		# Locate the log file output by MrBayes
		my $log_file_name;
		foreach my $file (@mb_files) {
			if ($file =~ /\.log$/) {
				$log_file_name = $file;
				last;
			}
		}
		die "Could not locate log file for '$gene'.\n" if (!defined($log_file_name));

		# Parse log file for run information
		my $mb_log = parse_mb_log($log_file_name);

		# Check for taxa present
		my @taxa = @{$mb_log->{TAXA}};
		foreach my $taxon (@taxa) {
			$taxa{$taxon}++;
		}

		# Clean up
		unlink(@mb_files);
	}

	# Add taxa present in all genes to analysis
	foreach my $taxon (keys %taxa) {
		if ($taxa{$taxon} == scalar(@genes)) {
			push(@taxa, $taxon);
		}
	}
}

# Create list of possible quartets
my @quartets = combine(\@taxa, 4);

my $original_size = scalar(@quartets);

# Remove completed quartets
if (%complete_quartets) {
	foreach my $index (reverse(0 .. $#quartets)) {
		my $quartet = $quartets[$index];
		$quartet = join("--", @{$quartet});
		if (exists($complete_quartets{$quartet})) {
			splice(@quartets, $index, 1);
		}
	}
}

print "Found ".scalar(@taxa)." taxa shared across all genes in this archive, ".scalar(@quartets).
	  " of $original_size possible quartets will be run using output from ".scalar(@genes)." total genes.\n";

# Go back to working directory
chdir("..");

# Determine whether or not we need to run mbsum on the specified input
my $should_summarize = 1;
if (-e $mbsum_archive && $input_is_dir && !$input_is_mbsum) {
	chomp(my @sums = `tar tf '$mbsum_archive'`) || die "Something appears to be wrong with '$mbsum_archive'.\n";

	# Check that each gene has actually been summarized, if not redo the summaries
	if (scalar(@sums) != scalar(@genes)) {
		unlink($mbsum_archive);
	}
	else {
		$should_summarize = 0;
	}
}

$should_summarize = 0 if ($input_is_mbsum);

# Summarize MrBayes output if needed
if ($should_summarize) {
	# Run mbsum on each gene
	print "Summarizing MrBayes output for ".scalar(@genes)." genes.\n";

	my @pids;
	foreach my $gene (@genes) {

		# Wait until a CPU is available
		until(okay_to_run(\@pids)) {};

		my $pid;
		until (defined($pid)) { $pid = fork(); usleep(30000); }

		# The child fork
		if ($pid == 0) {
			#setpgrp();
			run_mbsum($gene);
			exit(0);
		}
		else {
			push(@pids, $pid);
		}
	}

	# Wait for all summaries to finish
	foreach my $pid (@pids) {
		waitpid($pid, 0);
	}
	undef(@pids);

	# Remove directory storing mb output
	remove_tree($mb_out_dir);

	# Archive and zip mb summaries
	chdir($mb_sum_dir);
	#system("tar", "czf", $mbsum_archive, glob("$archive_root_no_ext*.sum"));
	system("tar", "czf", $mbsum_archive, glob("*.sum"));
	system("cp", $mbsum_archive, "..");
	chdir("..");
}

die "\nAll quartets have already been completed.\n\n" if (!@quartets);

# Returns the external IP address of this computer
chomp(my $server_ip = `dig +short myip.opendns.com \@resolver1.opendns.com 2>&1`);
if ($server_ip !~ /(?:[0-9]{1,3}\.){3}[0-9]{1,3}/) {
	print "Could not determine external IP address, only local clients will be created.\n";
	$server_ip = "127.0.0.1";
}

# Initialize a server
my $sock = IO::Socket::INET->new(
	LocalPort  => $port,
	Blocking   => 0,
	Reuse      => 1,
	Listen     => SOMAXCONN,
	Proto      => 'tcp')
or die "Could not create server socket: $!.\n";
$sock->autoflush(1);

print "Job server successfully created.\n";

# Should probably do this earlier
# Determine server hostname and add to machines if none were specified by the user
chomp(my $server_hostname = `hostname`);
if (scalar(@machines) == 0) {
	push(@machines, $server_hostname);
	$machines{$server_hostname} = "127.0.0.1";
}
elsif (scalar(@machines) == 1) {
	# Check if the user input only the local machine in the config
	if ($machines{$machines[0]} eq $server_ip) {
		$machines{$machines[0]} = "127.0.0.1";
	}
}

my @pids;
foreach my $machine (@machines) {

	# Fork and create a client on the given machine
	my $pid = fork();
	if ($pid == 0) {
		close(STDIN);
		close(STDOUT);
		close(STDERR);

		(my $script_name = $script_path) =~ s/.*\///;

		# Send over/copy files depending on where analyses will be run
		if ($machines{$machine} ne "127.0.0.1" && $machines{$machine} ne $server_ip) {

			# Send MrBayes summaries to remote machines
			if ($input_is_mbsum) {
				system("scp", "-q", "$mb_sum_dir/$mbsum_archive", $machine.":/tmp");
			}
			else {
				system("scp", "-q", $mbsum_archive, $machine.":/tmp");
			}

			# Send this script to the machine
			system("scp", "-q", $script_path, $machine.":/tmp");

			# Send BUCKy executable to the machine
			system("scp", "-q", $bucky, $machine.":/tmp");

			# Execute this perl script in client mode on the given machine
			# -tt forces pseudo-terminal allocation and lets us stop remote processes
			exec("ssh", "-tt", $machine, "perl", "/tmp/$script_name", $mbsum_archive, "--server-ip=$server_ip:$port");
		}
		else {
			# Send this script to the machine
			system("cp", $script_path, "/tmp");

			# Send BUCKy executable to the machine
			system("cp", $bucky, "/tmp");

			# Execute this perl script in client mode
			exec("perl", "/tmp/$script_name", "$init_dir/$project_name/$mb_sum_dir/$mbsum_archive", "--server-ip=127.0.0.1:$port");
		}

		exit(0);
	}
	else {
		push(@pids, $pid);
	}
}

# Move into mbsum directory
chdir($mb_sum_dir);

my $select = IO::Select->new($sock);

# Stores which job is next in queue
my $job_number = 0;

# Number of open connections to a client
my $total_connections;

# Number of complete jobs (necessary?)
my $complete_count = 0;

# Number of connections server has closed
my $closed_connections = 0;

# Minimum number of connections server should expect
my $starting_connections = scalar(@machines);

my $time = time();
my $num_digits = get_num_digits({'NUMBER' => scalar(@quartets)});

# Begin the server's job distribution
my %complete_queue;
my $complete_queue_max_size = 100;
while ((!defined($total_connections) || $closed_connections != $total_connections) || $total_connections < $starting_connections) {
	# Contains handles to clients which have sent information to the server
	my @clients = $select->can_read(0);

	# Free up CPU by sleeping for 10 ms
	usleep(10000);

	# Reap any children that we can
	foreach my $pid (@pids) {
		waitpid($pid, WNOHANG);
	}

	# Handle each ready client individually
	CLIENT: foreach my $client (@clients) {

		if (scalar(keys %complete_queue) > $complete_queue_max_size) {
			my $cwd = abs_path(".");
			chdir($mb_sum_dir);
			dump_quartets(\%complete_queue);
			chdir($cwd);
			undef(%complete_queue);
		}

		# Client requesting new connection
		if ($client == $sock) {
			$total_connections++;
			$select->add($sock->accept());
		}
		else {

			# Get client's message
			my $response = <$client>;
			next if (not defined($response)); # a response should never actually be undefined

			# Client wants to send us a file
			if ($response =~ /SEND_FILE: (.*)/) {
				my $file_name = $1;
				receive_file({'FILE_PATH' => $file_name, 'FILE_HANDLE' => $client});
			}

			# Client has finished a job
			if ($response =~ /DONE '(.*)' '(.*)' \|\|/) {
				$complete_count++;
				printf("  Analyses complete: %".$num_digits."d/%d.\r", $complete_count, scalar(@quartets));

				my $completed_quartet = $1;
				my $quartet_statistics = $2;

				#push(@unlink, glob("$completed_quartet*"));
				push(@unlink, glob(abs_path(".")."/$completed_quartet*"));
				$complete_queue{$completed_quartet} = $quartet_statistics;
			}

			# Client wants a new job
			if ($response =~ /NEW: (.*)/) {
				my $client_ip = $1;

				# Check if jobs remain in the queue
				if ($job_number < scalar(@quartets)) {
					printf("\n  Analyses complete: %".$num_digits."d/%d.\r", 0, scalar(@quartets)) if ($job_number == 0);

					my $quartet = join("--", @{$quartets[$job_number]});

					# Tell local clients to move into mbsum directory
					if ($client_ip eq $server_ip) {
						print {$client} "CHDIR: ".abs_path(".")."\n";
					}

					# Invocation changes if we want to use a prior of infinity
					if ($alpha =~ /(^inf(inity)?)/i) {
						print {$client} "NEW: '$quartet' '--use-independence-prior -n $ngen'\n";
					}
					else {
						print {$client} "NEW: '$quartet' '-a $alpha -n $ngen'\n";
					}
					$job_number++;
				}
				else {
					# Client has asked for a job, but there are none remaining
					print {$client} "HANGUP\n";
					$select->remove($client);
					$client->close();
					$closed_connections++;
					next CLIENT;
				}
			}
		}
	}
}

# Dump remaining quartets
dump_quartets(\%complete_queue);

# Wait until all children have completed
foreach my $pid (@pids) {
	waitpid($pid, 0);
}

print "\n  All connections closed.\n";
print "Total execution time: ", sec2human(time() - $time), ".\n\n";

rmdir("$initial_directory/$project_name/$mb_sum_dir");

sub client {
	my ($opt_name, $address) = @_;

	my ($server_ip, $port) = split(":", $address);

	chdir("/tmp");
	my $bucky = "/tmp/bucky";

	my $pgrp = $$;
	setpgrp();

	# Determine this host's IP
	chomp(my $ip = `dig +short myip.opendns.com \@resolver1.opendns.com`);

	# Set IP to localhost if we don't have internet
	if ($ip !~ /(?:[0-9]{1,3}\.){3}[0-9]{1,3}/) {
		$ip = "127.0.0.1";
	}

	# Determine file name of mbsum archive the client should use
	my @ARGV = split(/\s+/, $invocation);
	shift(@ARGV); shift(@ARGV); # remove "perl" and "bucky.pl"
	my $mbsum_archive = shift(@ARGV);

	# Extract files from mbsum archive
	my @sums;
	if ($mbsum_archive =~ /\//) {
		chomp(@sums = `tar tf '$mbsum_archive'`);
	}
	else {
		chomp(@sums = `tar xvf '$mbsum_archive' 2>&1`);
		@sums = map { s/x //; $_ } @sums if ($os_name eq "darwin");
	}

	# Spawn more clients
	my @pids;
	# A slightly modification in order to limit forks to 10
	# my $total_forks = get_free_cpus();
	my $total_forks = 10;
	if ($total_forks > 1) {
		foreach my $fork (1 .. $total_forks - 1) {
			my $pid = fork();
			if ($pid == 0) {
				last;
			}
			else {
				push(@pids, $pid);
			}
		}
	}

	# The name of the quartet we are working on
	my $quartet;

	# Stores names of unneeded files
	my @unlink;

	# Change signal handling so killing the server kills these processes and cleans up
	$SIG{HUP} = sub { unlink($0, $bucky); unlink(@sums); unlink($mbsum_archive); kill -15, $$; };
	$SIG{TERM} = sub { unlink(glob("$quartet*")) if (defined($quartet)); exit(0); };

	# Connect to the server
	my $sock = new IO::Socket::INET(
		PeerAddr  => $server_ip.":".$port,
		Proto => 'tcp')
	or exit(0);
	$sock->autoflush(1);

	print {$sock} "NEW: $ip\n";
	while (chomp(my $response = <$sock>)) {

		if ($response =~ /CHDIR: (.*)/) {
			chdir($1);
		}
		elsif ($response =~ /NEW: '(.*)' '(.*)'/) {
			$quartet = $1;
			my $bucky_settings = $2;

			# If client is local this needs to be defined now
			#chomp(@sums = `tar tf $mbsum_archive`) if (!@sums);

			# Create prune tree file contents required for BUCKy
			my $count = 0;
			my $prune_tree_output = "translate\n";
			foreach my $member (split("--", $quartet)) {
				$count++;
				$prune_tree_output .= " $count $member";
				if ($count == 4) {
					$prune_tree_output .= ";\n";
				}
				else {
					$prune_tree_output .= ",\n";
				}
			}

			# Write prune tree file
			my $prune_file_path = "$quartet-prune.txt";

			push(@unlink, $prune_file_path);

			open(my $prune_file, ">", $prune_file_path);
			print {$prune_file} $prune_tree_output;
			close($prune_file);

			push(@unlink, "$quartet.input", "$quartet.out", "$quartet.cluster", "$quartet.concordance", "$quartet.gene");

			# Run BUCKy on specified quartet
			system($bucky, split(" ", $bucky_settings), "-cf", 0, "-o", $quartet, "-p", $prune_file_path, @sums);
			unlink($prune_file_path);

			# Zip and tarball the results
			my @results = glob($quartet."*");
			my $quartet_archive_name = "$quartet.tar.gz";

			# Open concordance file and parse out the three possible resolutions
			my $num_genes = get_used_genes("$quartet.out");
			my $split_info = parse_concordance_output("$quartet.concordance", $num_genes);

			# Archive and compress results
			system("tar", "czf", $quartet_archive_name, @results);

			# Send the results back to the server if this is a remote client
			if ($server_ip ne "127.0.0.1" && $server_ip ne $ip) {
				send_file({'FILE_PATH' => $quartet_archive_name, 'FILE_HANDLE' => $sock});
				unlink($quartet_archive_name);
			}

			unlink(@unlink);
			undef(@unlink);

			print {$sock} "DONE '$quartet_archive_name' '$split_info' || NEW: $ip\n";
		}
		elsif ($response eq "HANGUP") {
			last;
		}
	}

	# Have initial client wait for all others to finish and clean up
	if ($$ == $pgrp) {
		foreach my $pid (@pids) {
			waitpid($pid, 0);
		}
		unlink($0, $bucky);
		unlink(@sums, $mbsum_archive);
		#if ($server_ip ne $ip) {
		#	unlink(@sums, $mbsum_archive);
		#}
		#else {
		#	unlink(@sums);
		#}
	}

	exit(0);
}

sub dump_quartets {
	my $quartets = shift;

	my $pid;
	until (defined($pid)) { $pid = fork(); usleep(30000); }
	if ($pid == 0) {

		my @completed_quartets = keys %{$quartets};
		my @quartet_statistics = values %{$quartets};

		$SIG{TERM} = sub { close(STDIN); close(STDOUT); close(STDERR); unlink(@unlink); exit(0); };

		# Check if this is the first to complete, if so we must create CF output file
		if (!-e "../$quartet_output") {
			open(my $quartet_output_file, ">", "../$quartet_output");
			print {$quartet_output_file} "taxon1,taxon2,taxon3,taxon4,CF12_34,CF12_34_lo,CF12_34_hi,CF13_24,CF13_24_lo,CF13_24_hi,CF14_23,CF14_23_lo,CF14_23_hi,ngenes\n";
			foreach my $quartet (@quartet_statistics) {
				print {$quartet_output_file} $quartet,"\n";
			}
			close($quartet_output);
		}
		else {
			# Obtain a file lock on archive so another process doesn't simultaneously try to add to it
			open(my $quartet_output_file, ">>", "../$quartet_output");
			flock($quartet_output_file, LOCK_EX) || die "Could not lock '$quartet_output': $!.\n";
			seek($quartet_output_file, 0, SEEK_END) || die "Could not seek '$quartet_output': $!.\n";

			# Add completed gene
			#print {$quartet_output_file} @quartet_statistics,"\n";
			foreach my $quartet (@quartet_statistics) {
				print {$quartet_output_file} $quartet,"\n";
			}

			# Release lock
			flock($quartet_output_file, LOCK_UN) || die "Could not unlock '$quartet_output': $!.\n";
			close($quartet_output_file);
		}

		# Check if this is the first to complete, if so we must create BUCKy tarball
		# if (!-e "../$bucky_archive") {
		# 	system("tar", "cf", "../$bucky_archive", @completed_quartets);
		# 	unlink(@completed_quartets);
		# }
		# else {
		# 	my @unlink;
		# 	foreach my $completed_quartet (@completed_quartets) {
		# 		(my $quartet = $completed_quartet) =~ s/\.tar\.gz//;
		# 		push(@unlink, glob("$quartet*"));
		# 	}
		# 	#$SIG{TERM} = sub { close(STDIN); close(STDOUT); close(STDERR); unlink(glob("$quartet*")); exit(0); };
		# 	#$SIG{TERM} = sub { close(STDIN); close(STDOUT); close(STDERR); unlink(@unlink); exit(0); };

		# 	# Obtain a file lock on archive so another process doesn't simultaneously try to add to it
		# 	open(my $bucky_archive_file, "<", "../$bucky_archive");
		# 	flock($bucky_archive_file, LOCK_EX) || die "Could not lock '$bucky_archive': $!.\n";

		# 	# Add completed gene
		# 	system("tar", "rf", "../$bucky_archive", @completed_quartets);
		# 	unlink(@completed_quartets);

		# 	# Release lock
		# 	flock($bucky_archive_file, LOCK_UN) || die "Could not unlock '$bucky_archive': $!.\n";
		# 	close($bucky_archive_file);
		# }

		unlink(@unlink);
		undef(@unlink);

		exit(0);
	}
	else {
		push(@pids, $pid);
	}

#	unlink(@unlink);
#	undef(@unlink);

	return;
}

sub get_used_genes {
	my $file_name = shift;

	# Number of genes actually used by BUCKy
	my $num_genes;

	# Open up the BUCKy output file
	open(my $bucky_out, "<", $file_name);
	while (my $line = <$bucky_out>) {
		if ($line =~ /Read (\d+) genes with a total of/) {
			$num_genes = $1;
			last;
		}
	}
	close($bucky_out);

	die "Error determining number of genes used by BUCKy ($file_name).\n" if (!defined($num_genes));

	return $num_genes;
}

sub parse_concordance_output {
	my ($file_name, $ngenes) = @_;

	my @taxa;
	my %splits;

	# Open up the specified output file
	open(my $concordance_file, "<", $file_name);

	my $split;
	my $in_translate;
	my $in_all_splits;
	while (my $line = <$concordance_file>) {

		# Parse the translate table
		if ($in_translate) {
			if ($line =~ /\d+ (.*)([,;])/) {
				my $taxon = $1;
				my $line_end = $2;
				push(@taxa, $taxon);

				$in_translate = 0 if ($line_end eq ';');
			}
		}

		# Parse the split information
		if ($in_all_splits) {

			# Set the split we are parsing information from
			if ($line =~ /^(\{\S+\})/) {
				my $current_split = $1;
				if ($current_split eq "{1,4|2,3}") {
					$split = "14|23";
				}
				elsif ($current_split eq "{1,3|2,4}") {
					$split = "13|24";
				}
				elsif ($current_split eq "{1,2|3,4}") {
					$split = "12|34";
				}
			}

			# Parse mean number of loci for split
			if ($line =~ /=\s+(\S+) \(number of loci\)/) {
				$splits{$split}->{"CF"} = $1 / $ngenes;
			}

			# Parse 95% confidence interval
			if ($line =~ /95% CI for CF = \((\d+),(\d+)\)/) {
				#$splits{$split}->{"95%_CI"} = "(".($1 / $ngenes).",".($2 / $ngenes).")";
				$splits{$split}->{"95%_CI_LO"} = ($1 / $ngenes);
				$splits{$split}->{"95%_CI_HI"} = ($2 / $ngenes);
			}
		}

		$in_translate++ if ($line =~ /^translate/);
		$in_all_splits++ if ($line =~ /^All Splits:/);
	}

	# Concat taxa names together
	my $return = join(",", @taxa);
	$return .= ",";

	# Concat split proportions with their 95% CI to return
	if (exists($splits{"12|34"})) {
		#$return .= $splits{"12|34"}->{"CF"}.$splits{"12|34"}->{"95%_CI"}."\t";
		$return .= $splits{"12|34"}->{"CF"}.",".$splits{"12|34"}->{"95%_CI_LO"}.",".$splits{"12|34"}->{"95%_CI_HI"}.",";
	}
	else {
		#$return .= "0(0,0)\t";
		$return .= "0,0,0,";
	}

	if (exists($splits{"13|24"})) {
		#$return .= $splits{"13|24"}->{"CF"}.$splits{"13|24"}->{"95%_CI"}."\t";
		$return .= $splits{"13|24"}->{"CF"}.",".$splits{"13|24"}->{"95%_CI_LO"}.",".$splits{"13|24"}->{"95%_CI_HI"}.",";
	}
	else {
		#$return .= "0(0,0)\t";
		$return .= "0,0,0,";
	}

	if (exists($splits{"14|23"})) {
		#$return .= $splits{"14|23"}->{"CF"}.$splits{"14|23"}->{"95%_CI"};
		$return .= $splits{"14|23"}->{"CF"}.",".$splits{"14|23"}->{"95%_CI_LO"}.",".$splits{"14|23"}->{"95%_CI_HI"};
	}
	else {
		#$return .= "0(0,0)";
		$return .= "0,0,0";
	}

	# Append number of genes used
	$return .= ",$ngenes";

	return $return;
}

sub parse_mb_log {
	my $log_file_name = shift;

	# Open the specified mb log file and parse useful information from it

	my @taxa;
	my $ngen;
	my $nruns;
	my $burnin;
	my $burninfrac;
	my $samplefreq;
	open(my $log_file, "<", $log_file_name);
	while (my $line = <$log_file>) {
		if ($line =~ /Taxon\s+\d+ -> (\S+)/) {
			push(@taxa, $1);
		}
		elsif ($line =~ /Setting number of runs to (\d+)/) {
			$nruns = $1;
		}
		elsif ($line =~ /Setting burnin fraction to (\S+)/) {
			$burninfrac = $1;
		}
		elsif ($line =~ /Setting chain burn-in to (\d+)/) {
			$burnin = $1;
		}
		elsif ($line =~ /Setting sample frequency to (\d+)/) {
			$samplefreq = $1;
		}
		elsif ($line =~ /Setting number of generations to (\d+)/) {
			$ngen = $1;
		}
	}
	close($log_file);

	if (defined($burnin)) {
		#return {'SAMPLEFREQ' => $samplefreq, 'NRUNS' => $nruns, 'BURNIN' => $burnin };
		return {'SAMPLEFREQ' => $samplefreq, 'NRUNS' => $nruns, 'BURNIN' => $burnin, 'TAXA' => \@taxa};
	}
	else {
		return {'NGEN' => $ngen, 'NRUNS' => $nruns, 'BURNINFRAC' => $burninfrac,
				'SAMPLEFREQ' => $samplefreq, 'TAXA' => \@taxa};
	}
}

sub parse_mbsum_taxa {
	my $mbsum_file_name = shift;

	# Open the specified mbsum file and parse its taxa list

	my @taxa;
	my $in_translate_block = 0;
	open(my $mbsum_file, "<", $mbsum_file_name);
	while (my $line = <$mbsum_file>) {
		$in_translate_block++ if ($line =~ /translate/);

		if ($in_translate_block == 1 && $line =~ /\d+\s+([^,;]+)/) {
			push(@taxa, $1);
		}
	}
	close($mbsum_file);

	# Check if there were multiple translate blocks in the file which is indicative of an error with the creation of the file
	die "Something is amiss with '$mbsum_file_name', multiple translate blocks ($in_translate_block) were detected when there should only be one.\n" if ($in_translate_block > 1);

	# Check that we actually parsed something, otherwise input is improperly formatted/not mbsum output
	die "No taxa parsed for file '$mbsum_file_name', does '$archive' actually contain mbsum output?.\n" if (!@taxa);

	return \@taxa;
}

sub run_mbsum {
	my $tarball = shift;

	# Unzip specified tarball
	chomp(my @mb_files = `tar xvf '$mb_out_dir$tarball' -C '$mb_out_dir' 2>&1`);
	@mb_files = map { s/x //; $_ } @mb_files if ($os_name eq "darwin");

	# Determine name for this partition's log file
	my $log_file_name;
	foreach my $file (@mb_files) {
		if ($file =~ /\.log$/) {
			$log_file_name = $file;
			last;
		}
	}
	die "Could not locate log file for '$tarball'.\n" if (!defined($log_file_name));

	# Parse log file
	my $mb = parse_mb_log("$mb_out_dir$log_file_name");

	(my $gene_name = $tarball) =~ s/\.nex\.tar\.gz//;

	# Determine number of trees mbsum should remove from each file

	my $trim;
	if ($mb->{BURNIN}) {
		$trim = $mb->{BURNIN} + 1;
	}
	else {
		$trim = ((($mb->{NGEN} / $mb->{SAMPLEFREQ}) * $mb->{NRUNS} * $mb->{BURNINFRAC}) / $mb->{NRUNS}) + 1;
	}

	# Summarize gene's tree files
	system("$mbsum '$mb_out_dir$gene_name.'*.t -n $trim -o '$mb_sum_dir$gene_name.sum' >/dev/null 2>&1");

	# Clean up extracted files
	chdir($mb_out_dir);
	unlink(@mb_files);
}

sub okay_to_run {
	my $pids = shift;

	# Free up a CPU by sleeping for 10 ms
	usleep(10000);

	my $current_forks = scalar(@{$pids});
	foreach my $index (reverse(0 .. $current_forks - 1)) {
		next if ($index < 0);

		my $pid = @{$pids}[$index];
		my $wait = waitpid($pid, WNOHANG);

		# Successfully reaped child
		if ($wait > 0) {
			$current_forks--;
			#splice(@pids, $index, 1);
			splice(@{$pids}, $index, 1);
		}
	}

	return ($current_forks < $max_forks);
}

sub hashsum {
	my $settings = shift;

	my $file_path = $settings->{'FILE_PATH'};

	open(my $file, "<", $file_path) or die "Couldn't open file '$file_path': $!.\n";
	my $md5 = Digest::MD5->new;
	my $md5sum = $md5->addfile(*$file)->hexdigest;
	close($file);

	return $md5sum;
}

sub send_file {
	my $settings = shift;

	my $file_path = $settings->{'FILE_PATH'};
	my $file_handle = $settings->{'FILE_HANDLE'};

	my $hash = hashsum({'FILE_PATH' => $file_path});
	print {$file_handle} "SEND_FILE: $file_path\n";

	open(my $file, "<", $file_path) or die "Couldn't open file '$file_path': $!.\n";
	while (<$file>) {
		print {$file_handle} $_;
	}
	close($file);

	print {$file_handle} " END_FILE: $hash\n";

	# Stall until we know status of file transfer
	while (defined(my $response = <$file_handle>)) {
		chomp($response);

		last if ($response eq "TRANSFER_SUCCESS");
		die "Unsuccessful file transfer, checksums did not match.\n" if ($response eq "TRANSFER_FAILURE");
	}
}

sub receive_file {
	my $settings = shift;

	my $file_path = $settings->{'FILE_PATH'};
	my $file_handle = $settings->{'FILE_HANDLE'};

	my $check_hash;
	open(my $file, ">", $file_path);
	while (<$file_handle>) {
		if ($_ =~ /(.*) END_FILE: (\S+)/) {
			print {$file} $1;
			$check_hash = $2;
			last;
		}
		else {
			print {$file} $_;
		}
	}
	close($file);

	# Use md5 hashsum to make sure transfer worked
	my $hash = hashsum({'FILE_PATH' => $file_path});
	if ($hash ne $check_hash) {
		die "Unsuccessful file transfer, checksums do not match.\n'$hash' - '$check_hash'\n"; # hopefully this never pops up
		print {$file_handle} "TRANSFER_FAILURE\n"
	}

	else {
		print {$file_handle} "TRANSFER_SUCCESS\n";
	}
}

sub INT_handler {
	#dump_quartets(\%complete_queue);

	unlink(@unlink);

	# Kill ssh process(es) spawned by this script
	foreach my $pid (@pids) {
		#kill(-9, $pid);
		#kill(15, $pid);
		kill(1, $pid);
	}

	# Move into gene directory
	chdir("$initial_directory/$project_name");

	rmdir($mb_out_dir);

	# Try to delete directory once per second for five seconds, if it can't be deleted print an error message
	# I've found this method is necessary for analyses performed on AFS drives
	my $count = 0;
	until (!-e $mb_sum_dir || $count == 5) {
		$count++;

		remove_tree($mb_sum_dir, {error => \my $err});
		sleep(1);
	}
	#logger("Could not clean all files in './$gene_dir/'.") if ($count == 5);
	print "Could not clean all files in './$mb_sum_dir/'.\n" if ($count == 5);

	exit(0);
}

sub clean_up {
	my $settings = shift;

	my $remove_dirs = $settings->{'DIRS'};
	my $current_dir = getcwd();

#	chdir($alignment_root);
#	unlink(glob($gene_dir."$alignment_name*"));
#	#unlink($server_check_file) if (defined($server_check_file));
#
#	if ($remove_dirs) {
#		rmdir($gene_dir);
#	}
	chdir($current_dir);
}

sub get_num_digits {
	my $settings = shift;

	my $number = $settings->{'NUMBER'};

	my $digits = 1;
	while (floor($number / 10) != 0) {
		$number = floor($number / 10);
		$digits++;
	}

	return $digits;
}

sub sec2human {
	my $secs = shift;

	# Constants
	my $secs_in_min = 60;
	my $secs_in_hour = 60 * 60;
	my $secs_in_day = 24 * 60 * 60;

	$secs = int($secs);

	return "0 seconds" if (!$secs);

	# Calculate units of time
	my $days = int($secs / $secs_in_day);
	my $hours = ($secs / $secs_in_hour) % 24;
	my $mins = ($secs / $secs_in_min) % 60;
	$secs = $secs % 60;

	# Format return nicely
	my $time;
	if ($days) {
		$time .= ($days != 1) ? "$days days, " : "$days day, ";
	}
	if ($hours) {
		$time .= ($hours != 1) ? "$hours hours, " : "$hours hour, ";
	}
	if ($mins) {
		$time .= ($mins != 1) ? "$mins minutes, " : "$mins minute, ";
	}
	if ($secs) {
		$time .= ($secs != 1) ? "$secs seconds " : "$secs second ";
	}
	else {
		# Remove comma
		chop($time);
	}
	chop($time);

	return $time;
}

sub get_free_cpus {

	my $os_name = $^O;

	# Returns a two-member array containing CPU usage observed by top,
	# top is run twice as its first output is usually inaccurate
	my @percent_free_cpu;
	if ($os_name eq "darwin") {
		# Mac OS
		chomp(@percent_free_cpu = `top -i 1 -l 2 | grep "CPU usage"`);
	}
	else {
		# Linux
		chomp(@percent_free_cpu = `top -b -n2 -d0.05 | grep "Cpu(s)"`);
	}

	my $percent_free_cpu = pop(@percent_free_cpu);

	if ($os_name eq "darwin") {
		# Mac OS
		$percent_free_cpu =~ s/.*?(\d+\.\d+)%\s+id.*/$1/;
	}
	else {
		# linux
		$percent_free_cpu =~ s/.*?(\d+\.\d)\s*%?ni,\s*(\d+\.\d)\s*%?id.*/$1 + $2/; # also includes %nice as free
		$percent_free_cpu = eval($percent_free_cpu);
	}

	my $total_cpus;
	if ($os_name eq "darwin") {
		# Mac OS
		$total_cpus = `sysctl -n hw.ncpu`;
	}
	else {
		# linux
		$total_cpus = `grep --count 'cpu' /proc/stat` - 1;
	}

	my $free_cpus = ceil($total_cpus * $percent_free_cpu / 100);

	if ($free_cpus == 0 || $free_cpus !~ /^\d+$/) {
		$free_cpus = 1; # assume that at least one cpu can be used
	}

	return $free_cpus;
}

sub run_cmd {
	my $command = shift;

	my $return = system($command);

	if ($return) {
		logger("'$command' died with error: '$return'.\n");
		#kill(2, $parent_pid);
		exit(0);
	}
}

sub check_path_for_exec {
	my $exec = shift;

	my $path = $ENV{PATH}.":."; # include current directory as well
	my @path_dirs = split(":", $path);

	my $exec_path;
	foreach my $dir (@path_dirs) {
		$dir .= "/" if ($dir !~ /\/$/);
		$exec_path = abs_path($dir.$exec) if (-e $dir.$exec && -x $dir.$exec && !-d $dir.$exec);
	}

	die "Could not find the following executable: '$exec'. This script requires this program in your path.\n" if (!defined($exec_path));
	return $exec_path;
}

# I grabbed this from StackOverflow so that's why its style is different #DontFixWhatIsntBroken:
# https://stackoverflow.com/questions/10299961/in-perl-how-can-i-generate-all-possible-combinations-of-a-list
sub combine {
	my ($list, $n) = @_;
	die "Insufficient list members" if ($n > @$list);

	return map [$_], @$list if ($n <= 1);

	my @comb;

	for (my $i = 0; $i+$n <= @$list; ++$i) {
		my $val  = $list->[$i];
		my @rest = @$list[$i + 1 .. $#$list];
		push(@comb, [$val, @$_]) for combine(\@rest, $n - 1);
	}

	return @comb;
}

sub check_bucky_version {
	my $bucky = shift;

	print "\nChecking for BUCKy version >= 1.4.4...\n";

	# Run BUCKy with --version and extract version info
	chomp(my @version_info = grep { /BUCKy version/ } `$bucky --version`);
	my $version_info = shift(@version_info);

	die "  Could not determine BUCKy version.\n" if (!defined($version_info));

	# Get the actual version number
	my $version;
	if ($version_info =~ /BUCKy\s+version\s+([^\s|,]+)/) {
		$version = $1;
	}
	die "  Could not determine BUCKy version.\n" if (!defined($version_info));

	# Version testing
	#my @versions = qw/2.1b 1.2.1000 1 0.9.8 2.3 1.4.5 1.4.3 1.4 1.500.2 1.4.4 1.4.4.1/;
	#foreach my $version (@versions) {

	print "  BUCKy version: $version.\n";

	# Split version number based on period delimiters
	my @version_parts = split(/\./, $version);

	# Future proofing if letters are ever used (we won't ever care about them)
	@version_parts = map { s/[a-zA-Z]+//g; $_ } @version_parts;
	die "  Error determining BUCKy version.\n" if (!@version_parts);

	# Check that version is >= 1.4.4
	if (defined($version_parts[0]) && $version_parts[0] > 1) {
		print "  BUCKy version check passed.\n";
		return;
	}
	elsif ((defined($version_parts[0]) && $version_parts[0] == 1) && (defined($version_parts[1]) && $version_parts[1] > 4)) {
		print "  BUCKy version check passed.\n";
		return;
	}
	elsif (((defined($version_parts[0]) && $version_parts[0] == 1) && (defined($version_parts[1]) && $version_parts[1] == 4))
		  && defined($version_parts[2]) && $version_parts[2] >= 4) {
		print "  BUCKy version check passed.\n";
		return;
	}
	else {
		die "  BUCKy version check failed, update to version >= 1.4.4.\n";
	}

	#print "\n";
	#}
}

sub usage {
	return "Usage: bucky.pl [MRBAYES TARBALL]\n";
}

sub help {
print <<EOF;
@{[usage()]}
Parallel execution of BUCKy on all possible quartets in a given alignment

  -a, --alpha            value of alpha to use when running BUCKy, use "infinity" for infinity (default: 1)
  -n, --ngen             number of generations to run BUCKy MCMC chain (default: 1000000 generations)
  -o, --out-dir          name of the directory to store output files in (default: "bucky-" + Unix time of script invocation)
  -T, --n-threads        the number of forks ALL hosts running analyses can use concurrently (default: current number of free CPUs)
  -s, --no-mbsum         informs the script that the input is a tarball containing output already parsed by mbsum
  --machine-file         file name containing hosts to ssh onto and perform analyses on, passwordless login MUST be enabled
                         for each host specified in this file
  --port                 specifies the port to utilize on the server (Default: 10003)
  -h, --help             display this help and exit
  --usage                display proper script invocation format

Examples:
  perl bucky.pl align.mb.tar --machine-file hosts.txt     runs BUCKy using computers specified in hosts.txt using MrBayes output
                                                          stored in align.mb.tar

Mail bug reports and suggestions to <noah.stenz.github\@gmail.com>
EOF
exit(0);
}