use warnings;
use Cwd;
use File::Basename;
# Refuse to run unless at least a log file and one command word were given.
die "usage: ssh.pl log-file command-line arguments..." if @ARGV < 2;
# ---------------------------------------------------------------------------
# Command-line parsing: leading qsub-style options and the optional job-array
# specifier.
#
# Sets the package globals used by the rest of the script:
#   $jobname   - placeholder name of the job array (e.g. "JOB"); left
#                undefined when no job specifier was given
#   $jobstart  - first job index (defaults to 1)
#   $jobend    - last job index, inclusive (defaults to 1)
#   $qsub_opts - the leading "-..." options that were consumed; they are
#                accepted but ignored, and reported in a warning
# ---------------------------------------------------------------------------

# Parse a job-array specifier of the form NAME=START:END or NAME=N.
#
# Returns the list (name, start, end) when $spec is a job specifier and the
# empty list when it is not one (or is undefined).  Dies when a START:END
# range is reversed or when START is not strictly positive.  The message
# quotes $spec itself, so it stays correct no matter what the caller has
# done to @ARGV in the meantime.
sub _parse_job_spec {
    my ($spec) = @_;
    return () unless defined $spec;

    if ($spec =~ m/^(\w+)=(\d+):(\d+)$/) {
        my ($name, $start, $end) = ($1, $2, $3);
        if ($start > $end) {
            die "ssh.pl: invalid job range $spec";
        }
        if ($start <= 0) {
            die "ssh.pl: invalid job range $spec, start must be strictly positive (this is required for GridEngine compatibility)";
        }
        return ($name, $start, $end);
    }
    if ($spec =~ m/^(\w+)=(\d+)$/) {
        # Single-index form: the one job is both the first and the last.
        return ($1, $2, $2);
    }
    return ();
}

$jobstart = 1;
$jobend = 1;
$qsub_opts = "";

if (@ARGV > 0) {
    # Consume leading "-switch [value]" options.  At least one argument is
    # always left behind so an option can never swallow the log file.
    while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
        my $switch = shift @ARGV;
        if ($switch eq "-V") {
            # -V is the only switch that takes no value.
            $qsub_opts .= "-V ";
        } else {
            my $option = shift @ARGV;
            $qsub_opts .= "$switch $option ";
            # -pe takes two values (environment name and slot count).
            if ($switch eq "-pe" && @ARGV > 0) {
                my $option2 = shift @ARGV;
                $qsub_opts .= "$option2 ";
            }
        }
    }

    if (my @spec = _parse_job_spec($ARGV[0])) {
        ($jobname, $jobstart, $jobend) = @spec;
        shift @ARGV;
    } elsif (defined $ARGV[0] && $ARGV[0] =~ m/.+\=.*\:.*$/) {
        # Looks like an attempted job specifier that did not parse.
        print STDERR "Warning: suspicious first argument to ssh.pl: $ARGV[0]\n";
    }
}

if ($qsub_opts ne "") {
    print STDERR "Warning: ssh.pl ignoring options \"$qsub_opts\"\n";
}
# ---------------------------------------------------------------------------
# Load the list of worker machines into the package global @machines.
#
# The file .queue/machines (relative to the current directory) must hold one
# machine name per line; empty lines are skipped.  The script exits with
# status 1 if the file cannot be opened and dies on a malformed line, an
# invalid machine name, or an empty list.
# ---------------------------------------------------------------------------
{
    my $fh;
    if (!open($fh, "<", ".queue/machines")) {
        print STDERR "ssh.pl: expected the file .queue/machines to exist.\n";
        exit(1);
    }
    @machines = ();
    while (my $line = <$fh>) {
        # chomp, not chop: a final line without a trailing newline must not
        # lose the last character of its machine name.
        chomp $line;
        next if $line eq "";

        my @fields = split(" ", $line);
        if (@fields != 1) {
            die "ssh.pl: bad line '$line' in .queue/machines.";
        }
        # Anchored at both ends so the whole name is validated, not just a
        # prefix of it; the name is later handed to ssh.
        if ($fields[0] !~ m/^[a-z0-9\.\-]+\z/) {
            die "ssh.pl: invalid machine name '$fields[0]'";
        }
        push @machines, $fields[0];
    }
    close($fh);
    if (@machines == 0) { die "ssh.pl: no machines listed in .queue/machines"; }
}
# ---------------------------------------------------------------------------
# Take the log-file name off the argument list; whatever remains in @ARGV is
# the command to run.
# ---------------------------------------------------------------------------
$logfile = shift @ARGV;

# The initial argument-count check happens before options and the job
# specifier are consumed, so re-check here that both a log file and a command
# are actually left.
if (!defined $logfile || @ARGV == 0) {
    die "usage: ssh.pl log-file command-line arguments...";
}

# A job array with more than one job must have the job placeholder in the
# log-file name, otherwise every job would write to the same file.
if (defined $jobname && $logfile !~ m/$jobname/ &&
    $jobend > $jobstart) {
    print STDERR "ssh.pl: you are trying to run a parallel job but "
        . "you are putting the output into just one log file ($logfile)\n";
    exit(1);
}
# ---------------------------------------------------------------------------
# Compute $offset, a number added to the job index when choosing a machine.
# It is the sum of every purely numeric dot-separated field in the base name
# of the log file (e.g. "align.3.JOB.log" gives 3), so that runs whose log
# names differ only by such a number start on different machines.
# ---------------------------------------------------------------------------
{
    $offset = 0;
    # The separator must be the regex /\./: a plain "." would be taken as
    # the pattern "any character", which splits every name into nothing.
    foreach my $field (split(/\./, basename($logfile))) {
        if ($field =~ m/^\d+$/) { $offset += $field; }
    }
}
# ---------------------------------------------------------------------------
# Rebuild the command line as a single shell string in $cmd.  Each remaining
# argument is followed by one space and is
#   - copied verbatim when it is one run of non-whitespace characters,
#   - wrapped in single quotes when it contains a double quote,
#   - wrapped in double quotes otherwise.
# ---------------------------------------------------------------------------
$cmd = "";
for my $arg (@ARGV) {
    my $piece = ($arg =~ m/^\S+$/) ? $arg
              : ($arg =~ m:\":)    ? "'$arg'"
              :                      "\"$arg\"";
    $cmd .= "$piece ";
}
# ---------------------------------------------------------------------------
# Launch the jobs: fork one child per job index in $jobstart..$jobend.  Each
# child picks a machine, pipes a small bash script to "ssh MACHINE bash" and
# exits 0 if the remote script succeeded, 1 otherwise.  The parent only
# forks here; the children are reaped later in the script.
# ---------------------------------------------------------------------------
for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) {
    my $childpid = fork();
    if (!defined $childpid) { die "Error forking in ssh.pl (writing to $logfile)"; }
    next if $childpid != 0;    # parent: carry on and launch the next job

    # ---- everything below runs in the child only ----

    # Replace the job placeholder with this job's index.  The edits are
    # private to this child process.
    if (defined $jobname) {
        $cmd =~ s/$jobname/$jobid/g;
        $logfile =~ s/$jobname/$jobid/g;
    }

    # Pick a machine from the job index (1-based) plus the log-name offset.
    # Adding $num_machines before the modulus keeps the dividend
    # non-negative when $local_offset is -1.
    my $local_offset = $offset + $jobid - 1;
    my $num_machines = scalar @machines;
    my $machine = $machines[($local_offset + $num_machines) % $num_machines];

    # List-form pipe open: ssh is exec'd directly with the machine name as a
    # single argument, so no local shell ever interprets that name.
    my $ssh;
    if (!open($ssh, "|-", "ssh", $machine, "bash")) {
        print STDERR "ssh.pl failed to ssh to $machine\n";
        exit(1);
    }

    my $cwd = getcwd();
    my $logdir = dirname($logfile);

    # The script executed by the remote bash.  It changes to the same
    # directory path as here, sources ./path.sh, writes a header to the log,
    # runs the command with stdout and stderr appended to the log, then
    # appends timing and exit status and exits with the command's status.
    print $ssh "set -e\n";     # any failure during setup aborts the job
    print $ssh "cd $cwd\n";
    print $ssh ". ./path.sh\n";
    print $ssh "mkdir -p $logdir\n";
    print $ssh "time1=\`date +\"%s\"\`\n";
    print $ssh "( echo '#' Running on \`hostname\`\n";
    print $ssh " echo '#' Started at \`date\`\n";
    print $ssh " echo -n '# '; cat <<EOF\n";
    print $ssh "$cmd\n";
    print $ssh "EOF\n";
    print $ssh ") >$logfile\n";
    print $ssh "set +e\n";     # the command itself may fail; capture its status
    print $ssh " ( $cmd ) 2>>$logfile >>$logfile\n";
    print $ssh "ret=\$?\n";
    print $ssh "set -e\n";
    print $ssh "time2=\`date +\"%s\"\`\n";
    print $ssh "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=1 >>$logfile\n";
    print $ssh "echo '#' Finished at \`date\` with status \$ret >>$logfile\n";
    print $ssh "exit \$ret";

    # Closing a piped handle waits for ssh and leaves its status in $?.
    # close() also returns false when ssh merely exited non-zero; $! is 0 in
    # that case, so only a non-zero $! indicates a real system-level problem.
    my $closed_ok = close($ssh);
    my $ssh_return_status = $?;
    if (!$closed_ok && $! != 0) { die "ssh.pl: unexpected problem ssh'ing to machine $machine"; }
    exit($ssh_return_status != 0 ? 1 : 0);
}
# ---------------------------------------------------------------------------
# Reap one child per launched job, count the failures, report them on STDERR
# and exit with status 0 if every job succeeded, 1 otherwise.
# ---------------------------------------------------------------------------
my $ret = 0;
my $numfail = 0;
for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) {
    my $r = wait();
    if ($r == -1) { die "Error waiting for child process"; }
    if ($? != 0) { $numfail++; $ret = 1; }
}

if ($ret != 0) {
    my $njobs = $jobend - $jobstart + 1;
    if ($njobs == 1) {
        # Show the real log-file name, with the placeholder filled in.
        if (defined $jobname) {
            $logfile =~ s/$jobname/$jobstart/;
        }
        print STDERR "ssh.pl: job failed, log is in $logfile\n";
        # A literal "JOB" still in the name suggests the job specifier was
        # never passed on the command line.
        if ($logfile =~ m/JOB/) {
            print STDERR "ssh.pl: probably you forgot to put JOB=1:\$nj in your script.\n";
        }
    }
    else {
        # Turn the placeholder into a shell wildcard covering all the logs.
        $logfile =~ s/$jobname/*/g;
        print STDERR "ssh.pl: $numfail / $njobs failed, log is in $logfile\n";
    }
}
exit ($ret);