Commit fab18048 authored by Kevin Harms's avatar Kevin Harms
Browse files

Updated to allow specifying a restart file that will track progress and

generate checkpoint/restart files for each fsstats run.
Checkpoints are stored in a checkpoints directory that must be created
by the user.


git-svn-id: https://svn.mcs.anl.gov/repos/darshan/trunk@201 3b7491f3-a168-0410-bf4b-c445ed680a29
parent 699c1d06
...@@ -7,13 +7,19 @@ ...@@ -7,13 +7,19 @@
use Cwd; use Cwd;
use Getopt::Long; use Getopt::Long;
use English; use English;
use File::Temp qw(tempfile);
my $hostfile; my $hostfile;
my $dirfile; my $dirfile;
my $restartfile;
my $shareddir;
my @hosts; my @hosts;
my @dirs; my @dirs;
my %pidmapping = (); my %pidmapping = ();
my %pidmapping_dir = (); my %pidmapping_dir = ();
my %pidmapping_time = ();
my %dirtiming = ();
my %chkpntlog = ();
process_args(); process_args();
...@@ -29,15 +35,63 @@ open(FILE, $dirfile) or die("Error opening dirfile."); ...@@ -29,15 +35,63 @@ open(FILE, $dirfile) or die("Error opening dirfile.");
chomp(@dirs); chomp(@dirs);
close(FILE); close(FILE);
# Load any previous progress from the restart file. Each record is
#   status,elapsedtime,checkpoint,dir
# and the split is capped at four fields so a directory name that contains
# commas stays intact. An unspecified, missing, or unreadable restart file
# simply means there is no earlier progress to resume from.
if (defined $restartfile && open(my $restart_fh, '<', $restartfile))
{
    while (my $line = <$restart_fh>)
    {
        chomp($line);
        my ($status, $elapsedtime, $checkpoint, $dir) = split(/,/, $line, 4);

        # Skip blank or truncated records instead of storing an entry
        # under an undefined/empty directory key.
        next unless defined $dir && length $dir;

        $chkpntlog{$dir} = [$checkpoint, $status, $elapsedtime];
    }
    close($restart_fh);
}
my $process_forked = 0;
# loop as long as we still have directories to process or outstanding jobs # loop as long as we still have directories to process or outstanding jobs
while ($#dirs > -1 || keys(%pidmapping)) ITERATION: while ($#dirs > -1 || keys(%pidmapping))
{ {
if($#hosts > -1 && $#dirs > -1) if($#hosts > -1 && $#dirs > -1)
{ {
# we have work to do and a free host to do it on # we have work to do and a free host to do it on
# grab a directory and host out of the lists # grab a directory and host out of the lists
my $dir = pop(@dirs); my $dir = pop(@dirs);
my $need_restart = 0;
if (not defined $chkpntlog{$dir})
{
my ($fd, $chkname) = tempfile("checkpoints/chkXXXXXXXX", DIR=>$shareddir,
SUFFIX=>".fsstats");
close($fd);
$chkpntlog{$dir} = [$chkname, 0, 0];
$need_restart = 0;
}
else
{
$need_restart = 1;
}
my $info = $chkpntlog{$dir};
$dirtiming{$dir} = $info->[2];
#
# Check to see if this dir is completed
#
if ($info->[1] == 1)
{
print("dir completed: $dir\n");
next ITERATION;
}
if ($need_restart == 1 && $info->[1] == 0)
{
print("restarting dir: $dir\n");
}
my $host = pop(@hosts); my $host = pop(@hosts);
my $pid = fork(); my $pid = fork();
...@@ -46,7 +100,7 @@ while ($#dirs > -1 || keys(%pidmapping)) ...@@ -46,7 +100,7 @@ while ($#dirs > -1 || keys(%pidmapping))
die("Error: could not fork."); die("Error: could not fork.");
} }
my $cmd = "./fsstats-runner.bash $host $dir"; my $cmd = "./fsstats-runner.bash $host $dir $info->[0] $need_restart";
if($pid == 0) if($pid == 0)
{ {
...@@ -73,18 +127,24 @@ while ($#dirs > -1 || keys(%pidmapping)) ...@@ -73,18 +127,24 @@ while ($#dirs > -1 || keys(%pidmapping))
# parent keeps up with what child is running where # parent keeps up with what child is running where
$pidmapping{$pid} = $host; $pidmapping{$pid} = $host;
$pidmapping_dir{$pid} = $dir; $pidmapping_dir{$pid} = $dir;
$pidmapping_time{$pid} = time();
$process_forked = 1;
} }
} }
else else
{ {
# we have launched as much as we can; wait for something to finish # we have launched as much as we can; wait for something to finish
if ($process_forked)
{
my $child = waitpid(-1, 0); my $child = waitpid(-1, 0);
if($child < 1) if($child < 1)
{ {
die("Error: lost track of a child process.\n"); die("Error: lost track of a child process.\n");
} }
my $child_error_code = $?; my $child_error_code = $?;
my $elapsedtime = time() - $pidmapping_time{$child};
print("fsstats of $pidmapping_dir{$child} on $pidmapping{$child} "); print("fsstats of $pidmapping_dir{$child} on $pidmapping{$child} ");
if($child_error_code == 0) if($child_error_code == 0)
{ {
...@@ -92,24 +152,49 @@ while ($#dirs > -1 || keys(%pidmapping)) ...@@ -92,24 +152,49 @@ while ($#dirs > -1 || keys(%pidmapping))
} }
else else
{ {
print(" FAILURE (continuing).\n"); print(" FAILURE [$child_error_code] (continuing).\n");
} }
# update checkpoint
my $update = $chkpntlog{$pidmapping_dir{$child}};
$update->[1] = 1;
$update->[2] = $elapsedtime;
$chkpntlog{$pidmapping_dir{$child}} = $update;
dump_checkpoint($restartfile, \%chkpntlog);
# store total time to process directory
$dirtiming{$pidmapping_dir{$child}} = $elapsedtime;
# put the host back on the queue # put the host back on the queue
push(@hosts, $pidmapping{$child}); push(@hosts, $pidmapping{$child});
delete($pidmapping{$child}); delete($pidmapping{$child});
delete($pidmapping_dir{$child}); delete($pidmapping_dir{$child});
delete($pidmapping_time{$child});
}
} }
} }
# Report the elapsed time recorded for each directory, longest first.
# Elapsed times are second counts, so they are compared numerically; a
# string comparison would rank "9" above "10".
foreach my $key (sort { $dirtiming{$b} <=> $dirtiming{$a} } keys %dirtiming)
{
    print "$key $dirtiming{$key}\n";
}
exit 0;
#
# Argument Parsing
#
sub process_args sub process_args
{ {
use vars qw( $opt_help ); my $help_switch;
Getopt::Long::Configure("no_ignore_case", "bundling"); Getopt::Long::Configure("no_ignore_case", "bundling");
GetOptions( "help" ); GetOptions( "help" => \$help_switch,
"restart=s" => \$restartfile,
"shareddir=s" => \$shareddir );
if($opt_help) if($help_switch)
{ {
print_help(); print_help();
exit(0); exit(0);
...@@ -129,6 +214,28 @@ sub process_args ...@@ -129,6 +214,28 @@ sub process_args
return; return;
} }
#
# Dumps checkpoint to restart file.
# Rewrites the whole file, one record per directory, in the form
#   status,elapsedtime,checkpoint,dir
# Records are written in sorted directory order so the file contents are
# reproducible from run to run.
#
# Arguments:
#   $file - path of the restart file to (re)write
#   $hash - reference to a hash mapping
#           dir => [checkpoint, status, elapsedtime]
#
# Returns 1 on success. Dies if the file cannot be opened, written,
# or closed.
#
sub dump_checkpoint
{
    my ($file, $hash) = @_;

    # No restart file was requested. With an undefined name, three-argument
    # open would create an anonymous temporary file and the records would be
    # silently discarded, so skip the work explicitly.
    return 1 if not defined $file;

    open(my $chkpnt, ">", $file) or die "checkpoint failed: $file: $!\n";
    foreach my $key (sort keys %$hash)
    {
        my $item = $hash->{$key};
        print {$chkpnt} "$item->[1],$item->[2],$item->[0],$key\n"
            or die "checkpoint failed: $file: $!\n";
    }

    # Buffered write errors (e.g. a full disk) only surface at close.
    close($chkpnt) or die "checkpoint failed: $file: $!\n";
    return 1;
}
#
# Usage info
#
sub print_help sub print_help
{ {
print <<EOF; print <<EOF;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment