Commit 2d354b85 authored by Michael Salim

MPIRunner unit tests and bugfixes

parent 13d7915a
@@ -306,8 +306,8 @@ auto timeout retry: {self.auto_timeout_retry}
     @property
     def app_cmd(self):
         if self.application:
-            app = ApplicationDefinition.objects.get(name=job.application)
-            return f"{app.executable} {app.application_args}"
+            app = ApplicationDefinition.objects.get(name=self.application)
+            return f"{app.executable} {self.application_args}"
         else:
             return self.direct_command
@@ -345,7 +345,9 @@ auto timeout retry: {self.auto_timeout_retry}
         return {variable:value for (variable,value) in entries}

     def get_envs(self, *, timeout=False, error=False):
-        envs = os.environ.copy()
+        keywords = 'PATH LIBRARY BALSAM DJANGO PYTHON'.split()
+        envs = {var:value for var,value in os.environ.items()
+                if any(keyword in var for keyword in keywords)}
         try:
             app = self.get_application()
         except NoApplication:
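The narrowed copy keeps only environment variables whose names contain one of the listed keywords, rather than forwarding the whole environment. A quick standalone check of the same comprehension (the sample variable names below are made up for illustration):

# Sample environment for illustration only; the real values come from os.environ
sample_env = {
    'PATH': '/usr/bin',
    'LD_LIBRARY_PATH': '/opt/lib',
    'BALSAM_JOB_ID': 'abc',
    'HOME': '/home/user',          # dropped: matches no keyword
}
keywords = 'PATH LIBRARY BALSAM DJANGO PYTHON'.split()
envs = {var: value for var, value in sample_env.items()
        if any(keyword in var for keyword in keywords)}
print(sorted(envs))   # ['BALSAM_JOB_ID', 'LD_LIBRARY_PATH', 'PATH']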
@@ -402,10 +404,15 @@ auto timeout retry: {self.auto_timeout_retry}
             top = os.path.join(top, self.workflow)
         name = self.name.replace(' ', '_')
         path = os.path.join(top, name)
-        if os.path.exists(path): path += "_"
-        for char in str(self.job_id):
-            path += char
-            if not os.path.exists(path): break
+        if os.path.exists(path):
+            jid = str(self.job_id)
+            path += "_" + jid[0]
+            i = 1
+            while os.path.exists(path):
+                path += jid[i]
+                i += 1
         os.makedirs(path)
         self.working_directory = path
         self.save(update_fields=['working_directory'])
...
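The new naming loop appends successive characters of the job's UUID until an unused directory name is found, instead of the old per-character existence check. A standalone sketch of the same behavior (the directory and job names here are purely illustrative):

import os
import tempfile
import uuid

def unique_path(top, name, job_id):
    # Mirror the working-directory naming: start from top/name and, on a
    # collision, append "_" plus successive UUID characters until unused.
    path = os.path.join(top, name)
    if os.path.exists(path):
        jid = str(job_id)
        path += "_" + jid[0]
        i = 1
        while os.path.exists(path):
            path += jid[i]
            i += 1
    return path

with tempfile.TemporaryDirectory() as top:
    jid = uuid.uuid4()
    first = unique_path(top, "my_job", jid)
    os.makedirs(first)                        # .../my_job
    second = unique_path(top, "my_job", jid)  # .../my_job_<first UUID character>
    print(first, second)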
@@ -83,8 +83,9 @@ class Scheduler:
             self.remaining_seconds -= elapsed_time
             self.last_check_seconds = now
             return self.remaining_seconds
-        sched_id = self.current_scheduler_id
+        else:
+            sched_id = self.current_scheduler_id
         if sched_id is None:
             return float("inf")
         try:
...
@@ -39,8 +39,8 @@ children = None
 _envs = {k:v for k,v in os.environ.items() if k.find('BALSAM')>=0}
 JOB_ID = _envs.get('BALSAM_JOB_ID', '')
-TIMEOUT = bool(_envs.get('BALSAM_JOB_TIMEOUT', False))
-ERROR = bool(_envs.get('BALSAM_JOB_ERROR', False))
+TIMEOUT = _envs.get('BALSAM_JOB_TIMEOUT', False) == "TRUE"
+ERROR = _envs.get('BALSAM_JOB_ERROR', False) == "TRUE"
 if JOB_ID:
     JOB_ID = uuid.UUID(JOB_ID)
...
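The point of this fix is that bool() on an environment-variable string is truthy for any non-empty value, so a flag exported as the string "FALSE" would still register as set; comparing against the literal "TRUE" gives the intended behavior. A minimal illustration (whether the launcher ever exports "FALSE" is not shown in this diff):

import os

os.environ['BALSAM_JOB_TIMEOUT'] = "FALSE"    # hypothetical exported value
_envs = {k: v for k, v in os.environ.items() if 'BALSAM' in k}

old_style = bool(_envs.get('BALSAM_JOB_TIMEOUT', False))      # True  -- any non-empty string is truthy
new_style = _envs.get('BALSAM_JOB_TIMEOUT', False) == "TRUE"  # False -- explicit string comparison
print(old_style, new_style)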
@@ -78,7 +78,9 @@ def main(args, transition_pool, runner_group, job_source):
     timeout = lambda : scheduler.remaining_time_seconds() <= 0.0

     while not timeout():
-        logger.debug("\n************\nSERVICE LOOP\n************")
+        logger.debug("\n******************\n"
+                     "BEGIN SERVICE LOOP\n"
+                     "******************")
         wait = True
         for stat in transition_pool.get_statuses():
...
@@ -29,6 +29,15 @@ logger = logging.getLogger(__name__)
 from importlib.util import find_spec
 MPI_ENSEMBLE_EXE = find_spec("balsamlauncher.mpi_ensemble").origin

+def get_tail(fname, nlines=5, indent=' '):
+    '''Return the last nlines of a text file, with each line indented'''
+    proc = Popen(f'tail -n {nlines} {fname}'.split(), stdout=PIPE,
+                 stderr=STDOUT)
+    tail = proc.communicate()[0].decode()  # decode bytes before splitting on newlines
+    lines = tail.split('\n')
+    for i, line in enumerate(lines[:]):
+        lines[i] = indent + line
+    return '\n'.join(lines)

 class MonitorStream(Thread):
     '''Thread: non-blocking read of a process's stdout'''
@@ -98,6 +107,9 @@ class MPIRunner(Runner):
         tpr = job.threads_per_rank
         tpc = job.threads_per_core

+        # Note that environment variables are passed through the MPI run command
+        # line, rather than Popen directly, due to ALCF restrictions:
+        # https://www.alcf.anl.gov/user-guides/running-jobs-xc40#environment-variables
         mpi_str = self.mpi_cmd(worker_list, app_cmd=app_cmd, envs=envs,
                                num_ranks=nranks, ranks_per_node=rpn,
                                threads_per_rank=tpr, threads_per_core=tpc)
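As a rough sketch of what carrying the environment on the MPI launch line (rather than via Popen's env argument) can look like: the real handling lives in self.mpi_cmd, which is not part of this diff, and the helper below is hypothetical; "-e VAR=value" and "-x VAR=value" are the usual flags for aprun and Open MPI's mpirun respectively.

# Hypothetical illustration only: pass env vars on the launch command line
# instead of through Popen(env=...).
def build_mpi_cmd(app_cmd, envs, num_ranks, launcher="aprun"):
    if launcher == "aprun":   # Cray / ALCF-style launcher
        env_flags = ' '.join(f"-e {var}={val}" for var, val in envs.items())
    else:                     # Open MPI-style launcher
        env_flags = ' '.join(f"-x {var}={val}" for var, val in envs.items())
    return f"{launcher} -n {num_ranks} {env_flags} {app_cmd}"

print(build_mpi_cmd("python mock_mpi_app.py", {"BALSAM_JOB_ID": "1234"}, 4))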
@@ -111,6 +123,7 @@ class MPIRunner(Runner):
         self.popen_args['stderr'] = STDOUT
         self.popen_args['bufsize'] = 1
         logger.info(f"MPI Runner Popen args: {self.popen_args['args']}")
+        logger.info(f"MPI Runner: writing output to {outname}")

     def update_jobs(self):
         job = self.jobs[0]
@@ -125,12 +138,16 @@ class MPIRunner(Runner):
             curstate = 'RUN_DONE'
             msg = ''
         else:
+            logger.debug(f"MPI Job {job.cute_id} return code!=0: error")
             curstate = 'RUN_ERROR'
-            msg = str(retcode)
+            self.process.communicate()
+            self.outfile.close()
+            tail = get_tail(self.outfile.name)
+            msg = f"RETURN CODE {retcode}:\n{tail}"
+            logger.debug(msg)
         if job.state != curstate: job.update_state(curstate, msg) # TODO: handle RecordModified

 class MPIEnsembleRunner(Runner):
     '''One subprocess: an ensemble of serial jobs run in an mpi4py wrapper'''

     def __init__(self, job_list, worker_list):
@@ -272,14 +289,18 @@ class RunnerGroup:
         for runner in self.runners: runner.update_jobs()
         self.lock.release()

-        for runner in self.runners[:]:
-            if runner.finished():
-                for job in runner.jobs:
-                    if job.state not in 'RUN_DONE RUN_ERROR RUN_TIMEOUT'.split():
-                        msg = (f"Job {job.cute_id} runner process done, but "
-                               "failed to update job state.")
-                        logger.exception(msg)
-                        raise RuntimeError(msg)
+        finished_runners = (r for r in self.runners if r.finished())
+
+        for runner in finished_runners:
+            if any(j.state not in ['RUN_DONE','RUN_ERROR','RUN_TIMEOUT'] for j in runner.jobs):
+                self.lock.acquire()
+                runner.update_jobs()
+                self.lock.release()
+                if any(j.state not in ['RUN_DONE','RUN_ERROR','RUN_TIMEOUT'] for j in runner.jobs):
+                    msg = (f"Job {job.cute_id} runner process done, but failed to update job state.")
+                    logger.exception(msg)
+                    raise RuntimeError(msg)
+            else:
                 any_finished = True
                 self.runners.remove(runner)
                 for worker in runner.worker_list:
...
@@ -40,6 +40,9 @@ class WorkerGroup:
     def __iter__(self):
         return iter(self.workers)

+    def __len__(self):
+        return len(self.workers)
+
     def __getitem__(self, i):
         return self.workers[i]
...
@@ -4,14 +4,16 @@ import django
 import tempfile
 import unittest

-tempdir = tempfile.TemporaryDirectory()
-os.environ['BALSAM_TEST_DIRECTORY'] = tempdir.name
-os.environ['BALSAM_TEST']='1'
-os.environ['DJANGO_SETTINGS_MODULE'] = 'argobalsam.settings'
-django.setup()

 if __name__ == "__main__":
+    tempdir = tempfile.TemporaryDirectory(dir=os.getcwd(), prefix="testdata_")
+    os.environ['BALSAM_TEST_DIRECTORY'] = tempdir.name
+    os.environ['BALSAM_TEST']='1'
+    os.environ['DJANGO_SETTINGS_MODULE'] = 'argobalsam.settings'
+    django.setup()
+
     loader = unittest.defaultTestLoader
     if len(sys.argv) > 1:
         names = sys.argv[1:]
...
+from mpi4py import MPI
+import argparse
+import time
+from sys import exit
+
+rank = MPI.COMM_WORLD.Get_rank()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--sleep', type=int, default=0)
+    parser.add_argument('--retcode', type=int, default=0)
+    args = parser.parse_args()
+
+    print("Rank", rank, "on", MPI.Get_processor_name())
+    if args.sleep:
+        time.sleep(args.sleep)
+    exit(args.retcode)
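For a quick manual check of this mock application outside the test harness, something along these lines works; the mpirun launcher and the tests/mock_mpi_app.py path are assumptions for illustration (the tests locate the module via find_spec):

import subprocess
import sys

# Two ranks print their rank and hostname, sleep one second, then exit 0.
cmd = f"mpirun -n 2 {sys.executable} tests/mock_mpi_app.py --sleep 1 --retcode 0"
result = subprocess.run(cmd.split())
print("exit code:", result.returncode)   # expected: 0, matching --retcode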
+from collections import namedtuple
+import os
+import sys
+import time
+from importlib.util import find_spec
 from tests.BalsamTestCase import BalsamTestCase, cmdline
+from django.conf import settings

 from balsam.schedulers import Scheduler
 from balsam.models import BalsamJob, ApplicationDefinition
@@ -8,30 +15,82 @@ from balsamlauncher import worker
 from balsamlauncher import runners
 from balsamlauncher.launcher import get_args, create_new_runners

-class TestRunners(BalsamTestCase):
-    '''Integration test for WorkerGroup, JobReader, and Runners/RunnerGroup'''
+class TestMPIRunner(BalsamTestCase):
+    '''start, update_jobs, finished, error/timeout handling'''
     def setUp(self):
-        self.scheduler = Scheduler.scheduler_main
-        self.host_type = self.scheduler.host_type
+        scheduler = Scheduler.scheduler_main
+        self.host_type = scheduler.host_type
         if self.host_type == 'DEFAULT':
-            config = get_args('--consume-all --num-workers 4 --max-ranks-per-node 4'.split())
+            config = get_args('--consume-all --num-workers 2 --max-ranks-per-node 4'.split())
         else:
             config = get_args('--consume-all')
         self.worker_group = worker.WorkerGroup(config, host_type=self.host_type,
-                                               workers_str=scheduler.workers_str)
+                                               workers_str=scheduler.workers_str,
+                                               workers_file=scheduler.workers_file)
         self.job_source = jobreader.JobReader.from_config(config)

+        app_path = f"{sys.executable} {find_spec('tests.mock_mpi_app').origin}"
+        self.app = ApplicationDefinition()
+        self.app.name = "mock_mpi"
+        self.app.description = "print and sleep"
+        self.app.executable = app_path
+        self.app.save()
+
+    def assert_output_file_contains_n_ranks(self, fp, n):
+        '''specific check of mock_mpi_app.py output'''
+        found = []
+        for line in fp:
+            found.append(int(line.split()[1]))
+        self.assertSetEqual(set(range(n)), set(found))
     def testMPIRunner_passes(self):
-        # varying ranks, rpn, tpr, tpc, envs
-        # varying application args
-        # check for successful job run, update, and output
-        pass
+        # Test a few worker/rank configurations:
+        work_configs = []
+        WorkerConfig = namedtuple('WorkerConfig', ['workers', 'num_nodes',
+                                                   'ranks_per_node'])
+
+        # 2 ranks on one node
+        node0 = self.worker_group[0]
+        cfg = WorkerConfig([node0], 1, 2)
+        work_configs.append(cfg)
+
+        # max ranks on one node
+        cfg = WorkerConfig([node0], 1, node0.max_ranks_per_node)
+        work_configs.append(cfg)
+
+        # max ranks on all nodes
+        cfg = WorkerConfig(list(self.worker_group), len(self.worker_group),
+                           node0.max_ranks_per_node)
+        work_configs.append(cfg)
+        for i, (workers, num_nodes, rpn) in enumerate(work_configs):
+            job = BalsamJob()
+            job.name = f"test{i}"
+            job.application = "mock_mpi"
+            job.allowed_work_sites = settings.BALSAM_SITE
+            job.num_nodes = num_nodes
+            job.ranks_per_node = rpn
+            job.save()
+            self.assertEquals(job.state, 'CREATED')
+
+            job.create_working_path()
+            runner = runners.MPIRunner([job], workers)
+            runner.start()
+            runner.update_jobs()
+            while not runner.finished():
+                self.assertEquals(job.state, 'RUNNING')
+                runner.update_jobs()
+                time.sleep(0.5)
+            runner.update_jobs()
+            self.assertEquals(job.state, 'RUN_DONE')
+
+            outpath = os.path.join(job.working_directory, f"test{i}.out")
+            self.assertEqual(outpath, runner.outfile.name)
+            with open(outpath) as fp:
+                self.assert_output_file_contains_n_ranks(fp, num_nodes*rpn)
     def testMPIRunner_fails(self):
         # ensure correct when job returns nonzero
@@ -41,6 +100,21 @@ class TestRunners(BalsamTestCase):
         # ensure correct when long-running job times out
         pass

+class TestMPIEnsemble:
+    def setUp(self):
+        pass
+
+    def testMPIEnsembleRunner(self):
+        '''Several non-MPI jobs packaged into one mpi4py wrapper'''
+        # Some jobs will pass; some will fail; some will timeout
+        pass
+
+class TestRunnerGroup:
+    def setUp(self):
+        pass
+
     def test_create_runners(self):
         # Create sets of jobs intended to exercise each code path
         # in a single call to launcher.create_new_runners()
...
@@ -5,6 +5,7 @@ logger = logging.getLogger('console')
 try:
     INSTALL_PATH = os.environ['ARGOBALSAM_INSTALL_PATH']
+    LOGGING_DIRECTORY = os.path.join(INSTALL_PATH, 'log') # where to store log files
 except KeyError as e:
     logger.error('Environment not setup: ' + str(e))
     raise
@@ -147,7 +148,6 @@ SENDER_CONFIG = {
 #------------------------------
 # logging settings
 #------------------------------
-LOGGING_DIRECTORY = os.path.join(INSTALL_PATH, 'log') # where to store log files
 LOG_HANDLER_LEVEL = 'DEBUG'
 LOG_BACKUP_COUNT = 5 # number of files worth of history
 LOG_FILE_SIZE_LIMIT = 100 * 1024 * 1024 # file size at which to move to a new log file
@@ -231,6 +231,7 @@ elif 'launcher' in ' '.join(sys.argv):
 else:
     logger = logging.getLogger('console')

 def log_uncaught_exceptions(exctype, value, tb,logger=logger):
     logger.error(f"Uncaught Exception {exctype}: {value}",exc_info=(exctype,value,tb))
 logger = logging.getLogger('console')
...