Commit ab05fbd8 authored by Michael Salim's avatar Michael Salim
Browse files

major reorganization; added setuptools and sphinx docs folder

parent 75d8ac3b
...@@ -12,10 +12,7 @@ https://docs.djangoproject.com/en/1.9/ref/settings/ ...@@ -12,10 +12,7 @@ https://docs.djangoproject.com/en/1.9/ref/settings/
import os,logging import os,logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from balsam.user_settings import *
logger.info('here')
from user_settings import *
...@@ -38,7 +35,7 @@ ALLOWED_HOSTS = [] ...@@ -38,7 +35,7 @@ ALLOWED_HOSTS = []
# Application definition # Application definition
INSTALLED_APPS = [ INSTALLED_APPS = [
'balsam.apps.BalsamCoreConfig', 'balsam.service.apps.BalsamCoreConfig',
'django.contrib.admin', 'django.contrib.admin',
'django.contrib.auth', 'django.contrib.auth',
'django.contrib.contenttypes', 'django.contrib.contenttypes',
...@@ -58,7 +55,7 @@ MIDDLEWARE_CLASSES = [ ...@@ -58,7 +55,7 @@ MIDDLEWARE_CLASSES = [
'django.middleware.clickjacking.XFrameOptionsMiddleware', 'django.middleware.clickjacking.XFrameOptionsMiddleware',
] ]
ROOT_URLCONF = 'argobalsam.urls' ROOT_URLCONF = 'balsam.django_config.urls'
TEMPLATES = [ TEMPLATES = [
{ {
...@@ -76,7 +73,7 @@ TEMPLATES = [ ...@@ -76,7 +73,7 @@ TEMPLATES = [
}, },
] ]
WSGI_APPLICATION = 'argobalsam.wsgi.application' WSGI_APPLICATION = 'balsam.django_config.wsgi.application'
......
...@@ -17,6 +17,5 @@ from django.conf.urls import url,include ...@@ -17,6 +17,5 @@ from django.conf.urls import url,include
from django.contrib import admin from django.contrib import admin
urlpatterns = [ urlpatterns = [
#url(r'^argo/',include('argo.urls')),
url(r'^admin/', admin.site.urls), url(r'^admin/', admin.site.urls),
] ]
...@@ -11,6 +11,6 @@ import os ...@@ -11,6 +11,6 @@ import os
from django.core.wsgi import get_wsgi_application from django.core.wsgi import get_wsgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "argobalsam.settings") os.environ.setdefault("DJANGO_SETTINGS_MODULE", "balsam.django_config.settings")
application = get_wsgi_application() application = get_wsgi_application()
'''Python API for Balsam DAG Manipulations '''Python API for Balsam DAG Manipulations
Example usage: Example usage:
>>> import balsamlauncher.dag as dag >>> import launcher.dag as dag
>>> >>>
>>> output = open('expected_output').read() >>> output = open('expected_output').read()
>>> >>>
...@@ -18,8 +18,8 @@ Example usage: ...@@ -18,8 +18,8 @@ Example usage:
>>> >>>
''' '''
import django as django import django
import os as os import os
import uuid import uuid
__all__ = ['JOB_ID', 'TIMEOUT', 'ERROR', __all__ = ['JOB_ID', 'TIMEOUT', 'ERROR',
...@@ -27,10 +27,10 @@ __all__ = ['JOB_ID', 'TIMEOUT', 'ERROR', ...@@ -27,10 +27,10 @@ __all__ = ['JOB_ID', 'TIMEOUT', 'ERROR',
'add_job', 'add_dependency', 'spawn_child', 'add_job', 'add_dependency', 'spawn_child',
'kill'] 'kill']
os.environ['DJANGO_SETTINGS_MODULE'] = 'argobalsam.settings' os.environ['DJANGO_SETTINGS_MODULE'] = 'balsam.django_config.settings'
django.setup() django.setup()
from balsam.models import BalsamJob as _BalsamJob from balsam.service.models import BalsamJob as _BalsamJob
from django.conf import settings from django.conf import settings
current_job = None current_job = None
......
from collections import defaultdict from collections import defaultdict
from django.conf import settings from django.conf import settings
import balsam.models from balsam.service import models
from balsam.models import BalsamJob BalsamJob = models.BalsamJob
import logging import logging
import uuid import uuid
...@@ -32,7 +32,7 @@ class JobReader(): ...@@ -32,7 +32,7 @@ class JobReader():
def _get_jobs(self): raise NotImplementedError def _get_jobs(self): raise NotImplementedError
def _filter(self, job_queryset): def _filter(self, job_queryset):
jobs = job_queryset.exclude(state__in=balsam.models.END_STATES) jobs = job_queryset.exclude(state__in=models.END_STATES)
jobs = jobs.filter(allowed_work_sites__icontains=settings.BALSAM_SITE) jobs = jobs.filter(allowed_work_sites__icontains=settings.BALSAM_SITE)
return jobs return jobs
......
...@@ -2,33 +2,36 @@ ...@@ -2,33 +2,36 @@
scheduling service and submits directly to a local job queue, or by the scheduling service and submits directly to a local job queue, or by the
Balsam service metascheduler''' Balsam service metascheduler'''
import argparse import argparse
from math import floor
import os import os
from sys import exit from sys import exit
import signal import signal
import time import time
import django import django
os.environ['DJANGO_SETTINGS_MODULE'] = 'argobalsam.settings' os.environ['DJANGO_SETTINGS_MODULE'] = 'balsam.django_config.settings'
django.setup() django.setup()
from django.conf import settings from django.conf import settings
import logging import logging
logger = logging.getLogger('balsamlauncher') logger = logging.getLogger('balsam.launcher')
logger.info("Loading Balsam Launcher") logger.info("Loading Balsam Launcher")
from balsam.schedulers import Scheduler from balsam.service.schedulers import Scheduler
scheduler = Scheduler.scheduler_main scheduler = Scheduler.scheduler_main
from balsamlauncher import jobreader from balsam.launcher import jobreader
from balsamlauncher import transitions from balsam.launcher import transitions
from balsamlauncher import worker from balsam.launcher import worker
from balsamlauncher import runners from balsam.launcher import runners
from balsamlauncher.exceptions import * from balsam.launcher.exceptions import *
ALMOST_RUNNABLE_STATES = ['READY','STAGED_IN']
RUNNABLE_STATES = ['PREPROCESSED', 'RESTART_READY'] RUNNABLE_STATES = ['PREPROCESSED', 'RESTART_READY']
WAITING_STATES = ['CREATED', 'LAUNCHER_QUEUED', 'AWAITING_PARENTS']
HANDLING_EXIT = False HANDLING_EXIT = False
def delay(period=settings.BALSAM_SERVICE_PERIOD): def delay_generator(period=settings.BALSAM_SERVICE_PERIOD):
nexttime = time.time() + period nexttime = time.time() + period
while True: while True:
now = time.time() now = time.time()
...@@ -45,37 +48,15 @@ def elapsed_time_minutes(): ...@@ -45,37 +48,15 @@ def elapsed_time_minutes():
while True: while True:
yield (time.time() - start) / 60.0 yield (time.time() - start) / 60.0
def sufficient_time(job): def remaining_time_minutes(time_limit_minutes=0.0):
return 60*job.wall_time_minutes < scheduler.remaining_time_seconds() elapsed_timer = elapsed_time_minutes()
while True:
def get_runnable_jobs(jobs, running_pks): if time_limit_minutes > 0.0:
runnable_jobs = [job for job in jobs remaining = time_limit_minutes - next(elapsed_timer)
if job.pk not in running_pks and
job.state in RUNNABLE_STATES and
sufficient_time(job)]
return runnable_jobs
def create_new_runners(jobs, runner_group, worker_group):
created_one = False
running_pks = runner_group.running_job_pks
runnable_jobs = get_runnable_jobs(jobs, running_pks)
while runnable_jobs:
logger.debug(f"Have {len(runnable_jobs)} new runnable jobs (out of "
f"{len(jobs)})")
try:
runner_group.create_next_runner(runnable_jobs, worker_group)
except ExceededMaxRunners:
logger.info("Exceeded max concurrent runners; waiting")
break
except NoAvailableWorkers:
logger.info("Not enough idle workers to start any new runs")
break
else: else:
created_one = True remaining = scheduler.remaining_time_seconds() / 60.0
running_pks = runner_group.running_job_pks if remaining > 0: yield remaining
runnable_jobs = get_runnable_jobs(jobs, running_pks) else: break
return created_one
def check_parents(job, lock): def check_parents(job, lock):
job.refresh_from_db() job.refresh_from_db()
...@@ -92,53 +73,85 @@ def check_parents(job, lock): ...@@ -92,53 +73,85 @@ def check_parents(job, lock):
lock.release() lock.release()
logger.info(f'{job.cute_id} waiting for parents') logger.info(f'{job.cute_id} waiting for parents')
def log_time(minutes_left):
if minutes_left > 1e12:
return
whole_minutes = floor(minutes_left)
whole_seconds = round((minutes_left - whole_minutes)*60)
time_str = f"{whole_minutes:02d} min : {whole_seconds:02d} sec remaining"
logger.info(time_str)
def main(args, transition_pool, runner_group, job_source): def main(args, transition_pool, runner_group, job_source):
delay_timer = delay()
elapsed_min = elapsed_time_minutes()
logger.debug(f"time limit provided {args.time_limit_minutes}")
last_created = 0.0
if args.time_limit_minutes > 0:
def timeout():
elapsed = next(elapsed_min)
logger.debug(f"{elapsed} minutes elapsed out of {args.time_limit_minutes}")
return elapsed >= args.time_limit_minutes
else:
timeout = lambda : scheduler.remaining_time_seconds() <= 0.0
while not timeout(): delay_sleeper = delay_generator()
runner_create_period = settings.BALSAM_RUNNER_CREATION_PERIOD_SEC
last_runner_created = time.time()
remaining_timer = remaining_time_minutes(args.time_limit_minutes)
for remaining_minutes in remaining_timer:
logger.info("\n******************\n" logger.info("\n******************\n"
"BEGIN SERVICE LOOP\n" "BEGIN SERVICE LOOP\n"
"******************") "******************")
wait = True log_time(remaining_minutes)
for stat in transition_pool.get_statuses(): wait = False delay = True
# Update after any finished transitions
for stat in transition_pool.get_statuses(): delay = False
job_source.refresh_from_db() job_source.refresh_from_db()
waiting_jobs = (j for j in job_source.jobs if
j.state in 'CREATED AWAITING_PARENTS LAUNCHER_QUEUED'.split()) # Update jobs awaiting dependencies
for job in waiting_jobs: check_parents(job, transition_pool.lock) waiting_jobs = (j for j in job_source.jobs if j.state in WAITING_STATES)
for job in waiting_jobs:
check_parents(job, transition_pool.lock)
# Enqueue new transitions
transitionable_jobs = [ transitionable_jobs = [
job for job in job_source.jobs job for job in job_source.jobs
if job not in transition_pool if job not in transition_pool
and job.state in transitions.TRANSITIONS and job.state in transitions.TRANSITIONS
] ]
for job in transitionable_jobs: for job in transitionable_jobs:
transition_pool.add_job(job) transition_pool.add_job(job)
wait = False delay = False
fxn = transitions.TRANSITIONS[job.state] fxn = transitions.TRANSITIONS[job.state]
logger.info(f"Queued transition: {job.cute_id} will undergo {fxn}") logger.info(f"Queued transition: {job.cute_id} will undergo {fxn}")
# Update jobs that are running/finished
any_finished = runner_group.update_and_remove_finished() any_finished = runner_group.update_and_remove_finished()
if any_finished: wait = False if any_finished: delay = False
job_source.refresh_from_db() job_source.refresh_from_db()
if time.time() - last_created > 5:
created = create_new_runners(job_source.jobs, runner_group, worker_group) # Decide whether or not to start a new runner
if created: runnable_jobs = [
last_created = time.time() job for job in job_source.jobs
wait = False if job.pk not in runner_group.running_job_pks and
if wait: next(delay_timer) job.state in RUNNABLE_STATES and
job.wall_time_minutes <= remaining_minutes
]
logger.debug(f"Have {len(runnable_jobs)} runnable jobs")
almost_runnable = any(j.state in ALMOST_RUNNABLE_STATES for j in job_source.jobs)
now = time.time()
runner_ready = bool(now - last_runner_created > runner_create_period)
num_serial = len([j for j in runnable_jobs if j.num_ranks == 1])
worker = worker_group[0]
max_serial_per_ensemble = 2 * worker.num_nodes * worker.max_ranks_per_node
ensemble_ready = (num_serial >= max_serial_per_ensemble) or (num_serial == 0)
if runnable_jobs:
if runner_ready or not almost_runnable or ensemble_ready:
try:
runner_group.create_next_runner(runnable_jobs, worker_group)
except ExceededMaxRunners:
logger.info("Exceeded max concurrent runners; waiting")
except NoAvailableWorkers:
logger.info("Not enough idle workers to start any new runs")
else:
last_runner_created = now
if delay: next(delay_sleeper)
def on_exit(runner_group, transition_pool, job_source): def on_exit(runner_group, transition_pool, job_source):
global HANDLING_EXIT global HANDLING_EXIT
...@@ -146,9 +159,6 @@ def on_exit(runner_group, transition_pool, job_source): ...@@ -146,9 +159,6 @@ def on_exit(runner_group, transition_pool, job_source):
HANDLING_EXIT = True HANDLING_EXIT = True
logger.debug("Entering on_exit cleanup function") logger.debug("Entering on_exit cleanup function")
logger.debug("on_exit: flush job queue")
transition_pool.flush_job_queue()
logger.debug("on_exit: update/remove/timeout jobs from runner group") logger.debug("on_exit: update/remove/timeout jobs from runner group")
runner_group.update_and_remove_finished(timeout=True) runner_group.update_and_remove_finished(timeout=True)
...@@ -181,7 +191,7 @@ def get_args(inputcmd=None): ...@@ -181,7 +191,7 @@ def get_args(inputcmd=None):
def detect_dead_runners(job_source): def detect_dead_runners(job_source):
for job in job_source.by_states['RUNNING']: for job in job_source.by_states['RUNNING']:
logger.info(f'Picked up running job {job.cute_id}: marking RESTART_READY') logger.info(f'Picked up dead running job {job.cute_id}: marking RESTART_READY')
job.update_state('RESTART_READY', 'Detected dead runner') job.update_state('RESTART_READY', 'Detected dead runner')
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -5,18 +5,17 @@ import logging ...@@ -5,18 +5,17 @@ import logging
import django import django
import signal import signal
os.environ['DJANGO_SETTINGS_MODULE'] = 'argobalsam.settings' os.environ['DJANGO_SETTINGS_MODULE'] = 'balsam.django_config.settings'
django.setup() django.setup()
logger = logging.getLogger('balsamlauncher.mpi_ensemble') logger = logging.getLogger('balsam.launcher.mpi_ensemble')
from subprocess import Popen, STDOUT from subprocess import Popen, STDOUT
from mpi4py import MPI from mpi4py import MPI
from balsamlauncher.util import cd, get_tail from balsam.launcher.util import cd, get_tail
from balsamlauncher.exceptions import * from balsam.launcher.exceptions import *
from balsam.models import BalsamJob from balsam.service.models import BalsamJob
COMM = MPI.COMM_WORLD COMM = MPI.COMM_WORLD
RANK = COMM.Get_rank() RANK = COMM.Get_rank()
......
...@@ -24,16 +24,16 @@ from queue import Queue, Empty ...@@ -24,16 +24,16 @@ from queue import Queue, Empty
from django.conf import settings from django.conf import settings
from django.db import transaction from django.db import transaction
from balsam.models import InvalidStateError from balsam.service.models import InvalidStateError
from balsamlauncher import mpi_commands from balsam.launcher import mpi_commands
from balsamlauncher.exceptions import * from balsam.launcher.exceptions import *
from balsamlauncher.util import cd, get_tail from balsam.launcher.util import cd, get_tail
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from importlib.util import find_spec from importlib.util import find_spec
MPI_ENSEMBLE_EXE = find_spec("balsamlauncher.mpi_ensemble").origin MPI_ENSEMBLE_EXE = find_spec("balsam.launcher.mpi_ensemble").origin
class MonitorStream(Thread): class MonitorStream(Thread):
...@@ -266,7 +266,7 @@ class RunnerGroup: ...@@ -266,7 +266,7 @@ class RunnerGroup:
# If there are not enough serial jobs; run the larger of: # If there are not enough serial jobs; run the larger of:
# largest MPI job that fits, or the remaining serial jobs # largest MPI job that fits, or the remaining serial jobs
if nserial >= nidle_ranks: if nserial >= nidle_ranks:
jobs = serial_jobs[:nidle_ranks] # TODO: try putting ALL serial jobs into one MPIEnsemble jobs = serial_jobs[:2*nidle_ranks] # TODO:Expt w/ > 2 jobs per worker
assigned_workers = idle_workers assigned_workers = idle_workers
runner_class = MPIEnsembleRunner runner_class = MPIEnsembleRunner
msg = (f"Running {len(jobs)} serial jobs on {nidle_workers} workers " msg = (f"Running {len(jobs)} serial jobs on {nidle_workers} workers "
...@@ -295,6 +295,7 @@ class RunnerGroup: ...@@ -295,6 +295,7 @@ class RunnerGroup:
runner.start() runner.start()
self.runners.append(runner) self.runners.append(runner)
for worker in assigned_workers: worker.idle = False for worker in assigned_workers: worker.idle = False
logger.debug(f"Using workers: {[w.id for w in assigned_workers]}")
def update_and_remove_finished(self, timeout=False): def update_and_remove_finished(self, timeout=False):
# TODO: Benchmark performance overhead; does grouping into one # TODO: Benchmark performance overhead; does grouping into one
...@@ -323,6 +324,7 @@ class RunnerGroup: ...@@ -323,6 +324,7 @@ class RunnerGroup:
raise RuntimeError(msg) raise RuntimeError(msg)
else: else:
self.runners.remove(runner) self.runners.remove(runner)
logger.debug(f"Freeing workers: {[w.id for w in runner.worker_list]}")
for worker in runner.worker_list: for worker in runner.worker_list:
worker.idle = True worker.idle = True
......
...@@ -17,13 +17,13 @@ from django.core.exceptions import ObjectDoesNotExist ...@@ -17,13 +17,13 @@ from django.core.exceptions import ObjectDoesNotExist
from django.conf import settings from django.conf import settings
from django import db from django import db
from common import transfer from balsam.common import transfer
from balsamlauncher.exceptions import * from balsam.launcher.exceptions import *
from balsam.models import BalsamJob, NoApplication from balsam.service.models import BalsamJob, NoApplication
from balsamlauncher.util import get_tail from balsam.launcher.util import get_tail
import logging import logging
logger = logging.getLogger('balsamlauncher.transitions') logger = logging.getLogger('balsam.launcher.transitions')
# SQLite exclusive lock is broken on Windows & OSX; even with two writers, two # SQLite exclusive lock is broken on Windows & OSX; even with two writers, two
# records, and a long timeout, a "database locked" exception is thrown # records, and a long timeout, a "database locked" exception is thrown
...@@ -50,7 +50,6 @@ POSTPROCESS_TIMEOUT_SECONDS = 300 ...@@ -50,7 +50,6 @@ POSTPROCESS_TIMEOUT_SECONDS = 300
SITE = settings.BALSAM_SITE SITE = settings.BALSAM_SITE
StatusMsg = namedtuple('StatusMsg', ['pk', 'state', 'msg']) StatusMsg = namedtuple('StatusMsg', ['pk', 'state', 'msg'])
JobMsg = namedtuple('JobMsg', ['priority', 'jobid'])
def on_exit(): def on_exit():
logger.debug("TransitionProc caught SIGTERM: do nothing and wait for end") logger.debug("TransitionProc caught SIGTERM: do nothing and wait for end")
...@@ -85,6 +84,7 @@ def main(job_queue, status_queue, lock): ...@@ -85,6 +84,7 @@ def main(job_queue, status_queue, lock):
lock.acquire() lock.acquire()
job.update_state('FAILED', str(e)) job.update_state('FAILED', str(e))
lock.release() lock.release()
s = StatusMsg(job.pk, 'FAILED', str(e)) s = StatusMsg(job.pk, 'FAILED', str(e))
status_queue.put(s) status_queue.put(s)
buf = StringIO() buf = StringIO()
...@@ -141,8 +141,8 @@ class TransitionProcessPool: ...@@ -141,8 +141,8 @@ class TransitionProcessPool:
if job.state not in TRANSITIONS: raise TransitionNotFoundError if job.state not in TRANSITIONS: raise TransitionNotFoundError
priority = PRIORITIES[job.state] priority = PRIORITIES[job.state]
m = JobMsg(priority, job.pk) job_msg = (priority, job.pk)
self.job_queue.put(m) self.job_queue.put(job_msg)
self.transitions_pk_list.append(job.pk) self.transitions_pk_list.append(job.pk)
def get_statuses(self): def get_statuses(self):