Commit ab05fbd8 authored by Michael Salim
Browse files

major reorganization; added setuptools and sphinx docs folder

parent 75d8ac3b
......@@ -12,10 +12,7 @@ https://docs.djangoproject.com/en/1.9/ref/settings/
import os,logging
logger = logging.getLogger(__name__)
logger.info('here')
from user_settings import *
from balsam.user_settings import *
......@@ -38,7 +35,7 @@ ALLOWED_HOSTS = []
# Application definition
INSTALLED_APPS = [
'balsam.apps.BalsamCoreConfig',
'balsam.service.apps.BalsamCoreConfig',
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
......@@ -58,7 +55,7 @@ MIDDLEWARE_CLASSES = [
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'argobalsam.urls'
ROOT_URLCONF = 'balsam.django_config.urls'
TEMPLATES = [
{
......@@ -76,7 +73,7 @@ TEMPLATES = [
},
]
WSGI_APPLICATION = 'argobalsam.wsgi.application'
WSGI_APPLICATION = 'balsam.django_config.wsgi.application'
......
......@@ -17,6 +17,5 @@ from django.conf.urls import url,include
from django.contrib import admin
urlpatterns = [
#url(r'^argo/',include('argo.urls')),
url(r'^admin/', admin.site.urls),
]
......@@ -11,6 +11,6 @@ import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "argobalsam.settings")
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "balsam.django_config.settings")
application = get_wsgi_application()
'''Python API for Balsam DAG Manipulations
Example usage:
>>> import balsamlauncher.dag as dag
>>> import balsam.launcher.dag as dag
>>>
>>> output = open('expected_output').read()
>>>
......@@ -18,8 +18,8 @@ Example usage:
>>>
'''
import django as django
import os as os
import django
import os
import uuid
__all__ = ['JOB_ID', 'TIMEOUT', 'ERROR',
......@@ -27,10 +27,10 @@ __all__ = ['JOB_ID', 'TIMEOUT', 'ERROR',
'add_job', 'add_dependency', 'spawn_child',
'kill']
os.environ['DJANGO_SETTINGS_MODULE'] = 'argobalsam.settings'
os.environ['DJANGO_SETTINGS_MODULE'] = 'balsam.django_config.settings'
django.setup()
from balsam.models import BalsamJob as _BalsamJob
from balsam.service.models import BalsamJob as _BalsamJob
from django.conf import settings
current_job = None
......
from collections import defaultdict
from django.conf import settings
import balsam.models
from balsam.models import BalsamJob
from balsam.service import models
BalsamJob = models.BalsamJob
import logging
import uuid
......@@ -32,7 +32,7 @@ class JobReader():
def _get_jobs(self): raise NotImplementedError
def _filter(self, job_queryset):
jobs = job_queryset.exclude(state__in=balsam.models.END_STATES)
jobs = job_queryset.exclude(state__in=models.END_STATES)
jobs = jobs.filter(allowed_work_sites__icontains=settings.BALSAM_SITE)
return jobs
......
......@@ -2,33 +2,36 @@
scheduling service and submits directly to a local job queue, or by the
Balsam service metascheduler'''
import argparse
from math import floor
import os
from sys import exit
import signal
import time
import django
os.environ['DJANGO_SETTINGS_MODULE'] = 'argobalsam.settings'
os.environ['DJANGO_SETTINGS_MODULE'] = 'balsam.django_config.settings'
django.setup()
from django.conf import settings
import logging
logger = logging.getLogger('balsamlauncher')
logger = logging.getLogger('balsam.launcher')
logger.info("Loading Balsam Launcher")
from balsam.schedulers import Scheduler
from balsam.service.schedulers import Scheduler
scheduler = Scheduler.scheduler_main
from balsamlauncher import jobreader
from balsamlauncher import transitions
from balsamlauncher import worker
from balsamlauncher import runners
from balsamlauncher.exceptions import *
from balsam.launcher import jobreader
from balsam.launcher import transitions
from balsam.launcher import worker
from balsam.launcher import runners
from balsam.launcher.exceptions import *
ALMOST_RUNNABLE_STATES = ['READY','STAGED_IN']
RUNNABLE_STATES = ['PREPROCESSED', 'RESTART_READY']
WAITING_STATES = ['CREATED', 'LAUNCHER_QUEUED', 'AWAITING_PARENTS']
HANDLING_EXIT = False
def delay(period=settings.BALSAM_SERVICE_PERIOD):
def delay_generator(period=settings.BALSAM_SERVICE_PERIOD):
nexttime = time.time() + period
while True:
now = time.time()
......@@ -45,37 +48,15 @@ def elapsed_time_minutes():
while True:
yield (time.time() - start) / 60.0
def sufficient_time(job):
    '''Return True when the job's wall time is strictly less than the time
    the scheduler reports as remaining.

    job.wall_time_minutes is converted to seconds and compared against
    scheduler.remaining_time_seconds().
    '''
    required_seconds = 60 * job.wall_time_minutes
    available_seconds = scheduler.remaining_time_seconds()
    return required_seconds < available_seconds
def get_runnable_jobs(jobs, running_pks):
    '''Return the list of jobs that may be handed to a new runner.

    A job is kept when its pk is not in running_pks, its state is one of
    RUNNABLE_STATES, and sufficient_time(job) is true. The checks are made
    in that order and stop at the first one that fails.
    '''
    selected = []
    for candidate in jobs:
        if candidate.pk in running_pks:
            continue
        if candidate.state not in RUNNABLE_STATES:
            continue
        if not sufficient_time(candidate):
            continue
        selected.append(candidate)
    return selected
def create_new_runners(jobs, runner_group, worker_group):
created_one = False
running_pks = runner_group.running_job_pks
runnable_jobs = get_runnable_jobs(jobs, running_pks)
while runnable_jobs:
logger.debug(f"Have {len(runnable_jobs)} new runnable jobs (out of "
f"{len(jobs)})")
try:
runner_group.create_next_runner(runnable_jobs, worker_group)
except ExceededMaxRunners:
logger.info("Exceeded max concurrent runners; waiting")
break
except NoAvailableWorkers:
logger.info("Not enough idle workers to start any new runs")
break
def remaining_time_minutes(time_limit_minutes=0.0):
elapsed_timer = elapsed_time_minutes()
while True:
if time_limit_minutes > 0.0:
remaining = time_limit_minutes - next(elapsed_timer)
else:
created_one = True
running_pks = runner_group.running_job_pks
runnable_jobs = get_runnable_jobs(jobs, running_pks)
return created_one
remaining = scheduler.remaining_time_seconds() / 60.0
if remaining > 0: yield remaining
else: break
def check_parents(job, lock):
job.refresh_from_db()
......@@ -92,53 +73,85 @@ def check_parents(job, lock):
lock.release()
logger.info(f'{job.cute_id} waiting for parents')
def log_time(minutes_left):
    '''Log the remaining time as "MM min : SS sec remaining" at INFO level.

    Args:
        minutes_left: remaining time in minutes (int or float).

    Returns:
        None. Nothing is logged when minutes_left exceeds 1e12.
    '''
    # NOTE(review): presumably a huge value means "no effective time limit",
    # so there is nothing useful to report -- confirm against the scheduler.
    if minutes_left > 1e12:
        return
    # Round once on the total number of seconds and then split. Rounding the
    # fractional minute on its own can produce 60, which would be logged as
    # e.g. "01 min : 60 sec" instead of "02 min : 00 sec".
    total_seconds = round(minutes_left * 60)
    whole_minutes, whole_seconds = divmod(total_seconds, 60)
    time_str = f"{whole_minutes:02d} min : {whole_seconds:02d} sec remaining"
    logger.info(time_str)
def main(args, transition_pool, runner_group, job_source):
delay_timer = delay()
elapsed_min = elapsed_time_minutes()
logger.debug(f"time limit provided {args.time_limit_minutes}")
last_created = 0.0
if args.time_limit_minutes > 0:
def timeout():
elapsed = next(elapsed_min)
logger.debug(f"{elapsed} minutes elapsed out of {args.time_limit_minutes}")
return elapsed >= args.time_limit_minutes
else:
timeout = lambda : scheduler.remaining_time_seconds() <= 0.0
while not timeout():
delay_sleeper = delay_generator()
runner_create_period = settings.BALSAM_RUNNER_CREATION_PERIOD_SEC
last_runner_created = time.time()
remaining_timer = remaining_time_minutes(args.time_limit_minutes)
for remaining_minutes in remaining_timer:
logger.info("\n******************\n"
"BEGIN SERVICE LOOP\n"
"******************")
wait = True
for stat in transition_pool.get_statuses(): wait = False
log_time(remaining_minutes)
delay = True
# Update after any finished transitions
for stat in transition_pool.get_statuses(): delay = False
job_source.refresh_from_db()
waiting_jobs = (j for j in job_source.jobs if
j.state in 'CREATED AWAITING_PARENTS LAUNCHER_QUEUED'.split())
for job in waiting_jobs: check_parents(job, transition_pool.lock)
# Update jobs awaiting dependencies
waiting_jobs = (j for j in job_source.jobs if j.state in WAITING_STATES)
for job in waiting_jobs:
check_parents(job, transition_pool.lock)
# Enqueue new transitions
transitionable_jobs = [
job for job in job_source.jobs
if job not in transition_pool
and job.state in transitions.TRANSITIONS
]
for job in transitionable_jobs:
transition_pool.add_job(job)
wait = False
delay = False
fxn = transitions.TRANSITIONS[job.state]
logger.info(f"Queued transition: {job.cute_id} will undergo {fxn}")
# Update jobs that are running/finished
any_finished = runner_group.update_and_remove_finished()
if any_finished: wait = False
if any_finished: delay = False
job_source.refresh_from_db()
if time.time() - last_created > 5:
created = create_new_runners(job_source.jobs, runner_group, worker_group)
if created:
last_created = time.time()
wait = False
if wait: next(delay_timer)
# Decide whether or not to start a new runner
runnable_jobs = [
job for job in job_source.jobs
if job.pk not in runner_group.running_job_pks and
job.state in RUNNABLE_STATES and
job.wall_time_minutes <= remaining_minutes
]
logger.debug(f"Have {len(runnable_jobs)} runnable jobs")
almost_runnable = any(j.state in ALMOST_RUNNABLE_STATES for j in job_source.jobs)
now = time.time()
runner_ready = bool(now - last_runner_created > runner_create_period)
num_serial = len([j for j in runnable_jobs if j.num_ranks == 1])
worker = worker_group[0]
max_serial_per_ensemble = 2 * worker.num_nodes * worker.max_ranks_per_node
ensemble_ready = (num_serial >= max_serial_per_ensemble) or (num_serial == 0)
if runnable_jobs:
if runner_ready or not almost_runnable or ensemble_ready:
try:
runner_group.create_next_runner(runnable_jobs, worker_group)
except ExceededMaxRunners:
logger.info("Exceeded max concurrent runners; waiting")
except NoAvailableWorkers:
logger.info("Not enough idle workers to start any new runs")
else:
last_runner_created = now
if delay: next(delay_sleeper)
def on_exit(runner_group, transition_pool, job_source):
global HANDLING_EXIT
......@@ -146,9 +159,6 @@ def on_exit(runner_group, transition_pool, job_source):
HANDLING_EXIT = True
logger.debug("Entering on_exit cleanup function")
logger.debug("on_exit: flush job queue")
transition_pool.flush_job_queue()
logger.debug("on_exit: update/remove/timeout jobs from runner group")
runner_group.update_and_remove_finished(timeout=True)
......@@ -181,7 +191,7 @@ def get_args(inputcmd=None):
def detect_dead_runners(job_source):
for job in job_source.by_states['RUNNING']:
logger.info(f'Picked up running job {job.cute_id}: marking RESTART_READY')
logger.info(f'Picked up dead running job {job.cute_id}: marking RESTART_READY')
job.update_state('RESTART_READY', 'Detected dead runner')
if __name__ == "__main__":
......
......@@ -5,18 +5,17 @@ import logging
import django
import signal
os.environ['DJANGO_SETTINGS_MODULE'] = 'argobalsam.settings'
os.environ['DJANGO_SETTINGS_MODULE'] = 'balsam.django_config.settings'
django.setup()
logger = logging.getLogger('balsamlauncher.mpi_ensemble')
logger = logging.getLogger('balsam.launcher.mpi_ensemble')
from subprocess import Popen, STDOUT
from mpi4py import MPI
from balsamlauncher.util import cd, get_tail
from balsamlauncher.exceptions import *
from balsam.models import BalsamJob
from balsam.launcher.util import cd, get_tail
from balsam.launcher.exceptions import *
from balsam.service.models import BalsamJob
COMM = MPI.COMM_WORLD
RANK = COMM.Get_rank()
......
......@@ -24,16 +24,16 @@ from queue import Queue, Empty
from django.conf import settings
from django.db import transaction
from balsam.models import InvalidStateError
from balsamlauncher import mpi_commands
from balsamlauncher.exceptions import *
from balsamlauncher.util import cd, get_tail
from balsam.service.models import InvalidStateError
from balsam.launcher import mpi_commands
from balsam.launcher.exceptions import *
from balsam.launcher.util import cd, get_tail
import logging
logger = logging.getLogger(__name__)
from importlib.util import find_spec
MPI_ENSEMBLE_EXE = find_spec("balsamlauncher.mpi_ensemble").origin
MPI_ENSEMBLE_EXE = find_spec("balsam.launcher.mpi_ensemble").origin
class MonitorStream(Thread):
......@@ -266,7 +266,7 @@ class RunnerGroup:
# If there are not enough serial jobs; run the larger of:
# largest MPI job that fits, or the remaining serial jobs
if nserial >= nidle_ranks:
jobs = serial_jobs[:nidle_ranks] # TODO: try putting ALL serial jobs into one MPIEnsemble
jobs = serial_jobs[:2*nidle_ranks] # TODO:Expt w/ > 2 jobs per worker
assigned_workers = idle_workers
runner_class = MPIEnsembleRunner
msg = (f"Running {len(jobs)} serial jobs on {nidle_workers} workers "
......@@ -295,6 +295,7 @@ class RunnerGroup:
runner.start()
self.runners.append(runner)
for worker in assigned_workers: worker.idle = False
logger.debug(f"Using workers: {[w.id for w in assigned_workers]}")
def update_and_remove_finished(self, timeout=False):
# TODO: Benchmark performance overhead; does grouping into one
......@@ -323,6 +324,7 @@ class RunnerGroup:
raise RuntimeError(msg)
else:
self.runners.remove(runner)
logger.debug(f"Freeing workers: {[w.id for w in runner.worker_list]}")
for worker in runner.worker_list:
worker.idle = True
......
......@@ -17,13 +17,13 @@ from django.core.exceptions import ObjectDoesNotExist
from django.conf import settings
from django import db
from common import transfer
from balsamlauncher.exceptions import *
from balsam.models import BalsamJob, NoApplication
from balsamlauncher.util import get_tail
from balsam.common import transfer
from balsam.launcher.exceptions import *
from balsam.service.models import BalsamJob, NoApplication
from balsam.launcher.util import get_tail
import logging
logger = logging.getLogger('balsamlauncher.transitions')
logger = logging.getLogger('balsam.launcher.transitions')
# SQLite exclusive lock is broken on Windows & OSX; even with two writers, two
# records, and a long timeout, a "database locked" exception is thrown
......@@ -50,7 +50,6 @@ POSTPROCESS_TIMEOUT_SECONDS = 300
SITE = settings.BALSAM_SITE
StatusMsg = namedtuple('StatusMsg', ['pk', 'state', 'msg'])
JobMsg = namedtuple('JobMsg', ['priority', 'jobid'])
def on_exit():
logger.debug("TransitionProc caught SIGTERM: do nothing and wait for end")
......@@ -85,6 +84,7 @@ def main(job_queue, status_queue, lock):
lock.acquire()
job.update_state('FAILED', str(e))
lock.release()
s = StatusMsg(job.pk, 'FAILED', str(e))
status_queue.put(s)
buf = StringIO()
......@@ -141,8 +141,8 @@ class TransitionProcessPool:
if job.state not in TRANSITIONS: raise TransitionNotFoundError
priority = PRIORITIES[job.state]
m = JobMsg(priority, job.pk)
self.job_queue.put(m)
job_msg = (priority, job.pk)
self.job_queue.put(job_msg)
self.transitions_pk_list.append(job.pk)
def get_statuses(self):
......@@ -154,21 +154,12 @@ class TransitionProcessPool:
except queue.Empty:
break
def flush_job_queue(self):
return
while not self.job_queue.empty():
try:
self.job_queue.get_nowait()
except queue.Empty:
break
logger.debug("Flushed transition process job queue")
def end_and_wait(self):
priority = PRIORITIES['end']
m = JobMsg(priority, 'end')
job_msg = (priority, 'end')
logger.debug("Sending end message and waiting on transition processes")
for proc in self.procs:
self.job_queue.put(m)
self.job_queue.put(job_msg)
for proc in self.procs:
proc.join()
self.pqueue_manager.shutdown()
......@@ -457,8 +448,8 @@ PRIORITIES = {
'end' : -1,
'READY': 0,
'STAGED_IN': 0,
'RUN_DONE': 1,
'RUN_TIMEOUT': 1,
'RUN_ERROR': 1,
'POSTPROCESSED': 1,
'RUN_DONE': 2,
'POSTPROCESSED': 2,
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment