Commit ab05fbd8 authored by Michael Salim

major reorganization; added setuptools and sphinx docs folder

parent 75d8ac3b
...@@ -8,6 +8,8 @@ balsamjobs ...@@ -8,6 +8,8 @@ balsamjobs
argobalsam_env argobalsam_env
env env
db.sqlite3 db.sqlite3
argo/migrations balsam/argo/migrations
balsam/migrations balsam/service/migrations
experiments experiments
docs/_build/*
*.egg-info
include README.md
include LICENSE.md
include docs
# HPC Edge Service and Workflow Management System # HPC Edge Service and Workflow Management System
**Authors:** J. Taylor Childers (Argonne National Laboratory), Tom Uram (Argonne National Laboratory), Doug Benjamin (Duke University), Misha Salim (Argonne National Laboratory) **Authors:** J. Taylor Childers (Argonne National Laboratory), Tom Uram (Argonne National Laboratory), Doug Benjamin (Duke University), Misha Salim (Argonne National Laboratory)
An HPC Edge Service to manage remote job submission. The goal of this service is to provide a secure interface for submitting jobs to large computing resources.
# Prerequisites # Prerequisites
The Argo and Balsam services require Python 3.6, mpi4py, Django, and django-concurrency. The Argo and Balsam services require Python 3.6, mpi4py, Django, and django-concurrency.
......
import common.Serializer as Serializer import balsam.common.Serializer as Serializer
class ArgoJobStatus: class ArgoJobStatus:
def __init__(self): def __init__(self):
......
...@@ -6,9 +6,9 @@ from django.db.utils import load_backend ...@@ -6,9 +6,9 @@ from django.db.utils import load_backend
from django.conf import settings from django.conf import settings
from common import MessageReceiver from common import MessageReceiver
from argo import QueueMessage from balsam.argo import QueueMessage
from argo.models import ArgoJob,ArgoSubJob,BALSAM_JOB_TO_SUBJOB_STATE_MAP from balsam.argo.models import ArgoJob,ArgoSubJob,BALSAM_JOB_TO_SUBJOB_STATE_MAP
from balsam import BalsamJobStatus,models from balsam.service import BalsamJobStatus,models
class JobStatusReceiver(MessageReceiver.MessageReceiver): class JobStatusReceiver(MessageReceiver.MessageReceiver):
''' subscribes to the balsam job status queue and updates a job state ''' ''' subscribes to the balsam job status queue and updates a job state '''
......
...@@ -5,9 +5,9 @@ from django.db import connections,DEFAULT_DB_ALIAS ...@@ -5,9 +5,9 @@ from django.db import connections,DEFAULT_DB_ALIAS
from django.db.utils import load_backend from django.db.utils import load_backend
from django.conf import settings from django.conf import settings
from argo import models,QueueMessage from balsam.argo import models,QueueMessage
from common import db_tools from balsam.common import db_tools
from common import MessageReceiver,Serializer from balsam.common import MessageReceiver,Serializer
def CreateWorkingPath(job_id): def CreateWorkingPath(job_id):
path = os.path.join(settings.ARGO_WORK_DIRECTORY,str(job_id)) path = os.path.join(settings.ARGO_WORK_DIRECTORY,str(job_id))
......
...@@ -4,4 +4,4 @@ from django.apps import AppConfig ...@@ -4,4 +4,4 @@ from django.apps import AppConfig
class ArgoCoreConfig(AppConfig): class ArgoCoreConfig(AppConfig):
name = 'argo' name = 'balsam.argo'
...@@ -18,7 +18,7 @@ import warnings ...@@ -18,7 +18,7 @@ import warnings
from django import forms from django import forms
from django.forms.widgets import CheckboxInput from django.forms.widgets import CheckboxInput
from argo import models from balsam.argo import models
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
......
...@@ -9,11 +9,11 @@ from django.core.exceptions import ObjectDoesNotExist ...@@ -9,11 +9,11 @@ from django.core.exceptions import ObjectDoesNotExist
from django.conf import settings from django.conf import settings
from django.core.validators import validate_comma_separated_integer_list from django.core.validators import validate_comma_separated_integer_list
from argo import QueueMessage,ArgoJobStatus from balsam.argo import QueueMessage,ArgoJobStatus
from common import log_uncaught_exceptions,MessageInterface from balsam.common import log_uncaught_exceptions,MessageInterface
from common import Serializer,transfer,Mail,db_tools from balsam.common import Serializer,transfer,Mail,db_tools
from balsam.models import BalsamJob from balsam.service.models import BalsamJob
from balsam.models import STATES_BY_NAME as BALSAM_STATES_BY_NAME from balsam.service.models import STATES_BY_NAME as BALSAM_STATES_BY_NAME
# assign this function to the system exception hook # assign this function to the system exception hook
sys.excepthook = log_uncaught_exceptions.log_uncaught_exceptions sys.excepthook = log_uncaught_exceptions.log_uncaught_exceptions
...@@ -185,7 +185,7 @@ def send_status_message(job,message=None): ...@@ -185,7 +185,7 @@ def send_status_message(job,message=None):
# ------------ Job States ---------------------------- # ------------ Job States ----------------------------
from common.JobState import JobState from balsam.common.JobState import JobState
# Job States # Job States
CREATE_FAILED = JobState('CREATE_FAILED') CREATE_FAILED = JobState('CREATE_FAILED')
......
...@@ -2,7 +2,7 @@ from django.conf.urls import url ...@@ -2,7 +2,7 @@ from django.conf.urls import url
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from argo import views from balsam.argo import views
local_urlpatterns = [ local_urlpatterns = [
url(r'^$', views.index, name='index'), url(r'^$', views.index, name='index'),
......
import os,logging,sys import os,logging,sys
from django.shortcuts import render,get_object_or_404 from django.shortcuts import render,get_object_or_404
from argo.html_forms import JobDisplayForm from balsam.argo.html_forms import JobDisplayForm
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from argo import models from balsam.argo import models
# Create your views here. # Create your views here.
......
from common.file_tools import delete_old_files_directories from balsam.common.file_tools import delete_old_files_directories
import time import time
class DirCleaner: class DirCleaner:
......
from common import PikaMessageInterface, NoMessageInterface from balsam.common import PikaMessageInterface, NoMessageInterface
from django.conf import settings from django.conf import settings
import logging,sys,multiprocessing,time,os import logging,sys,multiprocessing,time,os
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
......
...@@ -3,7 +3,7 @@ import time ...@@ -3,7 +3,7 @@ import time
import threading import threading
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from common import MessageInterface from balsam.common import MessageInterface
class NoMessageInterface(MessageInterface.MessageInterface): class NoMessageInterface(MessageInterface.MessageInterface):
......
...@@ -5,7 +5,7 @@ import logging ...@@ -5,7 +5,7 @@ import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logging.getLogger('pika').setLevel(logging.WARNING) logging.getLogger('pika').setLevel(logging.WARNING)
from common import MessageInterface from balsam.common import MessageInterface
class PikaMessageInterface(MessageInterface.MessageInterface): class PikaMessageInterface(MessageInterface.MessageInterface):
......
...@@ -2,8 +2,8 @@ import multiprocessing,logging ...@@ -2,8 +2,8 @@ import multiprocessing,logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from django.db import utils,connections,DEFAULT_DB_ALIAS from django.db import utils,connections,DEFAULT_DB_ALIAS
from balsam import QueueMessage from balsam.service import QueueMessage
from common import db_tools from balsam.common import db_tools
class TransitionJob(multiprocessing.Process): class TransitionJob(multiprocessing.Process):
''' spawns subprocess which finds the DB entry for the given id ''' spawns subprocess which finds the DB entry for the given id
......
...@@ -12,10 +12,7 @@ https://docs.djangoproject.com/en/1.9/ref/settings/ ...@@ -12,10 +12,7 @@ https://docs.djangoproject.com/en/1.9/ref/settings/
import os,logging import os,logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from balsam.user_settings import *
logger.info('here')
from user_settings import *
...@@ -38,7 +35,7 @@ ALLOWED_HOSTS = [] ...@@ -38,7 +35,7 @@ ALLOWED_HOSTS = []
# Application definition # Application definition
INSTALLED_APPS = [ INSTALLED_APPS = [
'balsam.apps.BalsamCoreConfig', 'balsam.service.apps.BalsamCoreConfig',
'django.contrib.admin', 'django.contrib.admin',
'django.contrib.auth', 'django.contrib.auth',
'django.contrib.contenttypes', 'django.contrib.contenttypes',
...@@ -58,7 +55,7 @@ MIDDLEWARE_CLASSES = [ ...@@ -58,7 +55,7 @@ MIDDLEWARE_CLASSES = [
'django.middleware.clickjacking.XFrameOptionsMiddleware', 'django.middleware.clickjacking.XFrameOptionsMiddleware',
] ]
ROOT_URLCONF = 'argobalsam.urls' ROOT_URLCONF = 'balsam.django_config.urls'
TEMPLATES = [ TEMPLATES = [
{ {
...@@ -76,7 +73,7 @@ TEMPLATES = [ ...@@ -76,7 +73,7 @@ TEMPLATES = [
}, },
] ]
WSGI_APPLICATION = 'argobalsam.wsgi.application' WSGI_APPLICATION = 'balsam.django_config.wsgi.application'
......
...@@ -17,6 +17,5 @@ from django.conf.urls import url,include ...@@ -17,6 +17,5 @@ from django.conf.urls import url,include
from django.contrib import admin from django.contrib import admin
urlpatterns = [ urlpatterns = [
#url(r'^argo/',include('argo.urls')),
url(r'^admin/', admin.site.urls), url(r'^admin/', admin.site.urls),
] ]
...@@ -11,6 +11,6 @@ import os ...@@ -11,6 +11,6 @@ import os
from django.core.wsgi import get_wsgi_application from django.core.wsgi import get_wsgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "argobalsam.settings") os.environ.setdefault("DJANGO_SETTINGS_MODULE", "balsam.django_config.settings")
application = get_wsgi_application() application = get_wsgi_application()
'''Python API for Balsam DAG Manipulations '''Python API for Balsam DAG Manipulations
Example usage: Example usage:
>>> import balsamlauncher.dag as dag >>> import launcher.dag as dag
>>> >>>
>>> output = open('expected_output').read() >>> output = open('expected_output').read()
>>> >>>
...@@ -18,8 +18,8 @@ Example usage: ...@@ -18,8 +18,8 @@ Example usage:
>>> >>>
''' '''
import django as django import django
import os as os import os
import uuid import uuid
__all__ = ['JOB_ID', 'TIMEOUT', 'ERROR', __all__ = ['JOB_ID', 'TIMEOUT', 'ERROR',
...@@ -27,10 +27,10 @@ __all__ = ['JOB_ID', 'TIMEOUT', 'ERROR', ...@@ -27,10 +27,10 @@ __all__ = ['JOB_ID', 'TIMEOUT', 'ERROR',
'add_job', 'add_dependency', 'spawn_child', 'add_job', 'add_dependency', 'spawn_child',
'kill'] 'kill']
os.environ['DJANGO_SETTINGS_MODULE'] = 'argobalsam.settings' os.environ['DJANGO_SETTINGS_MODULE'] = 'balsam.django_config.settings'
django.setup() django.setup()
from balsam.models import BalsamJob as _BalsamJob from balsam.service.models import BalsamJob as _BalsamJob
from django.conf import settings from django.conf import settings
current_job = None current_job = None
......
from collections import defaultdict from collections import defaultdict
from django.conf import settings from django.conf import settings
import balsam.models from balsam.service import models
from balsam.models import BalsamJob BalsamJob = models.BalsamJob
import logging import logging
import uuid import uuid
...@@ -32,7 +32,7 @@ class JobReader(): ...@@ -32,7 +32,7 @@ class JobReader():
def _get_jobs(self): raise NotImplementedError def _get_jobs(self): raise NotImplementedError
def _filter(self, job_queryset): def _filter(self, job_queryset):
jobs = job_queryset.exclude(state__in=balsam.models.END_STATES) jobs = job_queryset.exclude(state__in=models.END_STATES)
jobs = jobs.filter(allowed_work_sites__icontains=settings.BALSAM_SITE) jobs = jobs.filter(allowed_work_sites__icontains=settings.BALSAM_SITE)
return jobs return jobs
......
...@@ -2,33 +2,36 @@ ...@@ -2,33 +2,36 @@
scheduling service and submits directly to a local job queue, or by the scheduling service and submits directly to a local job queue, or by the
Balsam service metascheduler''' Balsam service metascheduler'''
import argparse import argparse
from math import floor
import os import os
from sys import exit from sys import exit
import signal import signal
import time import time
import django import django
os.environ['DJANGO_SETTINGS_MODULE'] = 'argobalsam.settings' os.environ['DJANGO_SETTINGS_MODULE'] = 'balsam.django_config.settings'
django.setup() django.setup()
from django.conf import settings from django.conf import settings
import logging import logging
logger = logging.getLogger('balsamlauncher') logger = logging.getLogger('balsam.launcher')
logger.info("Loading Balsam Launcher") logger.info("Loading Balsam Launcher")
from balsam.schedulers import Scheduler from balsam.service.schedulers import Scheduler
scheduler = Scheduler.scheduler_main scheduler = Scheduler.scheduler_main
from balsamlauncher import jobreader from balsam.launcher import jobreader
from balsamlauncher import transitions from balsam.launcher import transitions
from balsamlauncher import worker from balsam.launcher import worker
from balsamlauncher import runners from balsam.launcher import runners
from balsamlauncher.exceptions import * from balsam.launcher.exceptions import *
ALMOST_RUNNABLE_STATES = ['READY','STAGED_IN']
RUNNABLE_STATES = ['PREPROCESSED', 'RESTART_READY'] RUNNABLE_STATES = ['PREPROCESSED', 'RESTART_READY']
WAITING_STATES = ['CREATED', 'LAUNCHER_QUEUED', 'AWAITING_PARENTS']
HANDLING_EXIT = False HANDLING_EXIT = False
def delay(period=settings.BALSAM_SERVICE_PERIOD): def delay_generator(period=settings.BALSAM_SERVICE_PERIOD):
nexttime = time.time() + period nexttime = time.time() + period
while True: while True:
now = time.time() now = time.time()
...@@ -45,37 +48,15 @@ def elapsed_time_minutes(): ...@@ -45,37 +48,15 @@ def elapsed_time_minutes():
while True: while True:
yield (time.time() - start) / 60.0 yield (time.time() - start) / 60.0
def sufficient_time(job): def remaining_time_minutes(time_limit_minutes=0.0):
return 60*job.wall_time_minutes < scheduler.remaining_time_seconds() elapsed_timer = elapsed_time_minutes()
while True:
def get_runnable_jobs(jobs, running_pks): if time_limit_minutes > 0.0:
runnable_jobs = [job for job in jobs remaining = time_limit_minutes - next(elapsed_timer)
if job.pk not in running_pks and
job.state in RUNNABLE_STATES and
sufficient_time(job)]
return runnable_jobs
def create_new_runners(jobs, runner_group, worker_group):
created_one = False
running_pks = runner_group.running_job_pks
runnable_jobs = get_runnable_jobs(jobs, running_pks)
while runnable_jobs:
logger.debug(f"Have {len(runnable_jobs)} new runnable jobs (out of "
f"{len(jobs)})")
try:
runner_group.create_next_runner(runnable_jobs, worker_group)
except ExceededMaxRunners:
logger.info("Exceeded max concurrent runners; waiting")
break
except NoAvailableWorkers:
logger.info("Not enough idle workers to start any new runs")
break
else: else:
created_one = True remaining = scheduler.remaining_time_seconds() / 60.0
running_pks = runner_group.running_job_pks if remaining > 0: yield remaining
runnable_jobs = get_runnable_jobs(jobs, running_pks) else: break
return created_one
def check_parents(job, lock): def check_parents(job, lock):
job.refresh_from_db() job.refresh_from_db()
...@@ -92,53 +73,85 @@ def check_parents(job, lock): ...@@ -92,53 +73,85 @@ def check_parents(job, lock):
lock.release() lock.release()
logger.info(f'{job.cute_id} waiting for parents') logger.info(f'{job.cute_id} waiting for parents')
def log_time(minutes_left):
    """Log the remaining run time as whole minutes and seconds.

    Args:
        minutes_left: Remaining time in minutes; may be fractional.
            Values above 1e12 are not logged at all.
    """
    # Skip absurdly large values instead of logging a meaningless countdown
    # (presumably a stand-in for "no time limit" -- TODO confirm with the
    # scheduler's remaining_time_seconds()).
    if minutes_left > 1e12:
        return
    # Round once on the total number of seconds and then split, so the
    # seconds field can never read 60 (1.9999 min previously produced
    # "01 min : 60 sec remaining").
    whole_minutes, whole_seconds = divmod(round(minutes_left * 60), 60)
    time_str = f"{whole_minutes:02d} min : {whole_seconds:02d} sec remaining"
    logger.info(time_str)
def main(args, transition_pool, runner_group, job_source): def main(args, transition_pool, runner_group, job_source):
delay_timer = delay()
elapsed_min = elapsed_time_minutes()
logger.debug(f"time limit provided {args.time_limit_minutes}")
last_created = 0.0
if args.time_limit_minutes > 0:
def timeout():
elapsed = next(elapsed_min)
logger.debug(f"{elapsed} minutes elapsed out of {args.time_limit_minutes}")
return elapsed >= args.time_limit_minutes
else:
timeout = lambda : scheduler.remaining_time_seconds() <= 0.0
while not timeout(): delay_sleeper = delay_generator()
runner_create_period = settings.BALSAM_RUNNER_CREATION_PERIOD_SEC
last_runner_created = time.time()
remaining_timer = remaining_time_minutes(args.time_limit_minutes)
for remaining_minutes in remaining_timer:
logger.info("\n******************\n" logger.info("\n******************\n"
"BEGIN SERVICE LOOP\n" "BEGIN SERVICE LOOP\n"
"******************") "******************")
wait = True log_time(remaining_minutes)
for stat in transition_pool.get_statuses(): wait = False delay = True
# Update after any finished transitions
for stat in transition_pool.get_statuses(): delay = False
job_source.refresh_from_db() job_source.refresh_from_db()
waiting_jobs = (j for j in job_source.jobs if
j.state in 'CREATED AWAITING_PARENTS LAUNCHER_QUEUED'.split())
for job in waiting_jobs: check_parents(job, transition_pool.lock)
# Update jobs awaiting dependencies
waiting_jobs = (j for j in job_source.jobs if j.state in WAITING_STATES)
for job in waiting_jobs:
check_parents(job, transition_pool.lock)
# Enqueue new transitions
transitionable_jobs = [ transitionable_jobs = [
job for job in job_source.jobs job for job in job_source.jobs
if job not in transition_pool if job not in transition_pool
and job.state in transitions.TRANSITIONS and job.state in transitions.TRANSITIONS
] ]
for job in transitionable_jobs: for job in transitionable_jobs:
transition_pool.add_job(job) transition_pool.add_job(job)
wait = False delay = False
fxn = transitions.TRANSITIONS[job.state] fxn = transitions.TRANSITIONS[job.state]
logger.info(f"Queued transition: {job.cute_id} will undergo {fxn}") logger.info(f"Queued transition: {job.cute_id} will undergo {fxn}")
# Update jobs that are running/finished
any_finished = runner_group.update_and_remove_finished() any_finished = runner_group.update_and_remove_finished()
if any_finished: wait = False if any_finished: delay = False
job_source.refresh_from_db() job_source.refresh_from_db()
if time.time() - last_created > 5:
created = create_new_runners(job_source.jobs, runner_group, worker_group) # Decide whether or not to start a new runner
if created: runnable_jobs = [
last_created = time.time()