Commit 6e22729b authored by Michael Salim's avatar Michael Salim

working on launcher

parent 185c199a
......@@ -6,3 +6,5 @@ class NoAvailableWorkers(BalsamRunnerException): pass
# Exception hierarchy for Balsam transition / MPI-ensemble errors.
class BalsamTransitionError(Exception): pass

# NOTE(review): the original declared the base as "BalsamTransitionException",
# which is not defined anywhere in view; BalsamTransitionError (above) appears
# to be the intended base. Confirm no other module defines
# BalsamTransitionException before relying on this fix.
class TransitionNotFoundError(BalsamTransitionError, ValueError): pass

class MPIEnsembleError(Exception): pass
from collections import defaultdict
import balsam.models
from balsam.models import BalsamJob
......
......@@ -2,22 +2,19 @@
scheduling service and submits directly to a local job queue, or by the
Balsam service metascheduler'''
import argparse
from collections import defaultdict
import os
import multiprocessing
import queue
from sys import exit
import signal
import time
import django
from django.conf import settings
from django.db import transaction
from balsam import scheduler
from balsam.launcher import jobreader
from balsam.launcher import transitions
from balsam.launcher import worker
from balsam.launcher import runners
from balsam.launcher.exceptions import *
START_TIME = time.time() + 10.0
......@@ -35,6 +32,7 @@ def delay(period=settings.BALSAM_SERVICE_PERIOD):
nexttime = now + tosleep + period
yield
class HostEnvironment:
'''Set user- and environment-specific settings for this run'''
RECOGNIZED_HOSTS = {
......@@ -109,6 +107,7 @@ def get_runnable_jobs(jobs, running_pks, host_env):
return runnable_jobs
def create_new_runners(jobs, runner_group, worker_group, host_env):
created_one = False
running_pks = runner_group.running_job_pks
runnable_jobs = get_runnable_jobs(jobs, running_pks, host_env)
while runnable_jobs:
......@@ -117,8 +116,11 @@ def create_new_runners(jobs, runner_group, worker_group, host_env):
except (ExceededMaxRunners, NoAvailableWorkers) as e:
break
else:
created_one = True
running_pks = runner_group.running_job_pks
runnable_jobs = get_runnable_jobs(jobs, running_pks, host_env)
return created_one
def main(args, transition_pool, runner_group, job_source):
host_env = HostEnvironment(args)
......@@ -144,8 +146,8 @@ def main(args, transition_pool, runner_group, job_source):
wait = False
any_finished = runner_group.update_and_remove_finished()
create_new_runners(job_source.jobs, runner_group, worker_group, host_env)
if any_finished: wait = False
created = create_new_runners(job_source.jobs, runner_group, worker_group, host_env)
if any_finished or created: wait = False
if wait: next(delay_timer)
def on_exit(runner_group, transition_pool, job_source):
......@@ -181,6 +183,9 @@ def get_args():
forever if no limit is detected or specified)")
return parser.parse_args()
def detect_dead_runners(job_source):
    '''Flag every job still marked RUNNING for restart.

    Called once at launcher startup: any job in RUNNING state at that point
    was orphaned by a previous launcher, so its runner must be dead.
    '''
    dead_jobs = job_source.by_states['RUNNING']
    for dead_job in dead_jobs:
        dead_job.update_state('RESTART_READY', 'Detected dead runner')
if __name__ == "__main__":
os.environ['DJANGO_SETTINGS_MODULE'] = 'argobalsam.settings'
......@@ -188,9 +193,12 @@ if __name__ == "__main__":
args = get_args()
job_source = jobreader.JobReader.from_config(args)
job_source.refresh_from_db()
runner_group = runners.RunnerGroup()
transition_pool = transitions.TransitionProcessPool()
detect_dead_runners(job_source)
handl = lambda a,b: on_exit(runner_group, transition_pool, job_source)
signal.signal(signal.SIGINT, handl)
signal.signal(signal.SIGTERM, handl)
......
......@@ -6,8 +6,7 @@ from subprocess import Popen, STDOUT
from mpi4py import MPI
from balsam.launcher.runners import cd
class MPIEnsembleError(Exception): pass
from balsam.launcher.exceptions import *
COMM = MPI.COMM_WORLD
RANK = COMM.Get_rank()
......
'''A Runner is constructed with a list of jobs and a list of idle workers. It
creates and monitors the execution subprocess, updating job states in the DB as
necessary. RunnerGroup contains the list of Runner objects, logic for creating
the next Runner (i.e. assigning jobs to nodes), and the public interface'''
necessary. RunnerGroup has a collection of Runner objects, logic for creating
the next Runner (i.e. assigning jobs to nodes), and the public interface to
monitor runners'''
import functools
from math import ceil
......@@ -15,6 +16,7 @@ from threading import Thread
from queue import Queue, Empty
from django.conf import settings
from django.db import transaction
import balsam.models
from balsam.launcher import mpi_commands
......@@ -197,15 +199,18 @@ class RunnerGroup:
rpw = workers[0].ranks_per_worker
assert all(w.ranks_per_worker == rpw for w in idle_workers)
serial_jobs = [j for j in runnable_jobs if j.num_nodes == 1 and
j.processes_per_node == 1]
serial_jobs = [j for j in runnable_jobs
if j.num_nodes == 1 and j.processes_per_node == 1]
nserial = len(serial_jobs)
mpi_jobs = [j for j in runnable_jobs if 1 < j.num_nodes <= nidle or
(1==j.num_nodes<=nidle and j.processes_per_node > 1)]
(1==j.num_nodes<=nidle and j.processes_per_node > 1)]
largest_mpi_job = (max(mpi_jobs, key=lambda job: job.num_nodes)
if mpi_jobs else None)
# Try to fill all available nodes with serial ensemble runner
# If there are not enough serial jobs; run the larger of:
# largest MPI job that fits, or the remaining serial jobs
if nserial >= nidle*rpw:
jobs = serial_jobs[:nidle*rpw]
assigned_workers = idle_workers
......@@ -243,5 +248,4 @@ class RunnerGroup:
@property
def running_job_pks(self):
active_runners = [r for r in self.runners if not r.finished()]
return [j.pk for runner in active_runners for j in runner.jobs]
return [j.pk for runner in self.runners for j in runner.jobs]
This diff is collapsed.
......@@ -137,15 +137,15 @@ class BalsamJob(models.Model):
input_files = models.TextField(
'Input File Patterns',
help_text="A string of filename patterns that will be searched in the parents'"\
help_text="Space-delimited filename patterns that will be searched in the parents'"\
"working directories. Every matching file will be made available in this"\
"job's working directory (symlinks for local Balsam jobs, file transfer for"\
"remote Balsam jobs). Default: all files from parent jobs are made available.",
default='*')
stage_in_urls = models.TextField(
stage_in_url = models.TextField(
'External stage in files or folders', help_text="A list of URLs for external data to be staged in prior to job processing. Job dataflow from parents to children is NOT handled here; see `input_files` field instead.",
default='')
stage_out_files = models.TextField(
stage_out_files = models.TextField
'External stage out files or folders',
help_text="A string of filename patterns. Matches will be transferred to the stage_out_url. Default: no files are staged out",
default='')
......@@ -154,7 +154,7 @@ class BalsamJob(models.Model):
help_text='The URLs to which designated stage out files are sent.',
default='')
requested_wall_time_minutes = models.IntegerField(
wall_time_minutes = models.IntegerField(
'Job Wall Time in Minutes',
help_text='The number of minutes the job is expected to take',
default=1)
......@@ -215,6 +215,21 @@ class BalsamJob(models.Model):
help_text='A script that is run in a job working directory after the job has completed.'
' If blank, will default to the default_postprocess script defined for the application.',
default='')
post_error_handler = models.BooleanField(
'Let postprocesser try to handle RUN_ERROR',
help_text='If true, the postprocessor will be invoked for RUN_ERROR jobs'
' and it is up to the script to handle error and update job state.',
default=False)
post_timeout_handler = models.BooleanField(
'Let postprocesser try to handle RUN_TIMEOUT',
help_text='If true, the postprocessor will be invoked for RUN_TIMEOUT jobs'
' and it is up to the script to handle timeout and update job state.',
default=False)
auto_timeout_retry = models.BooleanField(
'Automatically restart jobs that have timed out',
help_text="If True and post_timeout_handler is False, then jobs will "
"simply be marked RESTART_READY upon timing out.",
default=True)
state = models.TextField(
'Job State',
......@@ -246,13 +261,13 @@ description: {self.description[:50]}
working_directory: {self.working_directory}
parents: {self.parents}
input_files: {self.input_files}
stage_in_urls: {self.stage_in_urls}
stage_in_url: {self.stage_in_url}
stage_out_files: {self.stage_out_files}
stage_out_urls: {self.stage_out_urls}
wall_time_minutes: {self.wall_time_minutes}
num_nodes: {self.num_nodes}
processes_per_node: {self.processes_per_node}
scheduler_id: {self.scheduler_id}
scheduler_id: {self.scheduler_id}
runtime_seconds: {self.runtime_seconds}
application: {self.application}
'''
......@@ -266,6 +281,13 @@ application: {self.application}
parent_ids = self.get_parents_by_id()
return BalsamJob.objects.filter(job_id__in=parent_ids)
def get_children(self):
    '''Return a queryset of every BalsamJob whose parents field mentions
    this job's pk (substring match on the JSON-encoded parents list).'''
    my_pk = str(self.pk)
    return BalsamJob.objects.filter(parents__icontains=my_pk)
def get_children_by_id(self):
    '''Return the primary keys of this job's child jobs as a list.'''
    return [child.pk for child in self.get_children()]
def set_parents(self, parents):
try:
parents_list = list(parents)
......@@ -279,6 +301,40 @@ application: {self.application}
self.parents = json.dumps(parents_list)
self.save(update_fields=['parents'])
def get_application(self):
    '''Look up the ApplicationDefinition this job refers to by name.

    Returns None when no application is set on the job.'''
    app_name = self.application
    return (ApplicationDefinition.objects.get(name=app_name)
            if app_name else None)
@staticmethod
def parse_envstring(s):
    '''Parse a colon-separated string of VAR=value pairs into a dict.

    e.g. "VAR1=a:VAR2=b" -> {'VAR1': 'a', 'VAR2': 'b'}.

    Fixes over the original: the unused `result = {}` local is gone; empty
    entries are skipped (so "" yields {} instead of raising ValueError);
    and each entry is split only on its FIRST '=', so values may themselves
    contain '=' characters.
    '''
    result = {}
    for entry in s.split(':'):
        entry = entry.strip()
        if not entry:
            continue  # tolerate empty string and stray colons
        variable, _, value = entry.partition('=')
        result[variable] = value
    return result
def get_envs(self, *, timeout=False, error=False):
    '''Build the environment dict for this job's subprocess.

    Starts from a copy of the current process environment, layers on any
    application-level then job-level VAR=value settings (job settings win),
    and finally adds the BALSAM_* bookkeeping variables.  The keyword-only
    `timeout`/`error` flags are surfaced to the child process as
    BALSAM_JOB_TIMEOUT / BALSAM_JOB_ERROR.
    '''
    environment = os.environ.copy()

    # Application-wide env settings, if an application is attached.
    application = self.get_application()
    if application and application.environ_vars:
        environment.update(self.parse_envstring(application.environ_vars))

    # Job-specific env settings override application-level ones.
    if self.environ_vars:
        environment.update(self.parse_envstring(self.environ_vars))

    child_pks = json.dumps([str(pk) for pk in self.get_children_by_id()])
    environment.update({
        'BALSAM_JOB_ID': str(self.pk),
        'BALSAM_PARENT_IDS': str(self.parents),
        'BALSAM_CHILD_IDS': child_pks,
        'BALSAM_JOB_TIMEOUT': str(timeout),
        'BALSAM_JOB_ERROR': str(error),
    })
    return environment
def update_state(self, new_state, message=''):
if new_state not in STATES:
raise InvalidStateError(f"{new_state} is not a job state in balsam.models")
......@@ -301,9 +357,10 @@ application: {self.application}
top = os.path.join(top, self.workflow)
name = self.name.replace(' ', '_')
path = os.path.join(top, name)
if os.path.exists(path): path += "_"
for char in str(self.job_id):
if not os.path.exists(path): break
path += char
if not os.path.exists(path): break
os.makedirs(path)
self.working_directory = path
self.save(update_fields=['working_directory'])
......@@ -340,6 +397,10 @@ class ApplicationDefinition(models.Model):
'Postprocessing Script',
help_text='A script that is run in a job working directory after the job has completed.',
default='')
environ_vars = models.TextField(
'Environment variables specific to this application',
help_text="Colon-separated list of envs like VAR1=value2:VAR2=value2",
default='')
def __str__(self):
s = 'Application: ' + self.name + '\n'
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment