Commit 7fbe0a4b authored by Paul Rich's avatar Paul Rich
Browse files

Merge branch '98-fix-forker-LOC' into 'develop'

fix for bug where commuincation interruption to a forker would kill its tasks

Closes #98

See merge request !59
parents 89376c7f f03d4acb
...@@ -159,7 +159,6 @@ class ProcessGroupManager(object): #degenerate with ProcessMonitor. ...@@ -159,7 +159,6 @@ class ProcessGroupManager(object): #degenerate with ProcessMonitor.
completed_pgs = [] completed_pgs = []
now = int(time.time()) now = int(time.time())
for forker in self.forkers: for forker in self.forkers:
completed[forker] = []
try: try:
child_data = ComponentProxy(forker).get_children("process group", None) child_data = ComponentProxy(forker).get_children("process group", None)
except ComponentLookupError, e: except ComponentLookupError, e:
...@@ -168,37 +167,39 @@ class ProcessGroupManager(object): #degenerate with ProcessMonitor. ...@@ -168,37 +167,39 @@ class ProcessGroupManager(object): #degenerate with ProcessMonitor.
_logger.error("unexpected exception while getting a list of children from the %s component", _logger.error("unexpected exception while getting a list of children from the %s component",
forker, exc_info=True) forker, exc_info=True)
else: else:
completed[forker] = []
for child in child_data: for child in child_data:
children[(forker, child['id'])] = child children[(forker, child['id'])] = child
#clean up orphaned process groups #clean up orphaned process groups
for pg in self.process_groups.values(): for pg in self.process_groups.values():
if now < pg.startup_timeout: if pg.forker in completed:
#wait for startup timeout. We don't want any hasty kills if now < pg.startup_timeout:
continue #wait for startup timeout. We don't want any hasty kills
pg_id = pg.id
child_uid = (pg.forker, pg.head_pid)
if child_uid not in children:
if pg.mode == 'interactive':
#interactive job, there is no child job
if pg.interactive_complete:
completed_pgs.append(pg)
#not really orphaned, but this causes the proper cleanup
#to occur
orphaned.append(pg_id)
continue continue
orphaned.append(pg_id) pg_id = pg.id
_logger.warning('%s: orphaned job exited with unknown status', pg.jobid) child_uid = (pg.forker, pg.head_pid)
pg.exit_status = 1234567 if child_uid not in children:
completed_pgs.append(pg) if pg.mode == 'interactive':
else: #interactive job, there is no child job
children[child_uid]['found'] = True if pg.interactive_complete:
pg.update_data(children[child_uid]) completed_pgs.append(pg)
if pg.exit_status is not None: #not really orphaned, but this causes the proper cleanup
_logger.info('%s: job exited with status %s', pg.jobid, #to occur
pg.exit_status) orphaned.append(pg_id)
completed[pg.forker].append(children[child_uid]['id']) continue
orphaned.append(pg_id)
_logger.warning('%s: orphaned job exited with unknown status', pg.jobid)
pg.exit_status = 1234567
completed_pgs.append(pg) completed_pgs.append(pg)
else:
children[child_uid]['found'] = True
pg.update_data(children[child_uid])
if pg.exit_status is not None:
_logger.info('%s: job exited with status %s', pg.jobid,
pg.exit_status)
completed[pg.forker].append(children[child_uid]['id'])
completed_pgs.append(pg)
#check for children without process groups and clean #check for children without process groups and clean
for forker, child_id in children.keys(): for forker, child_id in children.keys():
if not children[(forker, child_id)].has_key('found'): if not children[(forker, child_id)].has_key('found'):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment