Commit ab82997c authored by Paul Rich
Browse files

Resources now go idle when there are other reservations on the system

Resources weren't actually exiting the cleanup state when there were
other reservations on the system.  The check to mark nodes as idle was
not actually occurring when a reservation existed.
parent e73f7cc1
...@@ -71,11 +71,8 @@ def reserve(user, jobid, nodecount, attributes=None, node_id_list=None): ...@@ -71,11 +71,8 @@ def reserve(user, jobid, nodecount, attributes=None, node_id_list=None):
params[key] = val params[key] = val
if node_id_list is not None: if node_id_list is not None:
params['node_list'] = [int(i) for i in node_id_list] params['node_list'] = [int(i) for i in node_id_list]
_logger.debug('reserve request: %s', str(BasilRequest('RESERVE',
params=params)))
retval = _call_sys_forker(BASIL_PATH, str(BasilRequest('RESERVE', retval = _call_sys_forker(BASIL_PATH, str(BasilRequest('RESERVE',
params=params))) params=params)))
_logger.debug('reserve return %s', retval)
return retval return retval
def release(alps_res_id): def release(alps_res_id):
......
...@@ -258,34 +258,22 @@ class CraySystem(BaseSystem): ...@@ -258,34 +258,22 @@ class CraySystem(BaseSystem):
#determine if summary may be used under normal operation #determine if summary may be used under normal operation
#updated for >= 1.6 interface #updated for >= 1.6 interface
inven_nodes = ALPSBridge.extract_system_node_data(ALPSBridge.system()) inven_nodes = ALPSBridge.extract_system_node_data(ALPSBridge.system())
#inventory = ALPSBridge.system()
reservations = ALPSBridge.fetch_reservations() reservations = ALPSBridge.fetch_reservations()
#reserved_nodes = ALPSBridge.reserved_nodes() #reserved_nodes = ALPSBridge.reserved_nodes()
except (ALPSBridge.ALPSError, ComponentLookupError): except (ALPSBridge.ALPSError, ComponentLookupError):
_logger.warning('Error contacting ALPS for state update. Aborting this update', _logger.warning('Error contacting ALPS for state update. Aborting this update',
exc_info=True) exc_info=True)
return return
inven_reservations = reservations.get('reservations', []) # no reservations will be blank inven_reservations = reservations.get('reservations', [])
fetch_time_start = time.time() fetch_time_start = time.time()
_logger.debug("time in ALPS fetch: %s seconds", (time.time() - fetch_time_start)) _logger.debug("time in ALPS fetch: %s seconds", (time.time() - fetch_time_start))
start_time = time.time() start_time = time.time()
# if node.status not in ['cleanup', 'cleanup-pending']:
# node.status = 'idle'
# check our reservation objects. If a res object doesn't correspond # check our reservation objects. If a res object doesn't correspond
# to any backend reservations, this reservation object should be # to any backend reservations, this reservation object should be
# dropped # dropped
alps_res_to_delete = [] alps_res_to_delete = []
current_alps_res_ids = [int(res['reservation_id']) for res in current_alps_res_ids = [int(res['reservation_id']) for res in
inven_reservations] inven_reservations]
for alps_res in self.alps_reservations.values():
if not alps_res.alps_res_id in current_alps_res_ids:
alps_res_to_delete.append(alps_res)
for res in alps_res_to_delete:
_logger.warning('Deleting orphaned ALPS reservation %s',
res.alps_res_id)
del self.alps_reservations[str(res.jobid)]
# Check our reservations. If it's ID is not in the inventory, then the
# nodes need to be returned to the pool. Give them the 'idle' state
res_jobid_to_delete = [] res_jobid_to_delete = []
if self.alps_reservations == {}: if self.alps_reservations == {}:
# if we have nodes in cleanup-pending but no alps reservations, # if we have nodes in cleanup-pending but no alps reservations,
...@@ -295,14 +283,9 @@ class CraySystem(BaseSystem): ...@@ -295,14 +283,9 @@ class CraySystem(BaseSystem):
for node in self.nodes.values(): for node in self.nodes.values():
if node.status in ['cleanup', 'cleanup-pending']: if node.status in ['cleanup', 'cleanup-pending']:
node.status = 'idle' node.status = 'idle'
for alps_res in self.alps_reservations.values(): for alps_res in self.alps_reservations.values():
#find alps_id associated reservation #find alps_id associated reservation
if int(alps_res.alps_res_id) not in current_alps_res_ids: if int(alps_res.alps_res_id) not in current_alps_res_ids:
#for res_info in inven_reservations:
#if int(alps_res.alps_res_id) == int(res_info['reservation_id']):
# found = True
#if not found:
for node_id in alps_res.node_ids: for node_id in alps_res.node_ids:
if not self.nodes[str(node_id)].reserved: if not self.nodes[str(node_id)].reserved:
#pending hardware status update #pending hardware status update
...@@ -311,6 +294,15 @@ class CraySystem(BaseSystem): ...@@ -311,6 +294,15 @@ class CraySystem(BaseSystem):
for jobid in res_jobid_to_delete: for jobid in res_jobid_to_delete:
_logger.info('%s: ALPS reservation for this job complete.', jobid) _logger.info('%s: ALPS reservation for this job complete.', jobid)
del self.alps_reservations[str(jobid)] del self.alps_reservations[str(jobid)]
# Check our reservations. If it's ID is not in the inventory, then the
# nodes need to be returned to the pool. Give them the 'idle' state
for alps_res in self.alps_reservations.values():
if not alps_res.alps_res_id in current_alps_res_ids:
alps_res_to_delete.append(alps_res)
for res in alps_res_to_delete:
_logger.warning('Deleting orphaned ALPS reservation %s',
res.alps_res_id)
del self.alps_reservations[str(res.jobid)]
#process group should already be on the way down since cqm released the #process group should already be on the way down since cqm released the
#resource reservation #resource reservation
cleanup_nodes = [node for node in self.nodes.values() cleanup_nodes = [node for node in self.nodes.values()
...@@ -672,7 +664,7 @@ class CraySystem(BaseSystem): ...@@ -672,7 +664,7 @@ class CraySystem(BaseSystem):
self.logger.info("job %s: nodes '%s' released. Cleanup pending.", self.logger.info("job %s: nodes '%s' released. Cleanup pending.",
jobid, compact_num_list(succeeded_nodes)) jobid, compact_num_list(succeeded_nodes))
if failed_nodes != []: if failed_nodes != []:
self.logger.warning("job %s: failed to reserve nodes '%s'", self.logger.warning("job %s: failed to release nodes '%s'",
jobid, compact_num_list(failed_nodes)) jobid, compact_num_list(failed_nodes))
else: else:
completed = True completed = True
......
...@@ -145,6 +145,7 @@ class Resource(object): ...@@ -145,6 +145,7 @@ class Resource(object):
if not self.reserved: if not self.reserved:
_logger.warning('Release of already free resource %s attempted.' \ _logger.warning('Release of already free resource %s attempted.' \
' Release ignored.', self.name) ' Release ignored.', self.name)
released = True
elif (force or (user == self.reserved_by or elif (force or (user == self.reserved_by or
jobid == self.reserved_jobid)): jobid == self.reserved_jobid)):
if force: if force:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment