Commit 4ed3c118 authored by Paul Rich

Merge branch '158-cray-drain-flap' into 'develop'

Resolve "Cray system draining flap"

Closes #158

See merge request aig/cobalt!120
parents 232e6cd4 7b6f8ce7
......@@ -1133,11 +1133,11 @@ class CraySystem(BaseSystem):
# 1. idle nodes that are already marked for draining.
# 2. Nodes that are in an in-use status (busy, allocated).
# 3. Nodes marked for cleanup that are not allocated to a real
# jobid. CLEANING_ID is a sentiel jobid value so we can set
# a drain window on cleaning nodes easiliy. Not sure if this
# jobid. CLEANING_ID is a sentinel jobid value so we can set
# a drain window on cleaning nodes easily. Not sure if this
# is the right thing to do. --PMR
candidate_list = []
candidate_list = [nid for nid in node_id_list
candidate_list = [str(nid) for nid in node_id_list
if (not self.nodes[str(nid)].draining and
(self.nodes[str(nid)].status in ['idle']) or
(self.nodes[str(nid)].status in cleanup_statuses)
......@@ -1158,8 +1158,9 @@ class CraySystem(BaseSystem):
if (self.nodes[str(nid)].status != 'down' and
self.nodes[str(nid)].managed):
self.nodes[str(nid)].set_drain(loc_time[1], job['jobid'])
candidate_list.extend([nid for nid in running_nodes if
self.nodes[str(nid)].draining])
# It's a list not a set, need to ensure uniqueness
candidate_list.extend([str(nid) for nid in running_nodes if
self.nodes[str(nid)].draining and str(nid) not in candidate_list])
candidate_drain_time = int(loc_time[1])
if len(candidate_list) >= int(job['nodes']):
# Enough nodes have been found to drain for this job
......
......@@ -691,6 +691,24 @@ class TestCraySystem(object):
assert_match(self.system.nodes[str(i)].drain_jobid, 1, "Bad drain job")
assert_match(self.system.nodes[str(i)].drain_until, 100.0, "Bad drain time")
def test_seelct_nodes_for_draining_no_exit_flap(self):
'''CraySystem._select_nodes_for_draining: do not flap when job exits with other running jobs.'''
# NOTE(review): "seelct" in the method name looks like a typo for "select"; the name is left
# unchanged here so the test keeps its existing identifier.
#
# Regression test for the "draining flap" seen on job exit during a large job drain. Reproducing it
# needs multiple running jobs, an exiting job on a cleanup node that the queue manager still reports
# as running, and a duplicate node ending up in candidate_list while this is running. This is based
# on the local simulator reproduction case.
#
# Two entries: node '2' paired with 100.0 and node '3' paired with 200.0 (presumably
# [node id list, end time] pairs -- confirm against _select_nodes_for_draining).
end_times = [[['2'], 100.0], [['3'], 200.0]]
# Node '2' plays the exiting job's cleanup node; node '3' is still busy.
self.system.nodes['2'].status = 'cleanup-pending'
self.system.nodes['3'].status = 'busy'
# The job being drained for asks for five nodes.
self.base_job['nodes'] = 5
drain_nodes = self.system._select_nodes_for_draining(self.base_job,
end_times)
# Comparing the sorted result against the exact list means a duplicated node id fails the check.
assert_match(sorted(drain_nodes), ['1', '2', '3', '4', '5'], "Bad Selection")
# Each of the five nodes is expected to be draining with drain_jobid 1 until 200.0, the later of
# the two end times supplied above.
for i in ['1', '2', '3', '4', '5']:
assert_match(self.system.nodes[str(i)].draining, True, "Draining not set")
assert_match(self.system.nodes[str(i)].drain_jobid, 1, "Bad drain job")
assert_match(self.system.nodes[str(i)].drain_until, 200.0, "Bad drain time")
# common checks for find_job_location
def assert_draining(self, nid, until, drain_jobid):
assert self.system.nodes[str(nid)].draining, "Node %s should be draining" % nid
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment