Commit 71952df1 authored by Paul Rich
Browse files

Merge branch '39-bad-drain-selection' into 'master'

Fix a bug where a drain could be set on a down node.

This could happen when a node went down while a job was running,
because the node would still show up in the job's end_times.
Closes #39

See merge request !24
parents 40edc37d 59c5411c
......@@ -754,7 +754,6 @@ class CraySystem(BaseSystem):
if self.nodes[str(node_id)].status in
self.nodes[str(node_id)].DOWN_STATUSES]
if drain_time is not None:
print drain_time, BACKFILL_EPSILON, drain_time - BACKFILL_EPSILON
unavailable_nodes.extend([node_id for node_id in node_id_list
if (self.nodes[str(node_id)].draining and
(self.nodes[str(node_id)].drain_until - BACKFILL_EPSILON) < int(drain_time))])
......@@ -1002,8 +1001,14 @@ class CraySystem(BaseSystem):
nid in required) and
not self.nodes[str(nid)].draining)]
for nid in running_nodes:
self.nodes[str(nid)].set_drain(loc_time[1], job['jobid'])
candidate_list.extend(running_nodes)
# We set a drain on all running nodes for later use,
# so that we can "favor" draining on the longest
# running set of nodes.
if (self.nodes[str(nid)].status != 'down' and
self.nodes[str(nid)].managed):
self.nodes[str(nid)].set_drain(loc_time[1], job['jobid'])
candidate_list.extend([nid for nid in running_nodes if
self.nodes[str(nid)].draining])
candidate_drain_time = int(loc_time[1])
if len(candidate_list) >= int(job['nodes']):
# Enough nodes have been found to drain for this job
......
......@@ -584,6 +584,24 @@ class TestCraySystem(object):
assert_match(self.system.nodes[str(i)].drain_jobid, 1, "Bad drain job")
assert_match(self.system.nodes[str(i)].drain_until, now + 300, "Bad drain time")
def test_select_nodes_for_draining_running_but_down(self):
    '''CraySystem._select_nodes_for_draining: do not drain down node if job still "running"

    A node that dies while a job is running still shows up in the end-times
    range until termination of that job is complete.  This test marks node 2
    as down inside such a range and checks that it is neither selected nor
    given any drain state, while every selected node is marked as draining
    for job 1 until the end time of the range.

    '''
    expected_nids = ['1', '3', '4', '5']
    self.system.nodes['2'].status = 'down'
    self.base_job['nodes'] = 4
    # Nodes 1-4 are reported with an end time of 100.0; this range
    # deliberately includes the down node.
    running_end_times = [[['1-4'], 100.0]]
    selected = self.system._select_nodes_for_draining(self.base_job,
                                                      running_end_times)
    assert_match(sorted(selected), expected_nids, "Bad Selection")
    # The down node must carry no drain state at all.
    down_node = self.system.nodes['2']
    assert_match(down_node.draining, False, "Draining set")
    assert_match(down_node.drain_jobid, None, "Should not have drain_jobid", is_match)
    assert_match(down_node.drain_until, None, "Should not have drain_until", is_match)
    # Every selected node must be draining for this job until the end time.
    for nid in expected_nids:
        node = self.system.nodes[str(nid)]
        assert_match(node.draining, True, "Draining not set")
        assert_match(node.drain_jobid, 1, "Bad drain job")
        assert_match(node.drain_until, 100.0, "Bad drain time")
# common checks for find_job_location
def assert_draining(self, nid, until, drain_jobid):
assert self.system.nodes[str(nid)].draining, "Node %s should be draining" % nid
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment