Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
AIG-public
Cobalt
Commits
160dd1f5
Commit
160dd1f5
authored
Sep 13, 2016
by
Paul Rich
Browse files
assemble_queue_data can now eliminate nodes marked for draining.
parent
33cd8327
Changes
2
Hide whitespace changes
Inline
Side-by-side
src/lib/Components/system/CraySystem.py
View file @
160dd1f5
...
...
@@ -679,7 +679,7 @@ class CraySystem(BaseSystem):
retlist
.
extend
(
expand_num_list
(
locs
))
return
retlist
def
_assemble_queue_data
(
self
,
job
,
idle_only
=
True
):
def
_assemble_queue_data
(
self
,
job
,
idle_only
=
True
,
no_draining
=
False
):
'''put together data for a queue, or queue-like reservation structure.
Input:
...
...
@@ -746,6 +746,9 @@ class CraySystem(BaseSystem):
unavailable_nodes
=
[
node_id
for
node_id
in
node_id_list
if
self
.
nodes
[
str
(
node_id
)].
status
in
self
.
nodes
[
str
(
node_id
)].
DOWN_STATUSES
]
if
no_draining
:
unavailable_nodes
.
extend
([
node_id
for
node_id
in
node_id_list
if
self
.
nodes
[
str
(
node_id
)].
draining
])
for
node_id
in
set
(
unavailable_nodes
):
node_id_list
.
remove
(
node_id
)
return
node_id_list
...
...
@@ -841,6 +844,7 @@ class CraySystem(BaseSystem):
'''
now
=
time
.
time
()
resource_until_time
=
now
+
TEMP_RESERVATION_TIME
end_times
.
sort
(
key
=
lambda
x
:
x
[
1
])
#sort on end time, ascending.
with
self
.
_node_lock
:
# only valid for this scheduler iteration.
self
.
_clear_draining_for_queues
(
arg_list
[
0
][
'queue'
])
...
...
@@ -851,29 +855,35 @@ class CraySystem(BaseSystem):
try
:
for
loc_time
in
end_times
:
loc_spec
=
loc_time
[
0
]
time
=
loc_time
[
1
]
end_
time
=
loc_time
[
1
]
for
loc
in
expand_num_list
(
loc_spec
):
node_end_times
[
str
(
loc
)]
=
time
node_end_times
[
str
(
loc
)]
=
end_
time
except
KeyError
:
_logger
.
error
(
"Invalid value for end_times: %s"
,
end_times
)
return
best_match
else
:
for
node
,
end_time
in
node_end_times
.
iteritems
():
#initialize our end times.
self
.
nodes
[
str
(
node
)].
set_drain
(
end_time
)
for
job
in
arg_list
:
label
=
'%s/%s'
%
(
job
[
'jobid'
],
job
[
'user'
])
try
:
node_id_list
=
self
.
_assemble_queue_data
(
job
)
available_node_list
=
self
.
_assemble_queue_data
(
job
,
idle_only
=
False
)
except
ValueError
as
exc
:
_logger
.
warning
(
'Job %s: requesting locations that are not in queue for that job.'
,
job
[
'jobid'
])
continue
if
int
(
job
[
'nodes'
])
>
len
(
node_
id_
list
):
if
int
(
job
[
'nodes'
])
>
len
(
available_
node_list
):
# will happen with reserved jobs.
continue
if
len
(
node_id_list
)
==
0
:
# There are definitely insufficient nodes to run this job
# trivial exclude. Don't break out of the whole thing, may
# have disjoint queues.
continue
elif
int
(
job
[
'nodes'
])
<=
len
(
node_id_list
):
# enough nodes are in a working state to consider the job.
# enough nodes are idle that we can run this job
compact_locs
=
self
.
_associate_and_run_immediate
(
job
,
resource_until_time
,
node_id_list
)
# do we want to allow multiple placements in a single
...
...
@@ -883,15 +893,17 @@ class CraySystem(BaseSystem):
_logger
.
info
(
"%s: Job selected for running on nodes %s"
,
label
,
compact_locs
)
break
#for now only select one location
el
se
:
#
TODO:
drain
ing goes here
el
if
DRAIN_MODE
in
[
'backfill'
,
'drain-only'
]
:
# drain
sufficient nodes for this job to run
drain_node_ids
=
self
.
_select_nodes_for_draining
(
job
,
node_end_times
)
_logger
.
info
(
'%s: nodes %s selected for draining.'
,
label
,
compact_num_list
(
drain_node_ids
))
for
job
in
arg_list
:
#TODO: backfill pass goes here
pass
if
DRAIN_MODE
in
[
'backfill'
]:
for
job
in
arg_list
:
# Backfill is first fit
#TODO: backfill pass goes here
pass
return
best_match
def
_ALPS_reserve_resources
(
self
,
job
,
new_time
,
node_id_list
):
...
...
@@ -947,15 +959,27 @@ class CraySystem(BaseSystem):
Inputs:
job - dictionary of job information to consider
node_end_times - a list of nodes and their endtimes should be sorted
in order of location preference
Return:
List of node ids that have been selected for draining for this job
List of node ids that have been selected for draining for this job,
as well as the expected drain time.
'''
try
:
node_id_list
=
self
.
_assemble_queue_data
(
job
,
idle_only
=
False
)
except
ValueError
:
_logger
.
warning
(
'Job %s: requesting locations that are not in queue for that job.'
,
job
[
'jobid'
])
else
:
with
self
.
_node_lock
:
if
len
(
node_id_list
)
>=
int
(
job
[
'nodes'
]):
#order the node ids by id and drain-time.
node_id_list
.
sort
()
node_id_list
.
sort
(
reverse
=
True
,
key
=
lambda
nid
:
self
.
nodes
[
str
(
nid
)].
drain_until
)
return
[]
...
...
testsuite/TestCobalt/TestComponents/test_cray.py
View file @
160dd1f5
...
...
@@ -267,7 +267,6 @@ class TestCraySystem(object):
nodelist
=
self
.
system
.
_assemble_queue_data
(
self
.
base_job
)
assert
sorted
(
nodelist
)
==
[
'1'
,
'2'
,
'4'
],
'Wrong nodes in list %s'
%
nodelist
#need testcase with loc targeting down nodes.
def
test_assemble_queue_data_attrs_location_blocked_nodes
(
self
):
'''CraySystem._assemble_queue_data: return only idle locations'''
self
.
system
.
nodes
[
'1'
].
status
=
'busy'
...
...
@@ -289,6 +288,16 @@ class TestCraySystem(object):
nodelist
=
self
.
system
.
_assemble_queue_data
(
self
.
base_job
)
assert
nodelist
==
[],
'Wrong node in list %s'
%
nodelist
def
test_assemble_queue_data_attrs_non_draining
(
self
):
'''CraySystem._assemble_queue_data: return idle and non draining only'''
self
.
system
.
nodes
[
'1'
].
status
=
'busy'
self
.
system
.
nodes
[
'2'
].
status
=
'down'
self
.
system
.
nodes
[
'3'
].
status
=
'allocated'
self
.
system
.
nodes
[
'4'
].
set_drain
(
100
,
1
)
nodelist
=
self
.
system
.
_assemble_queue_data
(
self
.
base_job
,
no_draining
=
True
)
assert_match
(
sorted
(
nodelist
),
[
'5'
],
"Bad Nodelist"
)
def
test_find_queue_equivalence_classes_single
(
self
):
'''CraySystem.find_queue_equivalence_classes: single queue'''
self
.
system
.
find_queue_equivalence_classes
([],
[
'default'
],
[])
...
...
@@ -358,7 +367,6 @@ class TestCraySystem(object):
else
:
assert
node
.
draining
,
"drain should not be cleared for node %s"
%
node
.
node_id
@
patch
.
object
(
CraySystem
,
'_ALPS_reserve_resources'
,
fake_reserve
)
@
patch
.
object
(
time
,
'time'
,
return_value
=
500.000
)
def
test_find_job_location_allocate_first_fit
(
self
,
*
args
,
**
kwargs
):
...
...
@@ -372,3 +380,4 @@ class TestCraySystem(object):
assert
self
.
system
.
nodes
[
'1'
].
reserved_until
==
800.0
,
(
'reserved until expected 800.0, got %s'
%
self
.
system
.
nodes
[
'1'
].
reserved_until
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment