Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
AIG-public
Cobalt
Commits
1dd9fe62
Commit
1dd9fe62
authored
Aug 24, 2016
by
Paul Rich
Browse files
Merge branch 'master' into 25-fix-reservation-location
parents
f6a08b49
b1d21737
Changes
4
Hide whitespace changes
Inline
Side-by-side
src/clients/setres.py
View file @
1dd9fe62
...
...
@@ -88,20 +88,39 @@ def verify_locations(partitions):
if
system_type
in
[
'alps_system'
]:
# nodes come in as a compact list. expand this.
check_partitions
=
[]
# if we're not a compact list, convert to a compact list. Get this,
# ideally, in one call
for
num_list
in
partitions
:
check_partitions
.
extend
(
expand_num_list
(
num_list
))
for
p
in
check_partitions
:
test_parts
=
client_utils
.
component_call
(
SYSMGR
,
False
,
'verify_locations'
,
(
check_partitions
,))
if
len
(
test_parts
)
!=
len
(
check_partitions
):
missing
=
[
p
for
p
in
check_partitions
if
p
not
in
test_parts
]
client_utils
.
logger
.
error
(
"Missing partitions: %s"
%
(
" "
.
join
(
missing
)))
sys
.
exit
(
1
)
test_parts
=
client_utils
.
component_call
(
SYSMGR
,
False
,
'verify_locations'
,
(
check_partitions
,))
# On Cray we will be a little looser to make setting reservations
# easier.
client_utils
.
logger
.
info
(
'Found Nodes: %s'
,
compact_num_list
(
test_parts
))
missing_nodes
=
set
(
check_partitions
)
-
set
(
test_parts
)
if
len
(
missing_nodes
)
!=
0
:
# TODO: relax this, we should allow for this to occur, but
# reservation-queue data amalgamation will need a fix to get
# this to work. --PMR
client_utils
.
logger
.
error
(
"Missing partitions: %s"
%
(
","
.
join
([
str
(
nid
)
for
nid
in
missing_nodes
])))
client_utils
.
logger
.
error
(
"Aborting reservation setup."
)
sys
.
exit
(
1
)
#sys.exit(1)
else
:
for
p
in
check_partitions
:
test_parts
=
client_utils
.
component_call
(
SYSMGR
,
False
,
'verify_locations'
,
(
check_partitions
,))
if
len
(
test_parts
)
!=
len
(
check_partitions
):
missing
=
[
p
for
p
in
check_partitions
if
p
not
in
test_parts
]
client_utils
.
logger
.
error
(
"Missing partitions: %s"
%
(
" "
.
join
(
missing
)))
sys
.
exit
(
1
)
def
validate_args
(
parser
,
spec
,
opt_count
):
"""
Validate setres arguments. Will return true if we want to continue processing options.
"""
system_type
=
client_utils
.
component_call
(
SYSMGR
,
False
,
'get_implementation'
,
())
if
parser
.
options
.
partitions
!=
None
:
parser
.
args
+=
[
part
for
part
in
parser
.
options
.
partitions
.
split
(
':'
)]
...
...
@@ -120,7 +139,7 @@ def validate_args(parser,spec,opt_count):
if
only_id_change
:
# make the ID change and we are done with setres
if
parser
.
options
.
res_id
!=
None
:
set_res_id
(
parser
)
if
parser
.
options
.
cycle_id
!=
None
:
...
...
@@ -150,17 +169,26 @@ def validate_args(parser,spec,opt_count):
client_utils
.
logger
.
error
(
"Cannot use -D while changing start or cycle time"
)
sys
.
exit
(
1
)
if
not
parser
.
no_args
():
verify_locations
(
parser
.
args
)
# if we have command line arguments put them in spec
if
not
parser
.
no_args
():
spec
[
'partitions'
]
=
":"
.
join
(
parser
.
args
)
if
system_type
in
[
'alps_system'
]:
if
not
parser
.
no_args
():
nodes
=
[]
for
arg
in
parser
.
args
:
nodes
.
extend
(
expand_num_list
(
arg
))
compact_nodes
=
compact_num_list
(
nodes
)
verify_locations
([
compact_nodes
])
spec
[
'partitions'
]
=
compact_nodes
else
:
if
not
parser
.
no_args
():
verify_locations
(
parser
.
args
)
if
not
parser
.
no_args
():
spec
[
'partitions'
]
=
":"
.
join
(
parser
.
args
)
continue_processing_options
=
True
# continue, setres is not done.
return
continue_processing_options
def
modify_reservation
(
parser
):
"""
this will handle reservation modifications
...
...
src/lib/Components/system/CraySystem.py
View file @
1dd9fe62
...
...
@@ -19,8 +19,6 @@ from Cobalt.DataTypes.ProcessGroup import ProcessGroup
from
Cobalt.Util
import
compact_num_list
,
expand_num_list
from
Cobalt.Util
import
init_cobalt_config
,
get_config_option
_logger
=
logging
.
getLogger
(
__name__
)
init_cobalt_config
()
...
...
@@ -44,9 +42,6 @@ class ALPSProcessGroup(ProcessGroup):
#inherit generic getstate and setstate methods from parent
class
CraySystem
(
BaseSystem
):
'''Cray/ALPS-specific system component. Behaviors should go here. Direct
ALPS interaction through BASIL/other APIs should go through the ALPSBridge
...
...
@@ -80,6 +75,8 @@ class CraySystem(BaseSystem):
while
bridge_pending
:
# purge stale children from prior run. Also ensure the
# system_script_forker is currently up.
# These attempts may fail due to system_script_forker not being up.
# We don't want to trash the statefile in this case.
try
:
ALPSBridge
.
init_bridge
()
except
ALPSBridge
.
BridgeError
:
...
...
@@ -96,11 +93,10 @@ class CraySystem(BaseSystem):
self
.
process_manager
=
ProcessGroupManager
(
pgroup_type
=
ALPSProcessGroup
)
else
:
self
.
process_manager
=
ProcessGroupManager
(
pgroup_type
=
ALPSProcessGroup
).
__setstate__
(
spec
[
'process_manager'
])
self
.
logger
.
info
(
'pg type %s'
,
self
.
process_manager
.
process_groups
.
item_cls
)
self
.
logger
.
debug
(
'pg type %s'
,
self
.
process_manager
.
process_groups
.
item_cls
)
#self.process_manager.forkers.append('alps_script_forker')
self
.
process_manager
.
update_launchers
()
self
.
pending_start_timeout
=
1200
#20 minutes for long reboots.
self
.
pending_start_timeout
=
PENDING_STARTUP_TIMEOUT
_logger
.
info
(
'PROCESS MANAGER INTIALIZED'
)
#resource management setup
self
.
nodes
=
{}
#cray node_id: CrayNode
...
...
@@ -124,8 +120,7 @@ class CraySystem(BaseSystem):
#state update thread and lock
self
.
_node_lock
=
threading
.
RLock
()
self
.
_gen_node_to_queue
()
self
.
node_update_thread
=
thread
.
start_new_thread
(
self
.
_run_update_state
,
tuple
())
self
.
node_update_thread
=
thread
.
start_new_thread
(
self
.
_run_update_state
,
tuple
())
_logger
.
info
(
'UPDATE THREAD STARTED'
)
self
.
current_equivalence_classes
=
[]
self
.
killing_jobs
=
{}
...
...
@@ -388,7 +383,7 @@ class CraySystem(BaseSystem):
#resource reservation
cleanup_nodes
=
[
node
for
node
in
self
.
nodes
.
values
()
if
node
.
status
==
'cleanup-pending'
]
#If we have a block marked for cleanup, send a rele
s
ae message.
#If we have a block marked for cleanup, send a relea
s
e message.
released_res_jobids
=
[]
for
node
in
cleanup_nodes
:
for
alps_res
in
self
.
alps_reservations
.
values
():
...
...
@@ -1293,23 +1288,30 @@ class ALPSReservation(object):
# fetch reservation information so that we can send kills to
# interactive apruns.
resinfo
=
ALPSBridge
.
fetch_reservations
()
apids
=
_find_non_batch_apids
(
resinfo
[
'reservations'
])
apids
=
_find_non_batch_apids
(
resinfo
[
'reservations'
]
,
self
.
alps_res_id
)
else
:
_logger
.
info
(
'ALPS reservation: %s has no claims left.'
,
self
.
alps_res_id
)
self
.
dying
=
True
return
apids
def
_find_non_batch_apids
(
resinfo
):
'''Extract apids from non-ba
tch
items.'''
def
_find_non_batch_apids
(
resinfo
,
alps_res_id
):
'''Extract apids from non-ba
sil
items.'''
apids
=
[]
for
alps_res
in
resinfo
:
#wow, this is ugly.
for
applications
in
alps_res
[
'ApplicationArray'
]:
for
application
in
applications
.
values
():
for
app_data
in
application
:
for
commands
in
app_data
[
'CommandArray'
]:
for
command
in
commands
.
values
():
if
command
[
0
][
'cmd'
]
!=
'BASIL'
:
apids
.
append
(
app_data
[
'application_id'
])
if
str
(
alps_res
[
'reservation_id'
])
==
str
(
alps_res_id
):
#wow, this is ugly. Traversing the XML from BASIL
for
applications
in
alps_res
[
'ApplicationArray'
]:
for
application
in
applications
.
values
():
for
app_data
in
application
:
# application id is at the app_data level. Multiple
# commands don't normally happen. Maybe in a MPMD job?
# All commands will have the same application id.
for
commands
in
app_data
[
'CommandArray'
]:
for
command
in
commands
.
values
():
# BASIL is the indication of an apbasil
# reservation. apruns with the application of
# BASIL would be an error.
if
command
[
0
][
'cmd'
]
!=
'BASIL'
:
apids
.
append
(
app_data
[
'application_id'
])
return
apids
src/lib/Components/system/base_pg_manager.py
View file @
1dd9fe62
...
...
@@ -48,13 +48,10 @@ class ProcessGroupManager(object): #degenerate with ProcessMonitor.
self
.
process_groups
=
ProcessGroupDict
()
self
.
process_groups
.
item_cls
=
self
.
pgroup_type
else
:
self
.
process_groups
=
ProcessGroupDict
()
self
.
process_groups
.
item_cls
=
self
.
pgroup_type
_logger
.
info
(
"%s"
,
state
[
'process_groups'
])
for
pgroup
in
state
[
'process_groups'
]:
pg
=
self
.
process_groups
.
item_cls
().
__setstate__
(
pgroup
)
self
.
process_groups
[
pg
.
id
]
=
pg
self
.
process_groups
.
q_add
(
state
[
'process_groups'
])
self
.
process_groups
=
state
.
get
(
'process_groups'
,
ProcessGroupDict
())
for
pg
in
self
.
process_groups
.
values
():
_logger
.
info
(
'recovering pgroup %s, jobid %s'
,
pg
.
id
,
pg
.
jobid
)
self
.
process_groups
.
id_gen
.
set
(
int
(
state
[
'next_pg_id'
]))
self
.
process_group_actions
=
{}
self
.
forkers
=
[]
#list of forker identifiers to use with ComponentProxy
...
...
@@ -64,8 +61,7 @@ class ProcessGroupManager(object): #degenerate with ProcessMonitor.
def
__getstate__
(
self
):
state
=
{}
state
[
'process_groups'
]
=
[
pg
.
__getstate__
for
pg
in
self
.
process_groups
.
values
()]
state
[
'process_groups'
]
=
self
.
process_groups
state
[
'next_pg_id'
]
=
self
.
process_groups
.
id_gen
.
idnum
+
1
return
state
...
...
tools/apkill_enhanced.sh
0 → 100755
View file @
1dd9fe62
#!/bin/bash
# For those times when politely asking an application to terminate isn't
# sufficient. This first sends the passed signal through to apkill, then waits
# 5 minutes for termination. Then sends another apkill -SIGKILL. This is as big
# a hammer as we have for termination.
#
# Arguments:
#   $1     signal option, handed to apkill unchanged
#   $2...  ids of the applications to signal (at least one is required)
#
# Exits 1 with a usage message when fewer than two arguments are given.

apkill_cmd='/opt/cray/alps/default/bin/apkill'
apstat_cmd='/opt/cray/alps/default/bin/apstat'

# Seconds to wait between the initial signal and the SIGKILL follow-up.
kill_wait_secs=300

if [[ "$#" -lt 2 ]]
then
    # Report the name the script was actually invoked as, on stderr.
    echo "Usage: $(basename "$0") -[signal] [pidlist]" >&2
    exit 1
fi

signal_opt="$1"

# Split the remaining arguments on whitespace, exactly as the previous
# unquoted expansion did (so a single "id1 id2" argument still works), but
# without exposing them to pathname expansion.
read -r -a apids <<< "${*:2}"

"$apkill_cmd" "$signal_opt" "${apids[@]}"
sleep "$kill_wait_secs"

for apid in "${apids[@]}"
do
    # Any apstat output line that still mentions this id means the
    # application has not gone away yet. Use the same absolute-path binary
    # convention as apkill instead of whatever "apstat" is first in PATH.
    found_apid=$("$apstat_cmd" -a "$apid" 2> /dev/null | awk -v apid="$apid" '$0 ~ apid')
    if [ -n "$found_apid" ]
    then
        echo "Sending SIGKILL to $apid"
        "$apkill_cmd" -9 "$apid"
    fi
done
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment