Commit 3e43e2a2 authored by Paul Rich
Browse files

SSD monitoring and status reporting added.

parent 7e22422b
......@@ -263,21 +263,25 @@ def fetch_ssd_diags(nid_list=None, raw=False):
ret_info = _call_sys_forker_capmc(CAPMC_PATH, args)
if not raw: # Not all consistency is foolish.
fixed_ret_info = {}
for key, val in ret_info.items():
if key not in ['e', 'err_msg']:
for diag_key, diag_val in ret_info[key]:
if diag_key not in ['serial_num', 'size']:
fixed_ret_info[key][diag_key] = diag_val
elif diag_key == 'serial_num':
fixed_ret_info[key]['serial_number'] = diag_val
elif diag_key == 'size':
# It's storage so apparently we're using 10^3 instead of 2^10
# Going from GB back to bytes
fixed_ret_info[key][diag_key] = int(1000000000 * int(diag_val))
else:
fixed_ret_info[key] = val
fixed_ret_info['e'] = ret_info['e']
fixed_ret_info['err_msg'] = ret_info['err_msg']
fixed_ret_info['ssd_diags'] = []
diag_info = ret_info['ssd_diags']
for info in diag_info:
fixed_diag_info = {}
for diag_key, diag_val in info.items():
if diag_key not in ['serial_num', 'size']:
fixed_diag_info[diag_key] = diag_val
elif diag_key == 'serial_num':
fixed_diag_info['serial_number'] = diag_val
elif diag_key == 'size':
# It's storage so apparently we're using 10^3 instead of 2^10
# Going from GB back to bytes
fixed_diag_info[diag_key] = int(1000000000 * int(diag_val))
fixed_ret_info['ssd_diags'].append(fixed_diag_info)
ret_info = fixed_ret_info
return ret_info
def _log_xmlrpc_error(runid, fault):
'''Log an xmlrpc error.
......@@ -345,8 +349,8 @@ def _call_sys_forker(path, tag, label, args=None, in_str=None):
# invalid. If we never got one, then let the
# caller handle the error.
if child['exit_status'] != 0:
_logger.error("BASIL returned a status of %s",
child['exit_status'])
_logger.error("%s returned a status of %s, stderr: %s",
cmd, child['exit_status'], "\n".join(child['stderr']))
resp = child['stdout_string']
try:
ComponentProxy(FORKER).cleanup_children([runid])
......@@ -393,6 +397,7 @@ def _call_sys_forker_basil(basil_path, in_str):
def _call_sys_forker_capmc(capmc_path, args):
'''Call a CAPMC command and recieve response'''
resp = _call_sys_forker(capmc_path, 'apbridge', 'capmc_ssd', args=args)
parsed_response = {}
try:
parsed_response = json.loads(resp)
except TypeError:
......
......@@ -202,6 +202,12 @@ class CraySystem(BaseSystem):
reservations = ALPSBridge.fetch_reservations()
_logger.info('ALPS RESERVATION DATA FETCHED')
# reserved_nodes = ALPSBridge.reserved_nodes()
ssd_enabled = ALPSBridge.fetch_ssd_enable()
_logger.info('CAPMC SSD ENABLED DATA FETCHED')
ssd_info = ALPSBridge.fetch_ssd_static_data()
_logger.info('CAPMC SSD DETAIL DATA FETCHED')
ssd_diags = ALPSBridge.fetch_ssd_diags()
_logger.info('CAPMC SSD DIAG DATA FETCHED')
except Exception:
#don't crash out here. That may trash a statefile.
_logger.error('Possible transient encountered during initialization. Retrying.',
......@@ -210,7 +216,7 @@ class CraySystem(BaseSystem):
else:
pending = False
self._assemble_nodes(inventory, system)
self._assemble_nodes(inventory, system, ssd_enabled, ssd_info, ssd_diags)
#Reversing the node name to id lookup is going to save a lot of cycles.
for node in self.nodes.values():
self.node_name_to_id[node.name] = node.node_id
......@@ -219,10 +225,23 @@ class CraySystem(BaseSystem):
# self._assemble_reservations(reservations, reserved_nodes)
return
def _assemble_nodes(self, inventory, system):
def _assemble_nodes(self, inventory, system, ssd_enabled, ssd_info, ssd_diags):
'''merge together the INVENTORY and SYSTEM query data to form as
complete a picture of a node as we can.
Args:
inventory - ALPS QUERY(INVENTORY) data
system - ALPS QUERY(SYSTEM) data
ssd_enable - CAPMC get_ssd_enable data
ssd_info - CAPMC get_ssds data
ssd_diags - CAPMC get_ssd_diags data
Returns:
None
Side Effects:
Populates the node dictionary
'''
nodes = {}
for nodespec in inventory['nodes']:
......@@ -236,8 +255,34 @@ class CraySystem(BaseSystem):
if nodes[node_id].role.upper() not in ['BATCH']:
nodes[node_id].status = 'down'
nodes[node_id].status = nodespec['state']
self._update_ssd_data(nodes, ssd_enabled, ssd_info, ssd_diags)
self.nodes = nodes
def _update_ssd_data(self, nodes, ssd_enabled=None, ssd_info=None, ssd_diags=None):
'''Update/add ssd data from CAPMC'''
if ssd_enabled is not None:
for ssd_data in ssd_enabled['nids']:
try:
nodes[str(ssd_data['nid'])].attributes['ssd_enabled'] = int(ssd_data['ssd_enable'])
except KeyError:
_logger.warning('ssd info present for nid %s, but not reported in ALPS.', ssd_data['nid'])
if ssd_info is not None:
for ssd_data in ssd_info['nids']:
try:
nodes[str(ssd_data['nid'])].attributes['ssd_info'] = ssd_data
except KeyError:
_logger.warning('ssd info present for nid %s, but not reported in ALPS.', ssd_data['nid'])
if ssd_diags is not None:
for diag_info in ssd_diags['ssd_diags']:
try:
node = nodes[str(diag_info['nid'])]
except KeyError:
_logger.warning('ssd diag data present for nid %s, but not reported in ALPS.', ssd_data['nid'])
else:
for field in ['life_remaining', 'ts', 'firmware', 'percent_used']:
node.attributes['ssd_info'][field] = diag_info[field]
def _assemble_reservations(self, reservations, reserved_nodes):
# FIXME: we can recover reservations now. Implement this.
pass
......@@ -341,6 +386,7 @@ class CraySystem(BaseSystem):
self.nodes[str(nid)] = new_node
self.logger.warning('Node %s added to tracking.', nid)
@exposed
def update_node_state(self):
'''update the state of cray nodes. Check reservation status and system
......@@ -371,6 +417,9 @@ class CraySystem(BaseSystem):
inven_nodes = ALPSBridge.extract_system_node_data(ALPSBridge.system())
reservations = ALPSBridge.fetch_reservations()
#reserved_nodes = ALPSBridge.reserved_nodes()
# Fetch SSD diagnostic data and enabled flags. I would hope these change in event of dead ssd
ssd_enabled = ALPSBridge.fetch_ssd_enable()
ssd_diags = ALPSBridge.fetch_ssd_diags()
except (ALPSBridge.ALPSError, ComponentLookupError):
_logger.warning('Error contacting ALPS for state update. Aborting this update',
exc_info=True)
......@@ -485,6 +534,8 @@ class CraySystem(BaseSystem):
self._reconstruct_node(inven_node, recon_inventory)
# _logger.error('UNS: ALPS reports node %s but not in our node list.',
# inven_node['node_id'])
# Update SSD data:
self._update_ssd_data(self.nodes, ssd_enabled=ssd_enabled, ssd_diags=ssd_diags)
#should down win over running in terms of display?
#keep node that are marked for cleanup still in cleanup
for node in cleanup_nodes:
......
......@@ -1376,8 +1376,8 @@ def print_node_details(args):
retval = str(value)
return retval
nodes = component_call(SYSMGR, False, 'get_nodes',
(True, expand_node_args(args)))
nodes = json.loads(component_call(SYSMGR, False, 'get_nodes',
(True, expand_node_args(args), None, True)))
res_queues = _setup_res_info()
for node in nodes.values():
header_list = []
......@@ -1393,6 +1393,10 @@ def print_node_details(args):
if res_queues.get(str(node['node_id']), False):
queues.extend(res_queues[str(node['node_id'])])
value_list.append(':'.join(queues))
elif key == 'attributes':
for attr_key, attr_val in value.items():
header_list.append(key +'.'+ attr_key )
value_list.append(gen_printable_value(attr_val))
else:
header_list.append(key)
value_list.append(gen_printable_value(value))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment