Commit a4d252a8 authored by Paul Rich
Browse files

Upgrading error handling in ALPSReservation.release.

parent 18a414d3
......@@ -6,9 +6,11 @@ import time
import sys
import xmlrpclib
import json
from xml.etree.ElementTree import ParseError
import ConfigParser
import Cobalt.Util
import Cobalt.Components.system.AlpsBridge as ALPSBridge
from Cobalt.Components.system.AlpsBridge import ALPSError
from Cobalt.Components.base import Component, exposed, automatic, query, locking
from Cobalt.Components.system.base_system import BaseSystem
from Cobalt.Components.system.CrayNode import CrayNode
......@@ -1577,18 +1579,40 @@ class ALPSReservation(object):
#release already issued. Ignore
return
apids = []
status = ALPSBridge.release(self.alps_res_id)
if int(status['claims']) != 0:
_logger.info('ALPS reservation: %s still has %s claims.',
self.alps_res_id, status['claims'])
# fetch reservation information so that we can send kills to
# interactive apruns.
resinfo = ALPSBridge.fetch_reservations()
apids = _find_non_batch_apids(resinfo['reservations'], self.alps_res_id)
try:
status = ALPSBridge.release(self.alps_res_id)
except ParseError as exc:
_logger.error("ALPSReservation.release: error parsing release message from ALPS: %s", exc)
except ALPSError as exc:
_logger.error("ALPSReservation.release: ALPS error in reservation release: %s", exc)
except xmlrpclib.Fault as exc:
_logger.error("ALPSReservation.release: XMLRPC error in reservation release: %s", exc)
else:
_logger.info('ALPS reservation: %s has no claims left.',
self.alps_res_id)
self.dying = True
if int(status['claims']) != 0:
_logger.info('ALPS reservation: %s still has %s claims.',
self.alps_res_id, status['claims'])
# fetch reservation information so that we can send kills to
# interactive apruns.
try:
resinfo = ALPSBridge.fetch_reservations()
except ParseError as exc:
_logger.error("ALPSReservation.release: error parsing reservation fetch from ALPS: %s", exc)
except ALPSError as exc:
_logger.error("ALPSReservation.release: ALPS error in reservation fetch: %s", exc)
except xmlrpclib.Fault as exc:
_logger.error("ALPSReservation.release: XMLRPC error in reservation fetch: %s", exc)
else:
try:
apids = _find_non_batch_apids(resinfo['reservations'], self.alps_res_id)
except KeyError:
# This occurs if the claim is removed and the aprun cleanup
# completes between the release and the subsequent fetch reservations
# Generally happens on systems on the last cobalt job run.
_logger.info('No ALPS reservations remaining on system.')
else:
_logger.info('ALPS reservation: %s has no claims left.',
self.alps_res_id)
self.dying = True
return apids
def _find_non_batch_apids(resinfo, alps_res_id):
......@@ -1600,14 +1624,18 @@ def _find_non_batch_apids(resinfo, alps_res_id):
for applications in alps_res['ApplicationArray']:
for application in applications.values():
for app_data in application:
# applicaiton id is at the app_data level. Multiple
# application id is at the app_data level. Multiple
# commands don't normally happen. Maybe in a MPMD job?
# All commands will have the same applicaiton id.
# All commands will have the same application id.
for commands in app_data['CommandArray']:
for command in commands.values():
# BASIL is the indicaiton of a apbasil
# BASIL is the indication of a apbasil
# reservation. apruns with the application of
# BASIL would be an error.
if command[0]['cmd'] != 'BASIL':
apids.append(app_data['application_id'])
# in case of MPMD command, iterate
for command_data in command: #this is actually a list of arrays
if command_data['cmd'] != 'BASIL':
# This is the basil tracking reservation, so we can get appid from here
apids.append(app_data['application_id'])
return apids
......@@ -7,6 +7,8 @@ elogin_hosts: foo:bar
import Cobalt
import TestCobalt
import sys
import xml.etree.ElementTree
import xmlrpclib
config_file = Cobalt.CONFIG_FILES[0]
config_fp = open(config_file, "w")
config_fp.write(SYSTEM_CONFIG_ENTRY)
......@@ -1151,3 +1153,146 @@ class TestCraySystem(object):
info = self.system._ALPS_reserve_resources(job, new_time, node_id_list)
assert_match(info, node_id_list, 'Bad reservation info returned')
TestCraySystem.verify_alps_reservation_dict(self.system)
class TestALPSReservation(object):
    '''Tests for the ALPSReservation class in src/lib/Components/system/CraySystem.py

    Every test patches ALPSBridge.release and ALPSBridge.fetch_reservations,
    calls ALPSReservation.release() once, and then checks the returned apid
    list, the reservation's "dying" flag and how often each bridge call was
    made.

    '''

    def setup(self, *args, **kwargs):
        '''Build five CrayNode fixtures (ids '1'-'5') and a minimal job spec.'''
        self.base_spec = {'name':'test', 'state': 'UP', 'node_id': '1', 'role':'batch',
                          'architecture': 'XT', 'SocketArray':['foo', 'bar'],
                          'queues':['default'],
                         }
        self.nodes = {}
        for i in range(1, 6):
            self.base_spec['name'] = "test%s" % i
            self.base_spec['node_id'] = str(i)
            # Copy so that each node gets its own spec dictionary.
            node_dict = dict(self.base_spec)
            self.nodes[str(i)] = CrayNode(node_dict)
        self.base_job = {'jobid':1, 'user':'crusher', 'attrs':{},
                         'queue':'default', 'nodes': 1, 'walltime': 60,
                        }

    def _release_reservation(self):
        '''Construct the reservation under test and call release() on it.

        Returns a tuple of (reservation, apids returned by release()).

        '''
        spec = {'reserved_nodes': [1], 'reservation_id': 2, 'pagg_id': 3, }
        alps_res = Cobalt.Components.system.CraySystem.ALPSReservation(self.base_job, spec, self.nodes.values())
        apids = alps_res.release()
        return alps_res, apids

    def _check_release(self, alps_res, apids, mock_release, mock_fetch_reservations,
                       expected_apids, expect_dying, expected_fetches):
        '''Shared assertions on the outcome of a single release() call.

        expected_apids -- list of apids release() should have returned
        expect_dying -- whether the reservation should be flagged as dying
        expected_fetches -- expected ALPSBridge.fetch_reservations call count

        '''
        assert_match(apids, expected_apids, "Wrong apids returned.")
        if expect_dying:
            assert alps_res.dying, "ALPSReservation not marked as dying"
        else:
            assert not alps_res.dying, "ALPSReservation marked as dying"
        assert_match(mock_release.call_count, 1, "ALPSBridge.release call count wrong.")
        assert_match(mock_fetch_reservations.call_count, expected_fetches,
                     "ALPSBridge.fetch_reservations call count wrong.")

    # NOTE: stacked @patch decorators are applied bottom-up, so the mock for
    # fetch_reservations is always the first mock argument of each test.

    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.release', return_value={'claims': '0'})
    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.fetch_reservations')
    def test_ALPSReservation_release_no_claims(self, mock_fetch_reservations, mock_release):
        '''ALPSReservation.release: no claims'''
        alps_res, apids = self._release_reservation()
        self._check_release(alps_res, apids, mock_release, mock_fetch_reservations,
                            [], True, 0)

    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.release', return_value={'claims': '1'})
    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.fetch_reservations')
    def test_ALPSReservation_release_have_claims(self, mock_fetch_reservations, mock_release):
        '''ALPSReservation.release: has claims remaining'''
        # Reservation '2' is the one being released; reservation '4' is a
        # second entry whose apid ('12') the test expects NOT to be returned.
        # Each 'ApplicationArray' key appears exactly once: a repeated key in
        # a dict display is silently overridden by the last occurrence.
        mock_fetch_reservations.return_value = {
            'reservations': [
                {'reservation_id': '2',
                 'ApplicationArray': [
                     {'Application': [
                         {'CommandArray': [{'Command': [{'cmd': 'BATCH'}]}],
                          'application_id': '10'}]
                     }],
                },
                {'reservation_id': '4',
                 'ApplicationArray': [
                     {'Application': [
                         {'CommandArray': [{'Command': [{'cmd': '/bin/date'}]}],
                          'application_id': '12'}]
                     }],
                },
            ]
        }
        alps_res, apids = self._release_reservation()
        self._check_release(alps_res, apids, mock_release, mock_fetch_reservations,
                            ['10'], True, 1)

    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.release', side_effect=xml.etree.ElementTree.ParseError('Error parsing XML'))
    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.fetch_reservations')
    def test_ALPSReservation_release_fail_release_xmlparse(self, mock_fetch_reservations, mock_release):
        '''ALPSReservation.release: graceful release ParseError failure'''
        alps_res, apids = self._release_reservation()
        self._check_release(alps_res, apids, mock_release, mock_fetch_reservations,
                            [], False, 0)

    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.release',
           side_effect=Cobalt.Components.system.AlpsBridge.ALPSError('Error reported from ALPS', "PERMANENT"))
    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.fetch_reservations')
    def test_ALPSReservation_release_fail_release_alpserror(self, mock_fetch_reservations, mock_release):
        '''ALPSReservation.release: graceful release ALPS Error failure'''
        alps_res, apids = self._release_reservation()
        self._check_release(alps_res, apids, mock_release, mock_fetch_reservations,
                            [], False, 0)

    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.release', side_effect=xmlrpclib.Fault(faultCode=1, faultString='test'))
    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.fetch_reservations')
    def test_ALPSReservation_release_fail_release_xmlrpc(self, mock_fetch_reservations, mock_release):
        '''ALPSReservation.release: graceful release XML-RPC failure'''
        alps_res, apids = self._release_reservation()
        self._check_release(alps_res, apids, mock_release, mock_fetch_reservations,
                            [], False, 0)

    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.release', return_value={'claims': '1'})
    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.fetch_reservations', side_effect=xml.etree.ElementTree.ParseError('Error parsing XML'))
    def test_ALPSReservation_release_fail_res_fetch_xmlparse(self, mock_fetch_reservations, mock_release):
        '''ALPSReservation.release: graceful fetch ParseError failure'''
        alps_res, apids = self._release_reservation()
        self._check_release(alps_res, apids, mock_release, mock_fetch_reservations,
                            [], True, 1)

    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.release', return_value={'claims': '1'})
    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.fetch_reservations',
           side_effect=Cobalt.Components.system.AlpsBridge.ALPSError('Error reported from ALPS', "PERMANENT"))
    def test_ALPSReservation_release_fail_res_fetch_alpserror(self, mock_fetch_reservations, mock_release):
        '''ALPSReservation.release: graceful fetch ALPS Error failure'''
        alps_res, apids = self._release_reservation()
        self._check_release(alps_res, apids, mock_release, mock_fetch_reservations,
                            [], True, 1)

    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.release', return_value={'claims': '1'})
    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.fetch_reservations', side_effect=xmlrpclib.Fault(faultCode=1, faultString='test'))
    def test_ALPSReservation_release_fail_res_fetch_xmlrpc(self, mock_fetch_reservations, mock_release):
        '''ALPSReservation.release: graceful fetch XML-RPC failure'''
        alps_res, apids = self._release_reservation()
        self._check_release(alps_res, apids, mock_release, mock_fetch_reservations,
                            [], True, 1)

    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.release', return_value={'claims': '1'})
    @patch('Cobalt.Components.system.CraySystem.ALPSBridge.fetch_reservations')
    def test_ALPSReservation_release_fail_res_fetch_no_reservations(self, mock_fetch_reservations, mock_release):
        '''ALPSReservation.release: graceful handling of already removed reservations post release request'''
        # An empty fetch result has no 'reservations' key, so the lookup in
        # release() raises the KeyError this test is exercising.
        mock_fetch_reservations.return_value = {}
        alps_res, apids = self._release_reservation()
        self._check_release(alps_res, apids, mock_release, mock_fetch_reservations,
                            [], True, 1)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment