# Test Cray-specific utilities/calls.
from nose.tools import raises
from testsuite.TestCobalt.Utilities.assert_functions import assert_match, assert_not_match
from Cobalt.Components.system.CrayNode import CrayNode
import Cobalt.Exceptions
import time
from Cobalt.Components.system.CraySystem import CraySystem
from Cobalt.Components.system.base_pg_manager import ProcessGroupManager
import Cobalt.Components.system.AlpsBridge as AlpsBridge

from mock import MagicMock, Mock, patch


def is_match(a, b):
    '''Identity comparator: report whether a and b are the very same object.'''
    same_object = a is b
    return same_object

class TestCrayNode(object):
    '''Tests for CrayNode construction and status handling.'''

    def setup(self):
        '''Build a fresh node spec and a CrayNode from it before each test.'''
        self.spec = {'name':'test', 'state': 'UP', 'node_id': 1, 'role':'batch',
                'architecture': 'XT', 'SocketArray':['foo', 'bar'],
                }
        self.base_node = CrayNode(self.spec)

    def teardown(self):
        '''Discard the per-test spec and node.'''
        del self.spec
        del self.base_node

    def test_init(self):
        '''CrayNode.__init__: test initializer'''
        spec = {'name':'test', 'state': 'UP', 'node_id': 1, 'role':'batch',
                'architecture': 'XT', 'SocketArray':['foo', 'bar'],
                }
        node = CrayNode(spec)
        assert_match(node.status, 'idle', 'bad status')
        assert_match(node.node_id, 1, 'bad nodeid')
        assert_match(node.role, 'BATCH', 'bad role')
        # is_match compares by identity rather than equality here.
        assert_match(node.attributes['architecture'], 'XT',
                'bad architecture',  is_match)
        assert_match(node.segment_details, ['foo', 'bar'],
                'bad segment')
        assert_match(node.ALPS_status, 'UNKNOWN',
                'bad default ALPS status')
        assert 'alps-interactive' in node.RESOURCE_STATUSES,(
                'alps-interactive not in resource statuses')

    def test_init_alps_states(self):
        '''CrayNode.__init__: alps states correctly set'''
        cray_state_list = ['UP', 'DOWN', 'UNAVAILABLE', 'ROUTING', 'SUSPECT',
                           'ADMIN', 'UNKNOWN', 'UNAVAIL', 'SWDOWN', 'REBOOTQ',
                           'ADMINDOWN']
        correct_alps_states = {'UP': 'idle', 'DOWN':'down', 'UNAVAILABLE':'down',
                               'ROUTING':'down', 'SUSPECT':'down', 'ADMIN':'down',
                               'UNKNOWN':'down', 'UNAVAIL': 'down', 'SWDOWN': 'down',
                               'REBOOTQ':'down', 'ADMINDOWN':'down'}
        for state in cray_state_list:
            self.spec['state'] = state
            node = CrayNode(self.spec)
            expected = correct_alps_states[state]
            # Report the input Cray state, the expected status and the status
            # actually produced so a failure is readable on its own.
            assert node.status == expected, (
                    "%s should map to %s, got %s" % (state, expected,
                        node.status))

    def test_non_cray_statuses(self):
        '''CrayNode.status: can set cobalt-tracking statuses.'''
        test_statuses = ['busy', 'cleanup-pending', 'allocated',
                'alps-interactive']
        for status in test_statuses:
            self.base_node.status = status
            assert_match(self.base_node.status, status, "failed validation")

class TestCraySystem(object):
72
73
    '''Test Cray system component functionality'''
    #SETUP AND TEARDOWN HELPERS
Paul Rich's avatar
Paul Rich committed
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
    @patch.object(AlpsBridge, 'init_bridge')
    @patch.object(CraySystem, '_init_nodes_and_reservations', return_value=None)
    @patch.object(CraySystem, '_run_update_state', return_value=None)
    def setup(self, *args, **kwargs):
        '''Build a CraySystem holding five managed nodes and a base job.

        AlpsBridge.init_bridge, CraySystem._init_nodes_and_reservations and
        CraySystem._run_update_state are replaced by mocks while this method
        runs; the mocks arrive through *args and are not used further.

        Nodes are named test1..test5 with string ids '1'..'5', all in the
        'default' queue.  The base job asks for one node from 'default'.

        '''
        self.system = CraySystem()
        self.base_spec = {'name':'test', 'state': 'UP', 'node_id': '1', 'role':'batch',
                'architecture': 'XT', 'SocketArray':['foo', 'bar'],
                'queues':['default'],
                }
        for i in range(1,6):
            self.base_spec['name'] = "test%s" % i
            self.base_spec['node_id'] = str(i)
            # Copy so each node gets its own spec rather than the shared,
            # repeatedly-mutated base_spec.
            node_dict=dict(self.base_spec)
            self.system.nodes[str(i)] = CrayNode(node_dict)
            self.system.node_name_to_id[node_dict['name']] = node_dict['node_id']
        for node in self.system.nodes.values():
            node.managed = True
        self.system._gen_node_to_queue()

        self.base_job = {'jobid':1, 'user':'crusher', 'attrs':{},
                'queue':'default', 'nodes': 1,
                }

    def teardown(self):
        del self.system
        del self.base_job

101
102
103
104
105
106
107
108
109
110
111
    # HELPER MOCK FUNCTIONS
    def fake_reserve(self, job, new_time, node_id_list):
        '''Mimic first-fit function of ALPS placement scheme'''
        # self gets overriden by the call within fjl to be the real system
        # component.
        ret_nodes = []
        if job['nodes'] <= len(node_id_list):
            ret_nodes = node_id_list[:int(job['nodes'])]
        return ret_nodes

    #TESTS
Paul Rich's avatar
Paul Rich committed
112
    def test_assemble_queue_data(self):
        '''CraySystem._assemble_queue_data: base functionality'''
        # With the untouched setup fixture every node ('1'-'5') is expected back.
        nodelist =  self.system._assemble_queue_data(self.base_job)
        assert_match(sorted(nodelist), ['1', '2', '3', '4', '5'], 'nodelist mismatch')
Paul Rich's avatar
Paul Rich committed
116
117

    def test_assemble_queue_data_bad_queue(self):
        '''CraySystem._assemble_queue_data: return nothing if queue for job doesn't exist'''
        # No node built in setup lists 'foo' among its queues.
        self.base_job['queue'] = 'foo'
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert_match(nodelist, [], 'nonempty nodelist')
Paul Rich's avatar
Paul Rich committed
122
123

    def test_assemble_queue_data_multiple_queue(self):
        '''CraySystem._assemble_queue_data: return only proper queue nodes'''
        # Move nodes 1 and 4 out of 'default', then rebuild the queue mapping
        # so the change is visible to the lookup.
        self.system.nodes['1'].queues = ['foo']
        self.system.nodes['4'].queues = ['bar']
        self.system._gen_node_to_queue()
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert_match(sorted(nodelist), ['2', '3', '5'], 'Wrong nodelist')
Paul Rich's avatar
Paul Rich committed
130
131

    def test_assemble_queue_data_multiple_queue_overlap(self):
        '''CraySystem._assemble_queue_data: return only proper queue nodes in overlapping queues'''
        # Node 1 is in foo/default/bar, node 4 in default/bar, node 5 only in
        # baz; nodes 2 and 3 keep the 'default' queue from setup.
        self.system.nodes['1'].queues = ['foo', 'default', 'bar']
        self.system.nodes['4'].queues = ['default','bar']
        self.system.nodes['5'].queues = ['baz']
        self.system._gen_node_to_queue()
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert_match(sorted(nodelist), ['1', '2', '3', '4'], 'Wrong nodelist')
        # Re-target the same job at each of the other queues in turn.
        self.base_job['queue'] = 'foo'
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert_match(nodelist, ['1'], 'Wrong nodelist')
        self.base_job['queue'] = 'bar'
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert_match(sorted(nodelist), ['1', '4'], 'Wrong nodelist')
        self.base_job['queue'] = 'baz'
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert_match(nodelist, ['5'], 'Wrong nodelist')
Paul Rich's avatar
Paul Rich committed
148

149
150
    def test_assemble_queue_data_idle(self):
        '''CraySystem._assemble_queue_data: return only idle nodes'''
        # Nodes 1 and 4 are taken out of the idle state; the rest stay as
        # built in setup.
        self.system.nodes['1'].status = 'busy'
        self.system.nodes['4'].status = 'ADMINDOWN'
        self.system._gen_node_to_queue()
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert_match(sorted(nodelist), ['2','3','5'], 'Wrong nodelist')
Paul Rich's avatar
Paul Rich committed
156

157
158
159
160
161
162
163
164
165
166
167
168
169
170
    def test_assemble_queue_data_non_down(self):
        '''CraySystem._assemble_queue_data: return nodes that are not down'''
        nodes = self.system.nodes
        # First pass: three nodes in non-idle statuses, one set to ADMINDOWN.
        for node_id, status in (('1', 'busy'), ('2', 'cleanup-pending'),
                ('3', 'allocated'), ('4', 'ADMINDOWN')):
            nodes[node_id].status = status
        candidates = self.system._assemble_queue_data(self.base_job, idle_only=False)
        assert sorted(candidates) == ['1','2','3','5'], 'Wrong nodes in list %s' % candidates
        # Second pass: nodes 1 and 2 change status and are expected to drop out.
        for node_id, status in (('1', 'SUSPECT'), ('2', 'alps-interactive')):
            nodes[node_id].status = status
        candidates = self.system._assemble_queue_data(self.base_job, idle_only=False)
        assert sorted(candidates) == ['3','5'], 'Wrong nodes in list %s' % candidates
171

Paul Rich's avatar
Paul Rich committed
172
    def test_assemble_queue_data_attrs_location(self):
        '''CraySystem._assemble_queue_data: return only attr location loc'''
        # The job pins itself to node 3 through its 'location' attribute.
        self.base_job['attrs'] = {'location':'3'}
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert nodelist == ['3'], 'Wrong node in list %s' % nodelist

178
179
180
    def test_assemble_queue_data_attrs_location_repeats(self):
        '''CraySystem._assemble_queue_data: eliminate repeat location entries'''
        # Node 1 is named twice in the location string but expected back once.
        self.base_job['attrs'] = {'location':'1,1,2,3'}
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert sorted(nodelist) == ['1', '2', '3'], 'Wrong node in list %s' % nodelist

184
185
    @raises(ValueError)
    def test_assemble_queue_data_attrs_bad_location(self):
        '''CraySystem._assemble_queue_data: raise error for location completely outside of
        queue'''
        # Node '6' is not among the nodes built in setup.
        self.base_job['attrs'] = {'location':'6'}
        # The call itself must raise ValueError, so no assertion on its return
        # value could ever execute; the decorator fails the test if it returns.
        self.system._assemble_queue_data(self.base_job)

Paul Rich's avatar
Paul Rich committed
192
    def test_assemble_queue_data_attrs_location_multi(self):
        '''CraySystem._assemble_queue_data: return only attr location complex loc string'''
        # The location string mixes a range ('1-3') with a single node id ('5').
        self.base_job['attrs'] = {'location':'1-3,5'}
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert sorted(nodelist) == ['1','2','3','5'], 'Wrong nodes in list %s' % nodelist

    def test_assemble_queue_data_forbidden_loc(self):
        '''CraySystem._assemble_queue_data: avoid reserved nodes'''
        # 'forbidden' is a list of location strings; here it leaves only node 4.
        self.base_job['forbidden'] = ['1-3','5']
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert sorted(nodelist) == ['4'], 'Wrong nodes in list %s' % nodelist

    def test_assemble_queue_data_forbidden_loc_attrs_loc(self):
        '''CraySystem._assemble_queue_data: avoid reserved nodes despite location being set'''
        # The requested location (1-4) overlaps the forbidden range (1-3);
        # only the non-forbidden remainder is expected back.
        self.base_job['forbidden'] = ['1-3']
        self.base_job['attrs'] = {'location':'1-4'}
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert sorted(nodelist) == ['4'], 'Wrong nodes in list %s' % nodelist

    def test_assemble_queue_data_forbidden_loc_attrs_loc_complete(self):
        '''CraySystem._assemble_queue_data: avoid reserved nodes block location if superset'''
        # Every requested node is also forbidden, so nothing should remain.
        self.base_job['forbidden'] = ['1-3']
        self.base_job['attrs'] = {'location':'1-3'}
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert sorted(nodelist) == [], 'Wrong nodes in list %s' % nodelist

    def test_assemble_queue_data_forbidden_loc_attrs_loc_permit(self):
        '''CraySystem._assemble_queue_data: forbidden doesn't block everything'''
        # Requested (4-5) and forbidden (1-3) ranges are disjoint, so the whole
        # request is expected back.
        self.base_job['forbidden'] = ['1-3']
        self.base_job['attrs'] = {'location':'4-5'}
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert sorted(nodelist) == ['4','5'], 'Wrong nodes in list %s' % nodelist

    def test_assemble_queue_data_reserved_loc(self):
        '''CraySystem._assemble_queue_data: return reservation nodes'''
        # No node lists a 'reservation' queue; the expected nodes are the ones
        # named by the job's 'required' locations.
        self.base_job['required'] = ['2-4']
        self.base_job['queue'] = 'reservation'
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert sorted(nodelist) == ['2','3','4'], 'Wrong nodes in list %s' % nodelist

    def test_assemble_queue_data_reserved_loc_idle_only(self):
        '''CraySystem._assemble_queue_data: return reservation nodes that are idle'''
        # Nodes 1-4 are moved off idle; node 5 keeps its status from setup.
        self.system.nodes['1'].status = 'busy'
        self.system.nodes['2'].status = 'cleanup-pending'
        self.system.nodes['3'].status = 'allocated'
        self.system.nodes['4'].status = 'ADMINDOWN'
        self.base_job['required'] = ['1-5']
        self.base_job['queue'] = 'reservation'
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert sorted(nodelist) == ['5'], 'Wrong nodes in list %s' % nodelist

243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
    def test_assemble_queue_data_reserved_loc_non_down(self):
        '''CraySystem._assemble_queue_data: return reservation nodes that are not down'''
        nodes = self.system.nodes
        job = self.base_job
        # First pass: three nodes in non-idle statuses, one set to ADMINDOWN.
        for node_id, status in (('1', 'busy'), ('2', 'cleanup-pending'),
                ('3', 'allocated'), ('4', 'ADMINDOWN')):
            nodes[node_id].status = status
        job['required'] = ['1-5']
        job['queue'] = 'reservation'
        candidates = self.system._assemble_queue_data(job, idle_only=False)
        assert sorted(candidates) == ['1','2','3','5'], 'Wrong nodes in list %s' % candidates
        # Second pass: nodes 1 and 2 change status; the job fields are set
        # again before the repeat call, as in the first pass.
        for node_id, status in (('1', 'SUSPECT'), ('2', 'alps-interactive')):
            nodes[node_id].status = status
        job['required'] = ['1-5']
        job['queue'] = 'reservation'
        candidates = self.system._assemble_queue_data(job, idle_only=False)
        assert sorted(candidates) == ['3','5'], 'Wrong nodes in list %s' % candidates

Paul Rich's avatar
Paul Rich committed
262
    def test_assemble_queue_data_reserved_loc_location_set(self):
        '''CraySystem._assemble_queue_data: return reservation nodes for job with location set'''
        # The job's own location (1,2,4) is a subset of the required range
        # (1-4); only the location nodes are expected back.
        self.base_job['required'] = ['1-4']
        self.base_job['attrs'] = {'location':'1,2,4'}
        self.base_job['queue'] = 'reservation'
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert sorted(nodelist) == ['1','2','4'], 'Wrong nodes in list %s' % nodelist
269

270
271
272
273
274
275
276
    def test_assemble_queue_data_attrs_location_blocked_nodes(self):
        '''CraySystem._assemble_queue_data: return only idle locations'''
        # Nodes 1-4 are moved off idle; the job asks for 1-5 and should get
        # only node 5.
        self.system.nodes['1'].status = 'busy'
        self.system.nodes['2'].status = 'cleanup-pending'
        self.system.nodes['3'].status = 'allocated'
        self.system.nodes['4'].status = 'ADMINDOWN'
        self.base_job['attrs'] = {'location':'1-5'}
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert nodelist == ['5'], 'Wrong node in list %s' % nodelist

    def test_assemble_queue_data_attrs_location_all_blocked_nodes(self):
        '''CraySystem._assemble_queue_data: return no locations if attrs location nodes are
        all non idle'''
        # Every node the job names (1-4) has been moved off idle.
        self.system.nodes['1'].status = 'busy'
        self.system.nodes['2'].status = 'cleanup-pending'
        self.system.nodes['3'].status = 'allocated'
        self.system.nodes['4'].status = 'ADMINDOWN'
        self.base_job['attrs'] = {'location':'1-4'}
        nodelist = self.system._assemble_queue_data(self.base_job)
        assert nodelist == [], 'Wrong node in list %s' % nodelist
290

291
292
293
294
295
296
297
    def test_assemble_queue_data_attrs_non_draining(self):
        '''CraySystem._assemble_queue_data: return idle and non draining only'''
        self.system.nodes['1'].status = 'busy'
        self.system.nodes['2'].status = 'down'
        self.system.nodes['3'].status = 'allocated'
        # NOTE(review): set_drain arguments look like (drain-until time, jobid)
        # -- confirm against CrayNode.  Node 4's value (100) is below the
        # drain_time passed in (150) and the node is expected to be excluded.
        self.system.nodes['4'].set_drain(100, 1)
        nodelist = self.system._assemble_queue_data(self.base_job,
                drain_time=150)
        assert_match(sorted(nodelist), ['5'], "Bad Nodelist")

301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
    def test_assemble_queue_data_attrs_within_draining(self):
        '''CraySystem._assemble_queue_data: return idle and draining if within
        time'''
        nodes = self.system.nodes
        nodes['1'].status = 'busy'
        nodes['2'].status = 'down'
        # Node 3's drain value (50.0) is below the requested drain_time and
        # node 4's (100.0) is above it; only node 4 is expected back with
        # the untouched node 5.
        nodes['3'].set_drain(50.0, 2)
        nodes['4'].set_drain(100.0, 1)
        candidates = self.system._assemble_queue_data(self.base_job, drain_time=90.0)
        assert_match(sorted(candidates), ['4', '5'], "Bad Nodelist")

    def test_assemble_queue_data_attrs_match_draining(self):
        '''CraySystem._assemble_queue_data: return idle and matched drain node'''
        nodes = self.system.nodes
        for node_id, status in (('1', 'busy'), ('2', 'down'), ('3', 'allocated')):
            nodes[node_id].status = status
        # Node 4's drain value equals the drain_time passed below.
        nodes['4'].set_drain(100.0, 1)
        candidates = self.system._assemble_queue_data(self.base_job, drain_time=100.0)
        assert_match(sorted(candidates), ['4', '5'], "Bad Nodelist")

322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
    def test_find_queue_equivalence_classes_single(self):
        '''CraySystem.find_queue_equivalence_classes: single queue'''
        self.system.find_queue_equivalence_classes([], ['default'], [])
        equiv_classes = self.system.current_equivalence_classes
        # Interpolate the actual count so a failure reports what was found.
        assert len(equiv_classes) == 1, (
                'Have %s equiv classes, should have 1.' % len(equiv_classes))
        for equiv in equiv_classes:
            assert equiv['queues'] == ['default'], 'mismatch in returned equiv class queues'

    def test_find_queue_equivalence_classes_overlap(self):
        '''CraySystem.find_queue_equivalence_classes: partial overlapping queues'''
        nodes = self.system.nodes
        # Node 2 belongs to both queues, which is what links them together.
        nodes['1'].queues = ['foo']
        nodes['2'].queues = ['foo', 'default']
        self.system.find_queue_equivalence_classes([], ['default', 'foo'], [])
        class_count = len(self.system.current_equivalence_classes)
        assert class_count == 1, (
                'Have %s equiv classes, should have 1.' %
                len(self.system.current_equivalence_classes))
        for equiv_class in self.system.current_equivalence_classes:
            queue_names = sorted(equiv_class['queues'])
            assert queue_names == ['default', 'foo'], (
                    'mismatch in returned equiv class queues %s' %
                    equiv_class['queues'])

    def test_find_queue_equivalence_classes_disjoint(self):
        '''CraySystem.find_queue_equivalence_classes: disjoint queues'''
        # Nodes 1 and 2 are only in 'foo'; nodes 3-5 stay only in 'default',
        # so no node is shared between the two queues.
        self.system.nodes['1'].queues = ['foo']
        self.system.nodes['2'].queues = ['foo']
        val = self.system.find_queue_equivalence_classes([], ['default', 'foo'], [])
        expect = [{'reservations': [], 'queues': ['foo']},
                  {'reservations': [], 'queues': ['default']}]
        assert self.system.current_equivalence_classes == expect, (
                'Expected %s, got %s' % (expect,
                    self.system.current_equivalence_classes))
        # The return value and the stored attribute must agree; fill in both
        # placeholders so a failure shows the two values.
        assert val == self.system.current_equivalence_classes, (
                "val/current_equivalence_class mismatch\nReturn: %s\nInternal: %s" %
                (val, self.system.current_equivalence_classes))

    def test_clear_draining_for_queues_full_clear(self):
        '''CraySystem._clear_draining_for_queues: clear queue's draining times'''
        all_nodes = self.system.nodes
        # Mark every node as draining before asking for the queue to be cleared.
        for cray_node in all_nodes.values():
            cray_node.set_drain(100.0, 300)
        self.system.find_queue_equivalence_classes([], ['default'], [])
        self.system._clear_draining_for_queues('default')
        for cray_node in all_nodes.values():
            assert not cray_node.draining, "node %s marked as draining!" % cray_node.node_id

    def test_clear_draining_for_queues_multi_queue(self):
        '''CraySystem._clear_draining_for_queues: clear whole equivalence class'''
        all_nodes = self.system.nodes
        # Node 2 sits in both queues, so clearing 'default' is expected to
        # reach the 'foo'-only node 1 as well.
        all_nodes['1'].queues = ['foo']
        all_nodes['2'].queues = ['foo', 'default']
        for cray_node in all_nodes.values():
            cray_node.set_drain(100.0, 300)
        self.system.find_queue_equivalence_classes([], ['default', 'foo'], [])
        self.system._clear_draining_for_queues('default')
        for cray_node in all_nodes.values():
            assert not cray_node.draining, "node %s marked as draining!" % cray_node.node_id

    def test_clear_drianing_for_queues_one_equiv(self):
        '''CraySystem._clear_draining_for_queues: clear only one equivalence class'''
        all_nodes = self.system.nodes
        # Nodes 1 and 2 are only in 'foo', which shares no node with 'default'.
        all_nodes['1'].queues = ['foo']
        all_nodes['2'].queues = ['foo']
        for cray_node in all_nodes.values():
            cray_node.set_drain(100.0, 300)
        self.system.find_queue_equivalence_classes([], ['default', 'foo'], [])
        self.system._clear_draining_for_queues('default')
        for cray_node in all_nodes.values():
            if cray_node.node_id in ['1', '2']:
                # 'foo'-only nodes must keep draining.
                assert cray_node.draining, "drain should not be cleared for node %s" % cray_node.node_id
            else:
                assert not cray_node.draining, "node %s marked as draining!" % cray_node.node_id

391
392
    @patch.object(CraySystem, '_ALPS_reserve_resources', fake_reserve)
    @patch.object(time, 'time', return_value=500.000)
    def test_find_job_location_allocate_first_fit(self, *args, **kwargs):
        '''CraySystem.find_job_location: Assign basic job to nodes'''
        # time.time is pinned to 500.0 and the ALPS reservation call is
        # replaced by fake_reserve for the duration of this test.
        retval = self.system.find_job_location([self.base_job], [], [])
        assert retval == {1: ['1']}, 'bad loc: expected %s, got %s' % ({1: ['1']}, retval)
        # NOTE(review): 800.0 is the pinned time plus 300 -- presumably a
        # pending-start timeout applied by find_job_location; confirm there.
        assert self.system.pending_starts[1] == 800.0, (
                'bad pending start: expected %s, got %s' %
                (800.0, self.system.pending_starts[1]))
        assert self.system.nodes['1'].reserved_jobid == 1, 'Node not reserved'
        assert self.system.nodes['1'].reserved_until == 800.0, (
                'reserved until expected 800.0, got %s' % self.system.nodes['1'].reserved_until)
403

404