Commit d0a678e6 authored by Paul Rich's avatar Paul Rich
Browse files

Committing work to date on backfiller revisions for BG/Q

After discussion the current algorithm for determining backfill time
needs to be replaced and needs to depend on which blocks are selected
for draining.  This is a commit for the current algorithm's optimistic
and pessimistic backfill modes.
parent 672d68c3
...@@ -1299,24 +1299,42 @@ class BGBaseSystem (Component): ...@@ -1299,24 +1299,42 @@ class BGBaseSystem (Component):
#Initialize block to immediately available or ready 5 min from now. #Initialize block to immediately available or ready 5 min from now.
#the backfill time is in sec from Epoch. #the backfill time is in sec from Epoch.
#initialize everything that isn't idle as being ready 5 min from now. #initialize everything that isn't idle as being ready 5 min from now.
print BACKFILL_MODE
for block in blocks.itervalues(): for block in blocks.itervalues():
if block.state == 'idle': if block.name in job_end_times.keys():
block.backfill_time = max(now + minimum_not_idle,
job_end_times[block.name])
for child in block._children:
child.backfill_time = block.backfill_time
elif block.state == 'idle':
block.backfill_time = now block.backfill_time = now
else: else:
# Not idle, but not job running according to the queue manager.
# Usually this is cleanup.
block.backfill_time = now + minimum_not_idle block.backfill_time = now + minimum_not_idle
block.draining = False block.draining = False
for block in blocks.itervalues(): for block in blocks.itervalues():
if block.name in job_end_times.keys(): if block.name in job_end_times.keys():
# Keep at least minimum_not_idle open for cleanup. Also, job may be runing over time. #job actively running
if job_end_times[block.name] > block.backfill_time:
block.backfill_time = job_end_times[block.name]
#iterate over current jobs. Blocks with running jobs are set to the job's end time (startime + walltime) #iterate over current jobs. Blocks with running jobs are set to the job's end time (startime + walltime)
#Iterate over parents and set their time to the backfill window as well. #Iterate over parents and set their time to the backfill window as well.
# only set the parent block's time if it is earlier than the block's time # only set the parent block's time if it is earlier than the block's time
for parent_block in block._parents: if BACKFILL_MODE == 'PESSIMISTIC':
if parent_block.backfill_time < block.backfill_time: if job_end_times[block.name] > block.backfill_time:
parent_block.backfill_time = block.backfill_time block.backfill_time = job_end_times[block.name]
for parent_block in block._parents:
if parent_block.backfill_time < block.backfill_time:
parent_block.backfill_time = block.backfill_time
elif BACKFILL_MODE == 'OPTIMISTIC':
for parent_block in block._parents:
# push times to top-level. These will get pushed
# back down to children later. This keeps times
# reasonable for child-blocks ultimately, and
# should allow for looser backfilling
if (parent_block.backfill_time > block.backfill_time or
block.backfill_time == now):
parent_block.backfill_time = block.backfill_time
#Over all blocks, ignore if the time has not been changed, otherwise push #Over all blocks, ignore if the time has not been changed, otherwise push
# the backfill time to children. Do so if the child is either immediately available # the backfill time to children. Do so if the child is either immediately available
...@@ -1324,19 +1342,21 @@ class BGBaseSystem (Component): ...@@ -1324,19 +1342,21 @@ class BGBaseSystem (Component):
# is this backwards? # is this backwards?
for block in blocks.itervalues(): for block in blocks.itervalues():
if block.backfill_time == now: if block.backfill_time == now:
# Block not impacted by a running job # Block not impacted by a running job
continue continue
if BACKFILL_MODE == 'PESSIMISTIC': if BACKFILL_MODE == 'PESSIMISTIC':
for child in block._children: for child in block._children:
child_block = child if child.backfill_time == now or child.backfill_time > block.backfill_time:
if child_block.backfill_time == now or child_block.backfill_time > block.backfill_time: child.backfill_time = block.backfill_time
child_block.backfill_time = block.backfill_time
elif BACKFILL_MODE == 'OPTIMISTIC': elif BACKFILL_MODE == 'OPTIMISTIC':
# blocks should be at least their children's time.
print "tagging children", block.name
for child in block._children: for child in block._children:
child_block = child print child.name, child.backfill_time, block.backfill_time
if child_block.backfill_time < block.backfill_time: if child.backfill_time < block.backfill_time:
child_block.backfill_time = block.backfill_time print "tagging from child", child.name, child.backfill_time
child.backfill_time = block.backfill_time
print "tagging from child", child.name, child.backfill_time
#Go back through, if we're actually running a job on a block, all of it's children should have timese set to the greater of their current time or the parent block's time #Go back through, if we're actually running a job on a block, all of it's children should have timese set to the greater of their current time or the parent block's time
for name in job_end_times.iterkeys(): for name in job_end_times.iterkeys():
job_block = blocks[name] job_block = blocks[name]
......
...@@ -31,7 +31,13 @@ class BackfillMockBlock(object): ...@@ -31,7 +31,13 @@ class BackfillMockBlock(object):
self._children.update(set(child_blocks)) self._children.update(set(child_blocks))
self._relatives.update(set(child_blocks)) self._relatives.update(set(child_blocks))
@property
def parents(self):
    """Return the names of the blocks held in this block's ``_parents``."""
    parent_names = []
    for parent_block in self._parents:
        parent_names.append(parent_block.name)
    return parent_names
@property
def children(self):
    """Return the names of the blocks held in this block's ``_children``."""
    child_names = []
    for child_block in self._children:
        child_names.append(child_block.name)
    return child_names
class TestBackfillTime(object): class TestBackfillTime(object):
...@@ -149,11 +155,32 @@ class TestBackfillTime(object): ...@@ -149,11 +155,32 @@ class TestBackfillTime(object):
job_done_1= 600.0 job_done_1= 600.0
job_done_2= 500.0 job_done_2= 500.0
job_end_times = {'MIR-04000-7BFF1-32768':job_done_1, 'MIR-00000-33331-512':job_done_2} job_end_times = {'MIR-04000-7BFF1-32768':job_done_1, 'MIR-00000-33331-512':job_done_2}
for key in job_end_times.keys():
self.set_blocking_states(key, 'allocated')
Cobalt.Components.bgq_base_system.BACKFILL_MODE = 'PESSIMISTIC' Cobalt.Components.bgq_base_system.BACKFILL_MODE = 'PESSIMISTIC'
BGBaseSystem.set_backfill_times(self.blocks, job_end_times, now) BGBaseSystem.set_backfill_times(self.blocks, job_end_times, now)
assert self.blocks['MIR-00000-73FF1-16384'].backfill_time == job_done_2, "MIR-00000-73FF1-16384 has backfill_time"\ assert self.blocks['MIR-00000-73FF1-16384'].backfill_time == job_done_2, "MIR-00000-73FF1-16384 has backfill_time"\
" of %s should be %s" % (self.blocks['MIR-00000-73FF1-16384'].backfill_time, job_done_2) " of %s should be %s" % (self.blocks['MIR-00000-73FF1-16384'].backfill_time, job_done_2)
assert self.blocks['MIR-04000-77FF1-16384'].backfill_time == job_done_1, "MIR-00000-73FF1-16384 has backfill_time"\ assert self.blocks['MIR-04000-77FF1-16384'].backfill_time == job_done_1, "MIR-04000-73FF1-16384 has backfill_time"\
" of %s should be %s" % (self.blocks['MIR-04000-77FF1-16384'].backfill_time, job_done_1)
def test_mira_32_16_512_backfill_optimistic(self):
    """OPTIMISTIC mode: a 32k job and a shorter 512 job on the Mira layout.

    Both 16k blocks checked below are expected to end up with the end time
    of the longer-running 32k job (job_done_1).
    """
    #This came from a situation that occurred during acceptance testing
    #You have a 32k running long, and a short 512, the backfill time (and therefore
    #the drain preference) should be set such that you drain over the 512, not the 32k
    # NOTE(review): the comment above is shared with the PESSIMISTIC variant
    # of this test; here both 16k blocks are asserted to get the 32k end time.
    self.setup_mira_32k_test_blocks()
    now = 100.0
    job_done_1 = 600.0
    job_done_2 = 500.0
    job_end_times = {'MIR-04000-7BFF1-32768': job_done_1, 'MIR-00000-33331-512': job_done_2}
    # Mark every block with a running job as allocated before computing times.
    for key in job_end_times:
        self.set_blocking_states(key, 'allocated')
    Cobalt.Components.bgq_base_system.BACKFILL_MODE = 'OPTIMISTIC'
    BGBaseSystem.set_backfill_times(self.blocks, job_end_times, now)
    assert self.blocks['MIR-00000-73FF1-16384'].backfill_time == job_done_1, "MIR-00000-73FF1-16384 has backfill_time"\
        " of %s should be %s" % (self.blocks['MIR-00000-73FF1-16384'].backfill_time, job_done_1)
    # The message now names the block that is actually being checked.
    assert self.blocks['MIR-04000-77FF1-16384'].backfill_time == job_done_1, "MIR-04000-77FF1-16384 has backfill_time"\
        " of %s should be %s" % (self.blocks['MIR-04000-77FF1-16384'].backfill_time, job_done_1)
def test_parent_inherit(self): def test_parent_inherit(self):
...@@ -171,9 +198,8 @@ class TestBackfillTime(object): ...@@ -171,9 +198,8 @@ class TestBackfillTime(object):
jobdone = 500.0 jobdone = 500.0
job_end_times = {'8k-1':jobdone} job_end_times = {'8k-1':jobdone}
self.set_blocking_states('8k-1', 'allocated') self.set_blocking_states('8k-1', 'allocated')
Cobalt.Components.bgq_base_system.BACKFILL_MODE = 'PESSIMISTIC'
BGBaseSystem.set_backfill_times(self.blocks, job_end_times, now) BGBaseSystem.set_backfill_times(self.blocks, job_end_times, now)
assert self.blocks['512-1'].backfill_time == jobdone, "Child did not recieve correct time" assert self.blocks['512-1'].backfill_time == jobdone, "Child got %s should have %s" % (self.blocks['512-1'].backfill_time, jobdone)
def test_minimum_window(self): def test_minimum_window(self):
self.setup_standard_blocks() self.setup_standard_blocks()
...@@ -182,7 +208,6 @@ class TestBackfillTime(object): ...@@ -182,7 +208,6 @@ class TestBackfillTime(object):
jobdone = 300.0 jobdone = 300.0
job_end_times = {'8k-1':jobdone} job_end_times = {'8k-1':jobdone}
self.set_blocking_states('8k-1', 'allocated') self.set_blocking_states('8k-1', 'allocated')
Cobalt.Components.bgq_base_system.BACKFILL_MODE = 'PESSIMISTIC'
BGBaseSystem.set_backfill_times(self.blocks, job_end_times, now) BGBaseSystem.set_backfill_times(self.blocks, job_end_times, now)
assert self.blocks['8k-1'].backfill_time == now_delta, "Minimum backfill window not set." assert self.blocks['8k-1'].backfill_time == now_delta, "Minimum backfill window not set."
...@@ -247,3 +272,63 @@ class TestBackfillTime(object): ...@@ -247,3 +272,63 @@ class TestBackfillTime(object):
assert self.blocks['512-1'].backfill_time == job_done_1, '512-1 has time %s should be %s' % (self.blocks['512-1'].backfill_time, job_done_1) assert self.blocks['512-1'].backfill_time == job_done_1, '512-1 has time %s should be %s' % (self.blocks['512-1'].backfill_time, job_done_1)
assert self.blocks['512-4'].backfill_time == now_delta, '512-4 has time %s should be %s' % (self.blocks['512-4'].backfill_time, now_delta) assert self.blocks['512-4'].backfill_time == now_delta, '512-4 has time %s should be %s' % (self.blocks['512-4'].backfill_time, now_delta)
def test_overlap_v16blocking_8k_secondary_optimistic(self):
    """OPTIMISTIC mode: vert-16k-1 runs until 600, overlapping 8k-2 until 500.

    Every block checked below is expected to carry the later end time
    (job_done_1), including the block whose own job ends earlier.
    """
    self.setup_standard_blocks()
    now = 100.0
    job_done_1 = 600.0
    job_done_2 = 500.0
    job_end_times = {'8k-2': job_done_2, 'vert-16k-1': job_done_1}
    # Mark every block with a running job as allocated before computing times.
    for key in job_end_times:
        self.set_blocking_states(key, 'allocated')
    Cobalt.Components.bgq_base_system.BACKFILL_MODE = 'OPTIMISTIC'
    BGBaseSystem.set_backfill_times(self.blocks, job_end_times, now)
    assert self.blocks['vert-16k-1'].backfill_time == job_done_1, 'vert-16k-1 has time %s should be %s' % (self.blocks['vert-16k-1'].backfill_time, job_done_1)
    assert self.blocks['horiz-16k-1'].backfill_time == job_done_1, 'horiz-16k-1 has time %s should be %s' % (self.blocks['horiz-16k-1'].backfill_time, job_done_1)
    assert self.blocks['8k-2'].backfill_time == job_done_1, '8k-2 has time %s should be %s' % (self.blocks['8k-2'].backfill_time, job_done_1)
    # The message now names 8k-3, the block actually being checked.
    assert self.blocks['8k-3'].backfill_time == job_done_1, '8k-3 has time %s should be %s' % (self.blocks['8k-3'].backfill_time, job_done_1)
    assert self.blocks['512-2'].backfill_time == job_done_1, '512-2 has time %s should be %s' % (self.blocks['512-2'].backfill_time, job_done_1)
    assert self.blocks['512-1'].backfill_time == job_done_1, '512-1 has time %s should be %s' % (self.blocks['512-1'].backfill_time, job_done_1)
    assert self.blocks['512-4'].backfill_time == job_done_1, '512-4 has time %s should be %s' % (self.blocks['512-4'].backfill_time, job_done_1)
def test_overlap_8kblocking_v16_secondary_optimistic(self):
    """OPTIMISTIC mode: 8k-2 runs until 600, overlapping vert-16k-1 until 500.

    Every block checked below is expected to carry the later end time
    (job_done_1), including the block whose own job ends earlier.
    """
    self.setup_standard_blocks()
    now = 100.0
    job_done_1 = 600.0
    job_done_2 = 500.0
    job_end_times = {'8k-2': job_done_1, 'vert-16k-1': job_done_2}
    # Mark every block with a running job as allocated before computing times.
    for key in job_end_times:
        self.set_blocking_states(key, 'allocated')
    Cobalt.Components.bgq_base_system.BACKFILL_MODE = 'OPTIMISTIC'
    BGBaseSystem.set_backfill_times(self.blocks, job_end_times, now)
    assert self.blocks['vert-16k-1'].backfill_time == job_done_1, 'vert-16k-1 has time %s should be %s' % (self.blocks['vert-16k-1'].backfill_time, job_done_1)
    assert self.blocks['vert-16k-2'].backfill_time == job_done_1, 'vert-16k-2 has time %s should be %s' % (self.blocks['vert-16k-2'].backfill_time, job_done_1)
    assert self.blocks['horiz-16k-1'].backfill_time == job_done_1, 'horiz-16k-1 has time %s should be %s' % (self.blocks['horiz-16k-1'].backfill_time, job_done_1)
    assert self.blocks['8k-2'].backfill_time == job_done_1, '8k-2 has time %s should be %s' % (self.blocks['8k-2'].backfill_time, job_done_1)
    # The message now names 8k-3, the block actually being checked.
    assert self.blocks['8k-3'].backfill_time == job_done_1, '8k-3 has time %s should be %s' % (self.blocks['8k-3'].backfill_time, job_done_1)
    assert self.blocks['512-2'].backfill_time == job_done_1, '512-2 has time %s should be %s' % (self.blocks['512-2'].backfill_time, job_done_1)
    assert self.blocks['512-1'].backfill_time == job_done_1, '512-1 has time %s should be %s' % (self.blocks['512-1'].backfill_time, job_done_1)
    assert self.blocks['512-4'].backfill_time == job_done_1, '512-4 has time %s should be %s' % (self.blocks['512-4'].backfill_time, job_done_1)
def test_overlap_v16blocking_8k_secondary_short_time_optimistic(self):
    """OPTIMISTIC mode with the shorter job ending soon after ``now``.

    vert-16k-1 runs until 600 and 8k-2 until 500, with ``now`` at 250.
    No block may be left with the short job's end time (job_done_2), and
    every block checked below is expected to carry job_done_1.
    """
    self.setup_standard_blocks()
    now = 250.0
    # NOTE(review): the original also defined now_delta = 550.0 (now + 300,
    # presumably the minimum not-idle window) but used it only in failure
    # messages, where it contradicted the asserted value. Confirm that
    # job_done_1 is the intended expectation for those blocks.
    job_done_1 = 600.0
    job_done_2 = 500.0
    job_end_times = {'8k-2': job_done_2, 'vert-16k-1': job_done_1}
    # Mark every block with a running job as allocated before computing times.
    for key in job_end_times:
        self.set_blocking_states(key, 'allocated')
    Cobalt.Components.bgq_base_system.BACKFILL_MODE = 'OPTIMISTIC'
    BGBaseSystem.set_backfill_times(self.blocks, job_end_times, now)
    assert job_done_2 not in [val.backfill_time for val in self.blocks.values()], "Minimum backfill shadow not honored."
    # Each failure message reports the same expected value the assertion compares against.
    assert self.blocks['vert-16k-1'].backfill_time == job_done_1, 'vert-16k-1 has time %s should be %s' % (self.blocks['vert-16k-1'].backfill_time, job_done_1)
    assert self.blocks['vert-16k-2'].backfill_time == job_done_1, 'vert-16k-2 has time %s should be %s' % (self.blocks['vert-16k-2'].backfill_time, job_done_1)
    assert self.blocks['horiz-16k-1'].backfill_time == job_done_1, 'horiz-16k-1 has time %s should be %s' % (self.blocks['horiz-16k-1'].backfill_time, job_done_1)
    assert self.blocks['8k-2'].backfill_time == job_done_1, '8k-2 has time %s should be %s' % (self.blocks['8k-2'].backfill_time, job_done_1)
    assert self.blocks['8k-3'].backfill_time == job_done_1, '8k-3 has time %s should be %s' % (self.blocks['8k-3'].backfill_time, job_done_1)
    assert self.blocks['512-2'].backfill_time == job_done_1, '512-2 has time %s should be %s' % (self.blocks['512-2'].backfill_time, job_done_1)
    assert self.blocks['512-1'].backfill_time == job_done_1, '512-1 has time %s should be %s' % (self.blocks['512-1'].backfill_time, job_done_1)
    assert self.blocks['512-4'].backfill_time == job_done_1, '512-4 has time %s should be %s' % (self.blocks['512-4'].backfill_time, job_done_1)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment