Commit cd732a8f authored by Jakob Luettgau's avatar Jakob Luettgau
Browse files

Add isolated transformation example from dxt-timeline. (dxt2png).

parent c7b0910d
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import math
from operator import itemgetter
from PIL import Image, ImageDraw
def sanitize_size(x):
    """Return `x` truncated to an int, but never less than one pixel."""
    # Anything below one pixel would vanish from the image, so clamp it to 1.
    return 1 if x < 1 else int(x)
def calc_duration(trace):
    """
    Determine the time window covered by the segments in `trace`.

    Returns a tuple (earliest 'start_time', latest 'end_time', their
    difference). For an empty trace the sentinels are returned unchanged,
    i.e. (inf, -inf, -inf).
    """
    earliest = float('inf')
    latest = float('-inf')
    for seg in trace:
        # min/max keep the running value unless the new one is strictly
        # smaller/larger, which mirrors an explicit comparison.
        earliest = min(earliest, seg['start_time'])
        latest = max(latest, seg['end_time'])
    return earliest, latest, latest - earliest
def calc_minsize(trace):
    """
    Return the largest 'offset' + 'length' found among the segments of `trace`.

    The result is 0 when the trace is empty or no segment extends past 0.
    """
    extents = [seg['offset'] + seg['length'] for seg in trace]
    # Seeding with 0 keeps the lower bound of the result at zero.
    return max([0] + extents)
def segment(rec):
    """
    Draw the segments of a record as equally wide columns in event order.

    The read and write segments of `rec` are combined, sorted by their
    'start_time' and drawn from left to right, one column per segment. The
    horizontal axis is therefore the event index, not wallclock time.
    Vertically, a segment is positioned by its 'offset' and sized by its
    'length', both relative to the largest offset + length in the trace.
    Reads and writes are filled with different colours.

    Parameters:
        rec: mapping with the keys 'read_segments' and 'write_segments',
             each a list of dicts providing 'offset', 'length',
             'start_time' and 'end_time'. The dicts are not modified.

    Returns:
        The rendered RGBA image (PIL Image).

    Raises:
        ValueError: if the record contains no segments, or if the largest
                    offset + length is not positive (nothing to scale by).
    """
    # Tag copies of the segments so the caller's dicts stay untouched.
    trace = [dict(item, type='r') for item in rec['read_segments']]
    trace += [dict(item, type='w') for item in rec['write_segments']]
    if not trace:
        raise ValueError("segment(): record contains no read or write segments")

    minsize = calc_minsize(trace)
    if minsize <= 0:
        # math.log() below is undefined for non-positive values.
        raise ValueError("segment(): largest offset + length must be positive")
    _, _, duration = calc_duration(trace)
    print("len(trace):", len(trace), "minsize:", minsize, "duration:", duration)

    # image properties: the total width follows the covered time span
    # (720 pixels per time unit) and is divided evenly among all segments.
    count = len(trace)
    width = sanitize_size(duration * 720)
    column = width / count
    log_size = math.log(minsize)
    height = sanitize_size(log_size * log_size)

    img = Image.new('RGBA', (width, height), color=(33, 33, 33, 255))
    fills = {
        'r': (222, 66, 111, 200),
        'w': (66, 222, 222, 200),
    }

    # Columns appear in the order in which the segments started.
    trace.sort(key=itemgetter('start_time'))

    draw = ImageDraw.Draw(img)
    for i, event in enumerate(trace):
        x0 = i * column
        y0 = height * (event['offset'] / minsize)
        he = sanitize_size(height * (event['length'] / minsize))
        # With more segments than pixels a column is narrower than one pixel;
        # clamp so that x1 >= x0, as ImageDraw.rectangle expects.
        x1 = max(x0, x0 + column - 1)
        # draw.rectangle(xy, fill=None, outline=None)
        # where xy is either [(x0, y0), (x1, y1)] or [x0, y0, x1, y1]
        draw.rectangle([x0, y0, x1, y0 + he - 1], fill=fills[event['type']], outline=None)
    del draw
    return img
def wallclock(rec):
    """
    Draw the segments of a record on a wallclock time axis.

    The read and write segments of `rec` are combined and drawn as
    rectangles whose horizontal position and width follow their
    'start_time' and 'end_time' relative to the earliest start in the trace.
    Vertically, a segment is positioned by its 'offset' and sized by its
    'length', both relative to the largest offset + length in the trace.
    Reads and writes are filled with different colours.

    Parameters:
        rec: mapping with the keys 'read_segments' and 'write_segments',
             each a list of dicts providing 'offset', 'length',
             'start_time' and 'end_time'. The dicts are not modified.

    Returns:
        The rendered RGBA image (PIL Image).

    Raises:
        ValueError: if the record contains no segments, or if the largest
                    offset + length is not positive (nothing to scale by).
    """
    # Tag copies of the segments so the caller's dicts stay untouched.
    trace = [dict(item, type='r') for item in rec['read_segments']]
    trace += [dict(item, type='w') for item in rec['write_segments']]
    if not trace:
        raise ValueError("wallclock(): record contains no read or write segments")

    minsize = calc_minsize(trace)
    if minsize <= 0:
        # math.log() below is undefined for non-positive values.
        raise ValueError("wallclock(): largest offset + length must be positive")
    start, _, duration = calc_duration(trace)

    factor = 720  # horizontal pixels per time unit
    if duration == 0:
        # All events share a single instant; avoid dividing by zero below.
        duration = 1

    # image properties
    width = sanitize_size(duration * factor)
    log_size = math.log(minsize)
    height = sanitize_size(log_size * log_size)

    img = Image.new('RGBA', (width, height), color=(33, 33, 33, 255))
    fills = {
        'r': (222, 66, 111, 200),
        'w': (66, 222, 222, 200),
    }

    # Draw in start order so that later segments end up on top of earlier ones.
    trace.sort(key=itemgetter('start_time'))

    draw = ImageDraw.Draw(img)
    for event in trace:
        # Times relative to the first event of the trace.
        rel_start = event['start_time'] - start
        rel_end = event['end_time'] - start

        x0 = rel_start / duration * width
        y0 = height * (event['offset'] / minsize)
        wi = sanitize_size((rel_end - rel_start) * factor)
        he = sanitize_size(height * (event['length'] / minsize))

        # draw.rectangle(xy, fill=None, outline=None)
        # where xy is either [(x0, y0), (x1, y1)] or [x0, y0, x1, y1]
        draw.rectangle([x0, y0, x0 + wi - 1, y0 + he - 1], fill=fills[event['type']], outline=None)
    del draw
    return img
def visualize(data, modes=['wallclock', 'segment'], path="./"):
    """
    Render the trace selected by `data` and save one PNG per requested mode.

    Reads data['cur']['fileid'], data['rankid'], data['minsize'] and the
    'trace', 'start' and 'end' entries of data['cur']['ranks'][data['rankid']].
    For every mode listed in `modes` ('wallclock' and/or 'segment') an image
    is written to "<path>/<fileid>_<mode>.png" (the path is normalised with
    os.path.normpath) and the file name is printed.

    NOTE(review): the calls below pass (trace, minsize, duration, start, end)
    to wallclock() and (trace, minsize, duration) to segment(), whereas both
    functions are defined in this file with a single `rec` parameter. As
    written these calls would raise TypeError -- confirm the intended
    signatures before relying on this function.
    """
    #print(data)
    fileid = data['cur']['fileid']
    rankid = data['rankid']
    trace = data['cur']['ranks'][ data['rankid'] ]['trace']
    minsize = data['minsize']
    start = data['cur']['ranks'][ data['rankid'] ]['start']
    end = data['cur']['ranks'][ data['rankid'] ]['end']
    duration = (end-start)
    # Output files are named after the file id only; the rank is not part of the name.
    filename = "%s/%s" % (path, fileid)
    #filename = "%s/file%s_rank%s" % (path, fileid, rankid)
    filename = os.path.normpath(filename)
    if 'wallclock' in modes:
        img = wallclock(trace, minsize, duration, start, end)
        print('Writing %s_wallclock.png' % (filename))
        img.save('%s_wallclock.png' % (filename), 'PNG')
    if 'segment' in modes:
        img = segment(trace, minsize, duration)
        print('Writing %s_segment.png' % (filename))
        img.save('%s_segment.png' % (filename), 'PNG')
%% Cell type:markdown id: tags:
# DarshanUtils for Python
This notebook gives an overview of features provided by the Python bindings for DarshanUtils.
%% Cell type:markdown id: tags:
By default all records, metadata, available modules and the name records are loaded:
%% Cell type:code id: tags:
``` python
import darshan
report = darshan.DarshanReport("example-logs/example.darshan", read_all=True) # Default behavior
report.info()
```
%%%% Output: stream
Filename: example-logs/example.darshan
Times: 2017-03-20 10:07:47 to 2017-03-20 10:09:43 (Duration 0:01:56)
Executeable: /global/project/projectdirs/m888/glock/tokio-abc-results/bin.edison/vpicio_uni /scratch2/scratchdirs/glock/tokioabc-s.4478544/vpicio/vpicio.hdf5 32
Processes: 2048
JobID: 4478544
UID: 69615
Modules in Log: ['POSIX', 'MPI-IO', 'LUSTRE', 'STDIO']
Loaded Records: {'POSIX': 1, 'MPI-IO': 1, 'STDIO': 129}
Name Records: 4
Darshan/Hints: {'lib_ver': '3.1.3', 'h': 'romio_no_indep_rw=true;cb_nodes=4'}
DarshanReport: id(140346659969064) (tmp)
%% Cell type:code id: tags:
``` python
report.modules
```
%%%% Output: execute_result
{'POSIX': {'len': 186, 'ver': 3, 'idx': 1, 'num_records': 1},
'MPI-IO': {'len': 154, 'ver': 2, 'idx': 2, 'num_records': 1},
'LUSTRE': {'len': 87, 'ver': 1, 'idx': 7},
'STDIO': {'len': 3234, 'ver': 1, 'idx': 8, 'num_records': 129}}
%% Cell type:markdown id: tags:
A few of the internal data structures explained:
%% Cell type:code id: tags:
``` python
# report.metadata # dictionary with raw metadata from darshan log
# report.modules # dictionary with raw module info from darshan log (need: technical, module idx)
# report.name_records # dictionary for resolving name records: id -> path/name
# report.records # per module "dataframes"/dictionaries holding loaded records
```
%% Cell type:markdown id: tags:
The darshan report holds a variety of namespaces for report-related data. All of them are also referenced in `report.data` at the moment, but reliance on this internal organization of the report object is discouraged once the API has stabilized. Currently, `report.data` references the following information:
%% Cell type:code id: tags:
``` python
report.data.keys()
```
%%%% Output: execute_result
dict_keys(['version', 'metadata', 'records', 'summary', 'modules', 'counters', 'name_records', 'mounts'])
%% Cell type:code id: tags:
``` python
report.mod_read_all_records('POSIX')
```
%% Cell type:code id: tags:
``` python
report.mod_read_all_records('STDIO')
```
%% Cell type:code id: tags:
``` python
report.update_name_records()
report.info()
```
%%%% Output: stream
POSIX
MPI-IO
STDIO
Filename: example-logs/example.darshan
Times: 2017-03-20 10:07:47 to 2017-03-20 10:09:43 (Duration 0:01:56)
Executeable: /global/project/projectdirs/m888/glock/tokio-abc-results/bin.edison/vpicio_uni /scratch2/scratchdirs/glock/tokioabc-s.4478544/vpicio/vpicio.hdf5 32
Processes: 2048
JobID: 4478544
UID: 69615
Modules in Log: ['POSIX', 'MPI-IO', 'LUSTRE', 'STDIO']
Loaded Records: {'POSIX': 1, 'MPI-IO': 1, 'STDIO': 129}
Name Records: 3
Darshan/Hints: {'lib_ver': '3.1.3', 'h': 'romio_no_indep_rw=true;cb_nodes=4'}
DarshanReport: id(140346659969064) (tmp)
%% Cell type:code id: tags:
``` python
# visualization helper used by different examples in the remainder of this notebook
from IPython.display import display, HTML
# usage: display(obj)
```
%% Cell type:markdown id: tags:
### Record Formats and Selectively Loading Records
For memory-efficient analysis, it is possible to suppress records from being loaded automatically. This is useful, for example, when the analysis considers only records of a particular layer/module.
%% Cell type:code id: tags:
``` python
import darshan
report = darshan.DarshanReport("example-logs/example.darshan", read_all=False, lookup_name_records=True) # Loads no records!
```
%% Cell type:code id: tags:
``` python
# expected to fail, as no records were loaded
try:
    print(len(report.records['STDIO']), "records loaded for STDIO.")
except KeyError:
    # Only the missing 'STDIO' entry is the expected failure here;
    # any other error should surface instead of being swallowed.
    print("No STDIO records loaded for this report yet.")
```
%%%% Output: stream
No STDIO records loaded for this report yet.
%% Cell type:markdown id: tags:
Additional records can then be loaded selectively, for example, on a per-module basis:
%% Cell type:markdown id: tags:
#### dtype: pandas
%% Cell type:code id: tags:
``` python
report.mod_read_all_records("STDIO", dtype="pandas")
```
%%%% Output: stream
STDIO
%% Cell type:code id: tags:
``` python
print('id', report.records['STDIO'][0]['id'])
print('rank', report.records['STDIO'][0]['rank'])
display(report.records['STDIO'][0]['counters'])
display(report.records['STDIO'][0]['fcounters'])
```
%%%% Output: stream
id -1
rank -1
%%%% Output: display_data
%%%% Output: display_data
%% Cell type:markdown id: tags:
#### dtype: dict
%% Cell type:code id: tags:
``` python
report.mod_read_all_records("STDIO", dtype='dict')
report.records['STDIO'][0]
```
%%%% Output: stream
STDIO
%%%% Output: execute_result
{'id': 15920181672442173319,
'rank': 0,
'counters': {'STDIO_OPENS': 1,
'STDIO_FDOPENS': -1,
'STDIO_READS': 0,
'STDIO_WRITES': 6,
'STDIO_SEEKS': 0,
'STDIO_FLUSHES': 0,
'STDIO_BYTES_WRITTEN': 280,
'STDIO_BYTES_READ': 0,
'STDIO_MAX_BYTE_READ': 0,
'STDIO_MAX_BYTE_WRITTEN': 279,
'STDIO_FASTEST_RANK': 0,
'STDIO_FASTEST_RANK_BYTES': 0,
'STDIO_SLOWEST_RANK': 0,
'STDIO_SLOWEST_RANK_BYTES': 0},
'fcounters': {'STDIO_F_META_TIME': 0.0,
'STDIO_F_WRITE_TIME': 6.794929504394531e-05,
'STDIO_F_READ_TIME': 0.0,
'STDIO_F_OPEN_START_TIMESTAMP': 0.0,
'STDIO_F_CLOSE_START_TIMESTAMP': 0.0,
'STDIO_F_WRITE_START_TIMESTAMP': 0.07752799987792969,
'STDIO_F_READ_START_TIMESTAMP': 0.0,
'STDIO_F_OPEN_END_TIMESTAMP': 0.0,
'STDIO_F_CLOSE_END_TIMESTAMP': 0.0,
'STDIO_F_WRITE_END_TIMESTAMP': 116.28358292579651,
'STDIO_F_READ_END_TIMESTAMP': 0.0,
'STDIO_F_FASTEST_RANK_TIME': 0.0,
'STDIO_F_SLOWEST_RANK_TIME': 0.0,
'STDIO_F_VARIANCE_RANK_TIME': 0.0,
'STDIO_F_VARIANCE_RANK_BYTES': 0.0}}
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
#### dtype: numpy
%% Cell type:code id: tags:
``` python
report.mod_read_all_records("STDIO")
report.records['STDIO'][0]
```
%%%% Output: execute_result
{'id': 15920181672442173319,
'rank': 0,
'counters': array([ 1, -1, 0, 6, 0, 0, 280, 0, 0, 279, 0, 0, 0,
0]),
'fcounters': array([0.00000000e+00, 6.79492950e-05, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 7.75279999e-02, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 1.16283583e+02, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00])}
%% Cell type:markdown id: tags:
#### The Log in Memory
Let's have a look at how calling `report.mod_read_all_records("STDIO")` changed the state of the log in memory.
%% Cell type:code id: tags:
``` python
# Compare to info line: "Loaded Records: {...}"
report.info()
```
%%%% Output: stream
Filename: example-logs/example.darshan
Times: 2017-03-20 10:07:47 to 2017-03-20 10:09:43 (Duration 0:01:56)
Executeable: /global/project/projectdirs/m888/glock/tokio-abc-results/bin.edison/vpicio_uni /scratch2/scratchdirs/glock/tokioabc-s.4478544/vpicio/vpicio.hdf5 32
Processes: 2048
JobID: 4478544
UID: 69615
Modules in Log: ['POSIX', 'MPI-IO', 'LUSTRE', 'STDIO']
Loaded Records: {'STDIO': 129}
Name Records: 2
Darshan/Hints: {'lib_ver': '3.1.3', 'h': 'romio_no_indep_rw=true;cb_nodes=4'}
DarshanReport: id(139926751139712) (tmp)
%% Cell type:markdown id: tags:
When interacting with individual log records, for example in a for loop, you would most likely care about the following instead:
%% Cell type:code id: tags:
``` python
print("Num records:", len(report.records['STDIO']))
# show first 10 records
for rec in report.records['STDIO'][0:10]:
print(rec)
```
%%%% Output: stream
Num records: 129
{'id': 15920181672442173319, 'rank': 0, 'counters': array([ 1, -1, 0, 6, 0, 0, 280, 0, 0, 279, 0, 0, 0,
0]), 'fcounters': array([0.00000000e+00, 6.79492950e-05, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 7.75279999e-02, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 1.16283583e+02, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00])}
{'id': 7238257241479193519, 'rank': 0, 'counters': array([ 1, -1, 0, 68, 0, 0, 3029, 0, 0, 3028, 0,
0, 0, 0]), 'fcounters': array([ 0. , -2662.74663377, 0. , 0. ,
0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. ,
0. , 0. , 0. ])}
{'id': 7238257241479193519, 'rank': 16, 'counters': array([ 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'fcounters': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}
{'id': 7238257241479193519, 'rank': 32, 'counters': array([ 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'fcounters': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}
{'id': 7238257241479193519, 'rank': 48, 'counters': array([ 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'fcounters': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}
{'id': 7238257241479193519, 'rank': 64, 'counters': array([ 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'fcounters': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}
{'id': 7238257241479193519, 'rank': 80, 'counters': array([ 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'fcounters': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}
{'id': 7238257241479193519, 'rank': 96, 'counters': array([ 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'fcounters': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}
{'id': 7238257241479193519, 'rank': 112, 'counters': array([ 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'fcounters': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}
{'id': 7238257241479193519, 'rank': 128, 'counters': array([ 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'fcounters': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}
%% Cell type:markdown id: tags:
### Aggregation and Filtering (Experimental)
Darshan log data is routinely aggregated for quick overview. The report object offers a few methods to perform common aggregations:
%% Cell type:markdown id: tags:
Report aggregations and summarization remain **experimental** for now, mostly to allow interfaces to stabilize. But experimental features can be switched on easily by invoking `darshan.enable_experimental()`:
%% Cell type:code id: tags:
``` python
import darshan
darshan.enable_experimental(verbose=True) # Enable verbosity, listing new functionality
```
%%%% Output: stream
Added method create_time_summary to DarshanReport.
Added method print_module_records to DarshanReport.
Added method summarize to DarshanReport.
Added method merge to DarshanReport.
Added method create_timeline to DarshanReport.
Added method records_as_dict to DarshanReport.
Added method reduce to DarshanReport.
Added method agg_ioops to DarshanReport.
Added method create_sankey to DarshanReport.
Added method filter to DarshanReport.
Added method mod_agg_iohist to DarshanReport.
Added method name_records_summary to DarshanReport.
%% Cell type:code id: tags:
``` python
# Example report, which counts records in log across modules
report.name_records_summary()
```
%%%% Output: execute_result
{15920181672442173319: {'name': '<STDOUT>', 'counts': {'STDIO': 1}},
7238257241479193519: {'name': '<STDERR>', 'counts': {'STDIO': 128}}}
%% Cell type:markdown id: tags:
### Chain operations like filtering and reductions
The filter and reduce operations return DarshanReports themselves, which makes it convenient to chain operations.
%% Cell type:code id: tags: