Commit 91bdad64 authored by Jakob Luettgau's avatar Jakob Luettgau
Browse files

Make sure dtype parameter is used consistently.

parent c16aa8a2
......@@ -325,7 +325,6 @@ def log_get_generic_record(log, mod_name, mod_type, dtype='numpy'):
})
if dtype == "pandas":
df_c = pd.DataFrame(cdict, index=[0])
df_fc = pd.DataFrame(fcdict, index=[0])
......
......@@ -67,7 +67,8 @@ def create_timeline(self, group_by='rank', mode="append"):
"id": rid,
"rank": rec['rank'],
"hostname": rec['hostname'],
"filename": rec['filename'],
#"filename": rec['filename'],
"filename": "FIXME: NEED FILENAME",
"group": rid,
"start": start.isoformat(),
......@@ -86,7 +87,8 @@ def create_timeline(self, group_by='rank', mode="append"):
group = {
"id": rid,
"content": "[%s] " % (mod) + rec['filename'][-84:],
#"content": "[%s] " % (mod) + rec['filename'][-84:],
"content": "[%s] " % (mod) + "NEED FILENAME",
"order": seg['start_time']
}
groups.append(group)
......
......@@ -61,28 +61,46 @@ class DarshanReport(object):
a number of common aggregations can be performed.
"""
# a way to conserve memory?
#__slots__ = ['attr1', 'attr2']
def __init__(self,
filename=None, data_format='pandas',
filename=None, dtype='pandas',
start_time=None, end_time=None,
automatic_summary=False,
read_all=True, lookup_name_records=True):
"""
Args:
filename (str): filename to open (optional)
dtype (str): default dtype for internal structures
automatic_summary (bool): automatically generate summary after loading
read_all (bool): whether to read all records for log
lookup_name_records (bool): lookup and update name_records as records are loaded
Return:
None
"""
self.filename = filename
# options
self.data_format = data_format # Experimental: preferred internal representation: pandas/numpy useful for aggregations, dict good for export/REST
# Behavioral Options
self.dtype = dtype # Experimental: preferred internal representation: pandas/numpy useful for aggregations, dict good for export/REST
# might require alternative granularity: e.g., records, vs summaries?
# vs dict/pandas? dict/native?
self.automatic_summary = automatic_summary
self.lookup_name_records = lookup_name_records
# state dependent book-keeping
# State dependent book-keeping
self.converted_records = False # true if convert_records() was called (unnumpyfy)
#
self.start_time = float('inf')
self.end_time = float('-inf')
# initialize data namespaces
# Report Metadata
self.start_time = start_time if start_time else float('inf')
self.end_time = end_time if end_time else float('-inf')
self.timebase = self.start_time
# Initialize data namespaces
self.metadata = {}
self.modules = {}
self.counters = {}
......@@ -91,7 +109,7 @@ class DarshanReport(object):
self.name_records = {}
# initialize report/summary namespace
self.summary_revision = 0 # counter to check if summary needs update
self.summary_revision = 0 # counter to check if summary needs update (see data_revision)
self.summary = {}
......@@ -110,7 +128,7 @@ class DarshanReport(object):
# when using report algebra this log allows to untangle potentially
# unfair aggregations (e.g., double accounting)
self.provenance_enabled = True
self.provenance_log = []
self.provenance_graph = []
self.provenance_reports = {}
......@@ -236,10 +254,10 @@ class DarshanReport(object):
ids.add(rec['id'])
self.name_records = backend.log_lookup_name_records(self.log, ids)
self.name_records.update(backend.log_lookup_name_records(self.log, ids))
def read_all(self):
def read_all(self, dtype=None):
"""
Read all available records from darshan log and return as dictionary.
......@@ -249,15 +267,15 @@ class DarshanReport(object):
Return:
None
"""
self.read_all_generic_records()
self.read_all_dxt_records()
self.mod_read_all_lustre_records()
self.read_all_generic_records(dtype=dtype)
self.read_all_dxt_records(dtype=dtype)
self.mod_read_all_lustre_records(dtype=dtype)
return
def read_all_generic_records(self, counters=True, fcounters=True):
def read_all_generic_records(self, counters=True, fcounters=True, dtype=None):
"""
Read all generic records from darshan log and return as dictionary.
......@@ -267,8 +285,11 @@ class DarshanReport(object):
Return:
None
"""
dtype = dtype if dtype else self.dtype
for mod in self.data['modules']:
self.mod_read_all_records(mod, warnings=False)
self.mod_read_all_records(mod, dtype=dtype, warnings=False)
pass
......@@ -283,16 +304,16 @@ class DarshanReport(object):
Return:
None
"""
dtype = dtype if dtype else self.dtype
for mod in self.data['modules']:
self.mod_read_all_dxt_records(mod, warnings=False, reads=reads, writes=writes, dtype=dtype)
pass
def mod_read_all_records(self, mod, dtype='numpy', warnings=True):
def mod_read_all_records(self, mod, dtype=None, warnings=True):
"""
Reads all generic records for module
......@@ -313,21 +334,23 @@ class DarshanReport(object):
return
self.data['records'][mod] = []
# handling options
dtype = dtype if dtype else self.dtype
self.data['records'][mod] = []
cn = backend.counter_names(mod)
fcn = backend.fcounter_names(mod)
# update module metadata
self.modules[mod]['num_records'] = 0
if mod not in self.counters:
self.counters[mod] = {}
self.counters[mod]['counters'] = cn
self.counters[mod]['fcounters'] = fcn
self.counters[mod]['counters'] = cn
self.counters[mod]['fcounters'] = fcn
# fetch records
rec = backend.log_get_generic_record(self.log, mod, _structdefs[mod], dtype=dtype)
while rec != None:
if dtype == 'pandas':
......@@ -346,7 +369,6 @@ class DarshanReport(object):
if self.lookup_name_records:
self.update_name_records()
# process/combine records if the format dtype allows for this
if dtype == 'pandas':
combined_c = None
......@@ -377,7 +399,7 @@ class DarshanReport(object):
pass
def mod_read_all_dxt_records(self, mod, dtype='numpy', warnings=True, reads=True, writes=True):
def mod_read_all_dxt_records(self, mod, dtype=None, warnings=True, reads=True, writes=True):
"""
Reads all dxt records for provided module.
......@@ -404,14 +426,19 @@ class DarshanReport(object):
return
self.records[mod] = []
self.modules[mod]['num_records'] = 0
# handling options
dtype = dtype if dtype else self.dtype
self.records[mod] = []
# update module metadata
self.modules[mod]['num_records'] = 0
if mod not in self.counters:
self.counters[mod] = {}
# fetch records
rec = backend.log_get_dxt_record(self.log, mod, _structdefs[mod], dtype=dtype)
while rec != None:
if dtype == 'numpy':
......@@ -419,20 +446,20 @@ class DarshanReport(object):
else:
self.records[mod].append(rec)
pass
self.data['modules'][mod]['num_records'] += 1
# fetch next
rec = backend.log_get_dxt_record(self.log, mod, _structdefs[mod], reads=reads, writes=writes, dtype=dtype)
if self.lookup_name_records:
self.update_name_records()
pass
def mod_read_all_lustre_records(self, mod="LUSTRE", dtype='numpy', warnings=True):
def mod_read_all_lustre_records(self, mod="LUSTRE", dtype=None, warnings=True):
"""
Reads all dxt records for provided module.
......@@ -459,14 +486,21 @@ class DarshanReport(object):
return
self.records[mod] = []
self.modules[mod]['num_records'] = 0
# handling options
dtype = dtype if dtype else self.dtype
self.records[mod] = []
cn = backend.counter_names(mod)
# update module metadata
self.modules[mod]['num_records'] = 0
if mod not in self.counters:
self.counters[mod] = {}
self.counters[mod]['counters'] = cn
# fetch records
rec = backend.log_get_record(self.log, mod, dtype=dtype)
while rec != None:
self.records[mod].append(rec)
......@@ -475,12 +509,39 @@ class DarshanReport(object):
# fetch next
rec = backend.log_get_record(self.log, mod, dtype=dtype)
if self.lookup_name_records:
self.update_name_records()
# process/combine records if the format dtype allows for this
if dtype == 'pandas':
combined_c = None
for rec in self.records[mod]:
obj = rec['counters']
#print(type(obj))
#display(obj)
if combined_c is None:
combined_c = rec['counters']
else:
combined_c = pd.concat([combined_c, rec['counters']])
self.records[mod] = [{
'rank': -1,
'id': -1,
'counters': combined_c,
}]
pass
def mod_records(self, mod, dtype='numpy', warnings=True):
def mod_records(self, mod,
dtype='numpy', warnings=True):
"""
Return generator for lazy record loading and traversal.
......@@ -583,6 +644,32 @@ class DarshanReport(object):
#print("Memory:", get_size(self), 'bytes')
###########################################################################
# Internal Organisation
###########################################################################
def rebase_timestamps(records=None, inplace=False, timebase=False):
    """
    Updates all records in the report to use timebase (default: start_time).
    This might allow to conserve memory as reports are merged.

    Args:
        records (dict, list): records to rebase
        inplace (bool): whether to merely return a copy or to update records
        timebase (datetime.datetime): new timebase to use

    Return:
        rebased_records (same type as provided to records)
    """
    # Bug fix: the original deep-copied an undefined name `record` (NameError)
    # and returned `rebased_records` while assigning `rebase_records`.
    # Also honor the documented `inplace` flag: mutate the caller's records
    # directly instead of working on a copy.
    if inplace:
        rebased_records = records
    else:
        rebased_records = copy.deepcopy(records)

    # TODO: apply timestamp rebase
    # TODO: settle on format

    return rebased_records
###########################################################################
# Conversion
###########################################################################
def to_dict(self):
"""
Return dictionary representation of report data.
......@@ -641,3 +728,10 @@ class DarshanReport(object):
pass
return json.dumps(data, cls=DarshanReportJSONEncoder)
@staticmethod
def from_string(string):
    """
    Construct a DarshanReport from a string representation.

    NOTE(review): currently a stub — the `string` argument is ignored and a
    fresh, empty DarshanReport is returned. Presumably intended to become the
    inverse of the report's serialization (e.g. to_json); confirm the expected
    input format before relying on this.

    Args:
        string (str): serialized report data (currently unused)

    Return:
        DarshanReport: a new, empty report instance
    """
    return DarshanReport()
%% Cell type:markdown id: tags:
# DarshanUtils for Python
This notebook gives an overview of features provided by the Python bindings for DarshanUtils.
%% Cell type:markdown id: tags:
By default all records, metadata, available modules and the name records are loaded:
By default all records, metadata, available modules and the name records are loaded when opening a Darshan log:
%% Cell type:code id: tags:
``` python
import darshan
report = darshan.DarshanReport("example-logs/example.darshan", read_all=True) # Default behavior
report.info()
```
%%%% Output: stream
Filename: example-logs/example.darshan
Times: 2017-03-20 10:07:47 to 2017-03-20 10:09:43 (Duration 0:01:56)
Executeable: /global/project/projectdirs/m888/glock/tokio-abc-results/bin.edison/vpicio_uni /scratch2/scratchdirs/glock/tokioabc-s.4478544/vpicio/vpicio.hdf5 32
Processes: 2048
JobID: 4478544
UID: 69615
Modules in Log: ['POSIX', 'MPI-IO', 'LUSTRE', 'STDIO']
Loaded Records: {'POSIX': 1, 'MPI-IO': 1, 'STDIO': 129, 'LUSTRE': 1}
Name Records: 3
Darshan/Hints: {'lib_ver': '3.1.3', 'h': 'romio_no_indep_rw=true;cb_nodes=4'}
DarshanReport: id(140072175381248) (tmp)
DarshanReport: id(140464297220080) (tmp)
%% Cell type:code id: tags:
``` python
report.modules
```
%%%% Output: execute_result
{'POSIX': {'len': 186, 'ver': 3, 'idx': 1, 'num_records': 1},
'MPI-IO': {'len': 154, 'ver': 2, 'idx': 2, 'num_records': 1},
'LUSTRE': {'len': 87, 'ver': 1, 'idx': 7, 'num_records': 1},
'STDIO': {'len': 3234, 'ver': 1, 'idx': 8, 'num_records': 129}}
%% Cell type:markdown id: tags:
A few of the internal data structures explained:
%% Cell type:code id: tags:
``` python
# report.metadata # dictionary with raw metadata from darshan log
# report.modules # dictionary with raw module info from darshan log (need: technical, module idx)
# report.name_records # dictionary for resolving name records: id -> path/name
# report.records # per module "dataframes"/dictionaries holding loaded records
```
%% Cell type:markdown id: tags:
The darshan report holds a variety of namespaces for report related data. All of them are also referenced in `report.data` at the moment, but reliance on this internal organization of the report object is discouraged once the API has stabilized. Currently, `report.data` references the following information:
%% Cell type:code id: tags:
``` python
report.data.keys()
```
%%%% Output: execute_result
dict_keys(['version', 'metadata', 'records', 'summary', 'modules', 'counters', 'name_records', 'mounts'])
%% Cell type:code id: tags:
``` python
report.mod_read_all_records('POSIX')
```
%% Cell type:code id: tags:
``` python
report.mod_read_all_records('STDIO')
```
%% Cell type:code id: tags:
``` python
report.update_name_records()
report.info()
```
%%%% Output: stream
Filename: example-logs/example.darshan
Times: 2017-03-20 10:07:47 to 2017-03-20 10:09:43 (Duration 0:01:56)
Executeable: /global/project/projectdirs/m888/glock/tokio-abc-results/bin.edison/vpicio_uni /scratch2/scratchdirs/glock/tokioabc-s.4478544/vpicio/vpicio.hdf5 32
Processes: 2048
JobID: 4478544
UID: 69615
Modules in Log: ['POSIX', 'MPI-IO', 'LUSTRE', 'STDIO']
Loaded Records: {'POSIX': 1, 'MPI-IO': 1, 'STDIO': 129, 'LUSTRE': 1}
Name Records: 3
Darshan/Hints: {'lib_ver': '3.1.3', 'h': 'romio_no_indep_rw=true;cb_nodes=4'}
DarshanReport: id(140072175381248) (tmp)
%% Cell type:code id: tags:
``` python
# visualization helper used by different examples in the remainder of this notebook
from IPython.display import display, HTML
# usage: display(obj)
```
%% Cell type:markdown id: tags:
### Record Formats and Selectively Loading Records
For memory efficient analysis, it is possible to suppress records from being loaded automatically. This is useful, for example, when analysis considers only records of a particular layer/module.
%% Cell type:code id: tags:
``` python
import darshan
report = darshan.DarshanReport("example-logs/example.darshan", read_all=False, lookup_name_records=True) # Loads no records!
```
%% Cell type:code id: tags:
``` python
# expected to fail, as no records were loaded
try:
print(len(report.records['STDIO']), "records loaded for STDIO.")
except:
print("No STDIO records loaded for this report yet.")
```
%%%% Output: stream
No STDIO records loaded for this report yet.
%% Cell type:markdown id: tags:
Additional records then can be loaded selectively, for example, on a per module basis:
%% Cell type:markdown id: tags:
#### dtype: pandas
%% Cell type:code id: tags:
``` python
report.mod_read_all_records("STDIO", dtype="pandas")
```
%% Cell type:code id: tags:
``` python
print('id', report.records['STDIO'][0]['id'])
print('rank', report.records['STDIO'][0]['rank'])
display(report.records['STDIO'][0]['counters'])
display(report.records['STDIO'][0]['fcounters'])
```
%%%% Output: stream
id -1
rank -1
%%%% Output: display_data
%%%% Output: display_data
%% Cell type:markdown id: tags:
#### dtype: dict
%% Cell type:code id: tags:
``` python
report.mod_read_all_records("STDIO", dtype='dict')
report.records['STDIO'][0]
```
%%%% Output: execute_result
{'id': 15920181672442173319,
'rank': 0,
'counters': {'STDIO_OPENS': 1,
'STDIO_FDOPENS': -1,
'STDIO_READS': 0,
'STDIO_WRITES': 6,
'STDIO_SEEKS': 0,
'STDIO_FLUSHES': 0,
'STDIO_BYTES_WRITTEN': 280,
'STDIO_BYTES_READ': 0,
'STDIO_MAX_BYTE_READ': 0,
'STDIO_MAX_BYTE_WRITTEN': 279,
'STDIO_FASTEST_RANK': 0,
'STDIO_FASTEST_RANK_BYTES': 0,
'STDIO_SLOWEST_RANK': 0,
'STDIO_SLOWEST_RANK_BYTES': 0},
'fcounters': {'STDIO_F_META_TIME': 0.0,
'STDIO_F_WRITE_TIME': 6.794929504394531e-05,
'STDIO_F_READ_TIME': 0.0,
'STDIO_F_OPEN_START_TIMESTAMP': 0.0,
'STDIO_F_CLOSE_START_TIMESTAMP': 0.0,
'STDIO_F_WRITE_START_TIMESTAMP': 0.07752799987792969,
'STDIO_F_READ_START_TIMESTAMP': 0.0,
'STDIO_F_OPEN_END_TIMESTAMP': 0.0,
'STDIO_F_CLOSE_END_TIMESTAMP': 0.0,
'STDIO_F_WRITE_END_TIMESTAMP': 116.28358292579651,
'STDIO_F_READ_END_TIMESTAMP': 0.0,
'STDIO_F_FASTEST_RANK_TIME': 0.0,
'STDIO_F_SLOWEST_RANK_TIME': 0.0,
'STDIO_F_VARIANCE_RANK_TIME': 0.0,
'STDIO_F_VARIANCE_RANK_BYTES': 0.0}}
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
#### dtype: numpy
%% Cell type:code id: tags:
``` python
report.mod_read_all_records("STDIO")
report.records['STDIO'][0]
```
%%%% Output: execute_result
{'id': 15920181672442173319,
'rank': 0,
'counters': array([ 1, -1, 0, 6, 0, 0, 280, 0, 0, 279, 0, 0, 0,
0]),
'fcounters': array([0.00000000e+00, 6.79492950e-05, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 7.75279999e-02, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 1.16283583e+02, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00])}
%% Cell type:markdown id: tags: