darshan-logutils.c 13.5 KB
Newer Older
1
2
3
4
5
/*
 *  (C) 2009 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

6
7
#include <stdio.h>
#include <string.h>
8
#include <assert.h>
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#include "darshan-logutils.h"

/* isn't there a clever c way to avoid this? */
char *darshan_names[] = {
    "CP_INDEP_OPENS",
    "CP_COLL_OPENS",               /* count of MPI collective opens */
    "CP_INDEP_READS",              /* count of independent MPI reads */
    "CP_INDEP_WRITES",             /* count of independent MPI writes */
    "CP_COLL_READS",               /* count of collective MPI reads */
    "CP_COLL_WRITES",              /* count of collective MPI writes */
    "CP_SPLIT_READS",              /* count of split collective MPI reads */
    "CP_SPLIT_WRITES",             /* count of split collective MPI writes */
    "CP_NB_READS",                 /* count of nonblocking MPI reads */
    "CP_NB_WRITES",                /* count of nonblocking MPI writes */
    "CP_SYNCS",                    /* count of MPI_File_sync */
    "CP_POSIX_READS",              /* count of posix reads */
    "CP_POSIX_WRITES",             /* count of posix writes */
    "CP_POSIX_OPENS",              /* count of posix opens */
    "CP_POSIX_SEEKS",              /* count of posix seeks */
    "CP_POSIX_STATS",              /* count of posix stat/lstat/fstats */
    "CP_POSIX_MMAPS",              /* count of posix mmaps */
    "CP_POSIX_FREADS",
    "CP_POSIX_FWRITES",
    "CP_POSIX_FOPENS",
    "CP_POSIX_FSEEKS",
    "CP_POSIX_FSYNCS",
    "CP_POSIX_FDSYNCS",
36
37
38
    "CP_INDEP_NC_OPENS",
    "CP_COLL_NC_OPENS",
    "CP_HDF5_OPENS",
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
    "CP_COMBINER_NAMED",           /* count of each MPI datatype category */
    "CP_COMBINER_DUP",
    "CP_COMBINER_CONTIGUOUS",
    "CP_COMBINER_VECTOR",
    "CP_COMBINER_HVECTOR_INTEGER",
    "CP_COMBINER_HVECTOR",
    "CP_COMBINER_INDEXED",
    "CP_COMBINER_HINDEXED_INTEGER",
    "CP_COMBINER_HINDEXED",
    "CP_COMBINER_INDEXED_BLOCK",
    "CP_COMBINER_STRUCT_INTEGER",
    "CP_COMBINER_STRUCT",
    "CP_COMBINER_SUBARRAY",
    "CP_COMBINER_DARRAY",
    "CP_COMBINER_F90_REAL",
    "CP_COMBINER_F90_COMPLEX",
    "CP_COMBINER_F90_INTEGER",
    "CP_COMBINER_RESIZED",
    "CP_HINTS",                     /* count of MPI hints used */
    "CP_VIEWS",                     /* count of MPI set view calls */
    "CP_MODE",                      /* mode of file */
    "CP_BYTES_READ",                /* total bytes read */
    "CP_BYTES_WRITTEN",             /* total bytes written */
    "CP_MAX_BYTE_READ",             /* highest offset byte read */
    "CP_MAX_BYTE_WRITTEN",          /* highest offset byte written */
    "CP_CONSEC_READS",              /* count of consecutive reads */
    "CP_CONSEC_WRITES",             /* count of consecutive writes */
    "CP_SEQ_READS",                 /* count of sequential reads */
    "CP_SEQ_WRITES",                /* count of sequential writes */
    "CP_RW_SWITCHES",
    "CP_MEM_NOT_ALIGNED",           /* count of accesses not mem aligned */
    "CP_MEM_ALIGNMENT",             /* mem alignment in bytes */
    "CP_FILE_NOT_ALIGNED",          /* count of accesses not file aligned */
    "CP_FILE_ALIGNMENT",            /* file alignment in bytes */
73
74
    "CP_MAX_READ_TIME_SIZE",
    "CP_MAX_WRITE_TIME_SIZE",
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
    "CP_SIZE_READ_0_100",           /* count of posix read size ranges */
    "CP_SIZE_READ_100_1K",
    "CP_SIZE_READ_1K_10K",
    "CP_SIZE_READ_10K_100K",
    "CP_SIZE_READ_100K_1M",
    "CP_SIZE_READ_1M_4M",
    "CP_SIZE_READ_4M_10M",
    "CP_SIZE_READ_10M_100M",
    "CP_SIZE_READ_100M_1G",
    "CP_SIZE_READ_1G_PLUS",
    "CP_SIZE_WRITE_0_100",          /* count of posix write size ranges */
    "CP_SIZE_WRITE_100_1K",
    "CP_SIZE_WRITE_1K_10K",
    "CP_SIZE_WRITE_10K_100K",
    "CP_SIZE_WRITE_100K_1M",
    "CP_SIZE_WRITE_1M_4M",
    "CP_SIZE_WRITE_4M_10M",
    "CP_SIZE_WRITE_10M_100M",
    "CP_SIZE_WRITE_100M_1G",
    "CP_SIZE_WRITE_1G_PLUS",
    "CP_SIZE_READ_AGG_0_100",       /* count of MPI read size ranges */
    "CP_SIZE_READ_AGG_100_1K",
    "CP_SIZE_READ_AGG_1K_10K",
    "CP_SIZE_READ_AGG_10K_100K",
    "CP_SIZE_READ_AGG_100K_1M",
    "CP_SIZE_READ_AGG_1M_4M",
    "CP_SIZE_READ_AGG_4M_10M",
    "CP_SIZE_READ_AGG_10M_100M",
    "CP_SIZE_READ_AGG_100M_1G",
    "CP_SIZE_READ_AGG_1G_PLUS",
    "CP_SIZE_WRITE_AGG_0_100",      /* count of MPI write size ranges */
    "CP_SIZE_WRITE_AGG_100_1K",
    "CP_SIZE_WRITE_AGG_1K_10K",
    "CP_SIZE_WRITE_AGG_10K_100K",
    "CP_SIZE_WRITE_AGG_100K_1M",
    "CP_SIZE_WRITE_AGG_1M_4M",
    "CP_SIZE_WRITE_AGG_4M_10M",
    "CP_SIZE_WRITE_AGG_10M_100M",
    "CP_SIZE_WRITE_AGG_100M_1G",
    "CP_SIZE_WRITE_AGG_1G_PLUS",
    "CP_EXTENT_READ_0_100",          /* count of MPI read extent ranges */
    "CP_EXTENT_READ_100_1K",
    "CP_EXTENT_READ_1K_10K",
    "CP_EXTENT_READ_10K_100K",
    "CP_EXTENT_READ_100K_1M",
    "CP_EXTENT_READ_1M_4M",
    "CP_EXTENT_READ_4M_10M",
    "CP_EXTENT_READ_10M_100M",
    "CP_EXTENT_READ_100M_1G",
    "CP_EXTENT_READ_1G_PLUS",
    "CP_EXTENT_WRITE_0_100",         /* count of MPI write extent ranges */
    "CP_EXTENT_WRITE_100_1K",
    "CP_EXTENT_WRITE_1K_10K",
    "CP_EXTENT_WRITE_10K_100K",
    "CP_EXTENT_WRITE_100K_1M",
    "CP_EXTENT_WRITE_1M_4M",
    "CP_EXTENT_WRITE_4M_10M",
    "CP_EXTENT_WRITE_10M_100M",
    "CP_EXTENT_WRITE_100M_1G",
    "CP_EXTENT_WRITE_1G_PLUS",
    "CP_STRIDE1_STRIDE",             /* the four most frequently appearing strides */
    "CP_STRIDE2_STRIDE",
    "CP_STRIDE3_STRIDE",
    "CP_STRIDE4_STRIDE",
    "CP_STRIDE1_COUNT",              /* count of each of the most frequent strides */
    "CP_STRIDE2_COUNT",
    "CP_STRIDE3_COUNT",
    "CP_STRIDE4_COUNT",
    "CP_ACCESS1_ACCESS",
    "CP_ACCESS2_ACCESS",
    "CP_ACCESS3_ACCESS",
    "CP_ACCESS4_ACCESS",
    "CP_ACCESS1_COUNT",
    "CP_ACCESS2_COUNT",
    "CP_ACCESS3_COUNT",
    "CP_ACCESS4_COUNT",

    "CP_NUM_INDICES"
};

/* isn't there a clever c way to avoid this? */
char *darshan_f_names[] = {
    "CP_F_OPEN_TIMESTAMP",        /* timestamp of first open */
    "CP_F_READ_START_TIMESTAMP",  /* timestamp of first read */
    "CP_F_WRITE_START_TIMESTAMP", /* timestamp of first write */
    "CP_F_CLOSE_TIMESTAMP",       /* timestamp of last close */
    "CP_F_READ_END_TIMESTAMP",    /* timestamp of last read */
    "CP_F_WRITE_END_TIMESTAMP",   /* timestamp of last write */
    "CP_F_POSIX_READ_TIME",       /* cumulative posix read time */
    "CP_F_POSIX_WRITE_TIME",      /* cumulative posix write time */
    "CP_F_POSIX_META_TIME",       /* cumulative posix meta time */
    "CP_F_MPI_META_TIME",         /* cumulative mpi-io metadata time */
    "CP_F_MPI_READ_TIME",         /* cumulative mpi-io read time */
    "CP_F_MPI_WRITE_TIME",        /* cumulative mpi-io write time */
169
170
    "CP_F_MAX_READ_TIME",
    "CP_F_MAX_WRITE_TIME",
171
172
173
    "CP_F_NUM_INDICES",
};

174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
/*******************************
 * version 1.21 to 1.22 differences 
 * - added:
 *   - CP_INDEP_NC_OPENS
 *   - CP_COLL_NC_OPENS
 *   - CP_HDF5_OPENS
 *   - CP_MAX_READ_TIME_SIZE
 *   - CP_MAX_WRITE_TIME_SIZE
 *   - CP_F_MAX_READ_TIME
 *   - CP_F_MAX_WRITE_TIME
 * - changed params:
 *   - CP_FILE_RECORD_SIZE: 1184 to 1240
 *   - CP_NUM_INDICES: 133 to 138
 *   - CP_F_NUM_INDICES: 12 to 14
 * - so 56 bytes worth of new indices are the only difference
 */
#define CP_NUM_INDICES_1_21 133
#define CP_F_NUM_INDICES_1_21 12
struct darshan_file_1_21
{
    uint64_t hash;
    int rank;
    int64_t counters[CP_NUM_INDICES_1_21];
    double fcounters[CP_F_NUM_INDICES_1_21];
    char name_suffix[CP_NAME_SUFFIX_LEN+1];
};

static void shift_missing_1_21(struct darshan_file* file);

203
/* a rather crude API for accessing raw binary darshan files */
204
darshan_fd darshan_log_open(const char *name)
205
206
207
208
{
    return gzopen(name, "r");
}

209
/* darshan_log_getjob()
210
211
212
 *
 * returns 0 on success, -1 on failure
 */
213
int darshan_log_getjob(darshan_fd file, struct darshan_job *job)
214
215
{
    int ret;
216
217
218

    gzseek(file, 0, SEEK_SET);

219
220
221
222
223
224
225
226
227
228
    ret = gzread(file, job, sizeof(*job));
    if (ret < sizeof(*job))
    {
        if(gzeof(file))
        {
            fprintf(stderr, "Error: invalid log file (too short).\n");
        }
        perror("darshan_job_init");
        return(-1);
    }
229
230
231
232
    if(strcmp(job->version_string, "1.21"))
        return(0);
    if(strcmp(job->version_string, "1.22"))
        return(0);
233

234
235
236
237
    fprintf(stderr, "Error: incompatible darshan file.\n");
    fprintf(stderr, "Error: expected version %s, but got %s\n", 
            CP_VERSION, job->version_string);
    return(-1);
238
239
}

240
/* darshan_log_getfile()
241
242
243
 *
 * return 1 if file record found, 0 on eof, and -1 on error
 */
244
int darshan_log_getfile(darshan_fd fd, struct darshan_job *job, struct darshan_file *file)
245
{
246
247
    int ret;
    const char* err_string;
248
    struct darshan_file_1_21 file_1_21;
249
250
251
252
253
    
    if(gztell(file) < CP_JOB_RECORD_SIZE)
        gzseek(file, CP_JOB_RECORD_SIZE, SEEK_SET);

    gzseek(file, sizeof(struct darshan_job), SEEK_SET);
254
255
256
257
258
259

    /* reset file record, so that diff compares against a zero'd out record
     * if file is missing
     */
    memset(file, 0, sizeof(&file));

260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
    if(strcmp(job->version_string, "1.21") == 0)
    {
        ret = gzread(fd, &file_1_21, sizeof(file_1_21));
        if(ret == sizeof(file_1_21))
        {
            /* convert to new file record structure */
            file->hash = file_1_21.hash;
            file->rank = file_1_21.rank;
            strcpy(file->name_suffix, file_1_21.name_suffix);
            memcpy(file->counters, file_1_21.counters,
                (CP_NUM_INDICES_1_21*sizeof(int64_t)));
            memcpy(file->fcounters, file_1_21.fcounters,
                (CP_F_NUM_INDICES_1_21*sizeof(double)));
            shift_missing_1_21(file);

            /* got exactly one, correct size record */
            return(1);
        }
    }
    else if(strcmp(job->version_string, "1.22") == 0)
    {
        /* make sure this is the current version */
        assert(strcmp("1.22", CP_VERSION) == 0);

        ret = gzread(fd, file, sizeof(*file));
        if(ret == sizeof(*file))
        {
            /* got exactly one, correct size record */
            return(1);
        }
    }
    else
292
    {
293
294
        fprintf(stderr, "Error: invalid log file version.\n");
        return(-1);
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
    }

    if(ret > 0)
    {
        /* got a short read */
        fprintf(stderr, "Error: invalid file record (too small)\n");
        return(-1);
    }

    if(ret == 0 && gzeof(fd))
    {
        /* hit end of file */
        return(0);
    }

    /* all other errors */
    err_string = gzerror(fd, &ret);
    fprintf(stderr, "Error: %s\n", err_string);
    return(-1);
314
315
}

316
int darshan_log_getexe(darshan_fd fd, char *buf, int *flag)
317
318
{
    int ret;
319
    char* newline;
320

321
322
    gzseek(fd, sizeof(struct darshan_job), SEEK_SET);

323
324
325
326
327
328
    ret = gzread(fd, buf, (CP_EXE_LEN + 1));
    if (ret < (CP_EXE_LEN + 1))
    {
        perror("gzread");
        return(-1);
    }
329
330
331
332
333
    if (gzeof(fd))
        *flag = 1;
    else
        *flag = 0;

334
335
336
337
338
339
340
341
    /* this call is only supposed to return the exe string, but starting in
     * log format 1.23 there could be a table of mount entry information
     * after the exe.  Look for newline character and truncate there.
     */
    newline = strchr(buf, '\n');
    if(newline)
        *newline = '\0';

342
343
344
    return (0);
}

345
void darshan_log_close(darshan_fd file)
346
347
348
349
{
    gzclose(file);
}

350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
/* darshan_log_print_version_warnings()
 *
 * Print summary of any problems with the detected log format
 */
void darshan_log_print_version_warnings(struct darshan_job *job)
{
    if(strcmp(job->version_string, "1.22") == 0)
    {
        /* nothing to do, this is the current version */
        return;
    }

    if(strcmp(job->version_string, "1.21") == 0)
    {
        printf("# WARNING: version 1.21 log format does not support the following parameters:\n");
        printf("#   CP_INDEP_NC_OPENS\n");
        printf("#   CP_COLL_NC_OPENS\n");
        printf("#   CP_HDF5_OPENS\n");
        printf("#   CP_MAX_READ_TIME_SIZE\n");
        printf("#   CP_MAX_WRITE_TIME_SIZE\n");
        printf("#   CP_F_MAX_READ_TIME\n");
        printf("#   CP_F_MAX_WRITE_TIME\n");
372
        printf("#\n");
373
374
375
376
377
378
379
380
        return;
    }

    fprintf(stderr, "Error: version %s not supported by parser.\n",
        job->version_string);
    return;
}

381
382
383
384
/* shift_missing_1_21()
 *
 * translates indices to account for counters that weren't present in log
 * format 1.21
385
 */
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
static void shift_missing_1_21(struct darshan_file* file)
{
    int c_index = 0;
    int missing_counters[] = {
        CP_INDEP_NC_OPENS,
        CP_COLL_NC_OPENS,
        CP_HDF5_OPENS,
        CP_MAX_READ_TIME_SIZE,
        CP_MAX_WRITE_TIME_SIZE,
        -1};
    int missing_f_counters[] = {
        CP_F_MAX_READ_TIME,
        CP_F_MAX_WRITE_TIME,
        -1};

    c_index = 0;
    while(missing_counters[c_index] != -1)
    {
        int missing_counter = missing_counters[c_index];
        c_index++;
        if(missing_counter < (CP_NUM_INDICES - 1))
        {
            /* shift down */
            memmove(&file->counters[missing_counter+1],
                &file->counters[missing_counter],
                (CP_NUM_INDICES-missing_counter-1)*sizeof(int64_t));
        }
        /* zero out missing counter */
        file->counters[missing_counter] = 0;
    }

    c_index = 0;
    while(missing_f_counters[c_index] != -1)
    {
        int missing_counter = missing_f_counters[c_index];
        c_index++;
        if(missing_counter < (CP_F_NUM_INDICES - 1))
        {
            /* shift down */
            memmove(&file->fcounters[missing_counter+1],
                &file->fcounters[missing_counter],
                (CP_F_NUM_INDICES-missing_counter-1)*sizeof(double));
        }
        /* zero out missing counter */
        file->fcounters[missing_counter] = 0;
    }

    return;
}