darshan-mpi-io.c 72.9 KB
Newer Older
1
2
3
4
5
/*
 *  (C) 2009 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

6
7
8
#define _XOPEN_SOURCE 500
#define _GNU_SOURCE /* for tdestroy() */

9
10
#include "darshan-config.h"

11
#include <stdio.h>
12
#ifdef HAVE_MNTENT_H
13
#include <mntent.h>
14
#endif
15
16
17
18
19
20
21
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/types.h>
22
#include <sys/stat.h>
23
24
25
26
27
28
29
#include <zlib.h>
#include <assert.h>
#include <search.h>

#include "mpi.h"
#include "darshan.h"

30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#ifdef DARSHAN_PRELOAD
#include <dlfcn.h>
#include <stdlib.h>

/* In DARSHAN_PRELOAD builds every PMPI routine is reached through a
 * function pointer named __real_<func>; this yields that pointer so it can
 * be called like the function itself.
 */
#define DARSHAN_MPI_CALL(func) __real_ ## func

/* Define the __real_<name> function pointer for a PMPI routine with return
 * type `ret` and parenthesized parameter list `args`; it starts out NULL
 * until MAP_OR_FAIL() fills it in.
 */
#define DARSHAN_FORWARD_DECL(name,ret,args) \
  ret (*__real_ ## name)args = NULL;

/* Look up the next definition of `func` with dlsym(RTLD_NEXT, ...) and store
 * it in __real_<func>.  A failed lookup is reported on stderr and leaves the
 * pointer NULL.  Wrapped in do/while(0) so the macro acts as one statement
 * (safe inside an unbraced if/else).
 */
#define MAP_OR_FAIL(func) \
    do { \
        __real_ ## func = dlsym(RTLD_NEXT, #func); \
        if (!(__real_ ## func)) { \
            fprintf(stderr, "Darshan failed to map symbol: %s\n", #func); \
        } \
    } while(0)

/*
 * __real_ function pointers for every PMPI routine reached through
 * DARSHAN_MPI_CALL() in preload builds.  Each one is filled in by
 * resolve_mpi_symbols().  Routines that take no arguments are declared with
 * (void) so that calls through the pointer are prototype-checked.
 */

/* MPI-IO routines */
DARSHAN_FORWARD_DECL(PMPI_File_close, int, (MPI_File *fh));
DARSHAN_FORWARD_DECL(PMPI_File_set_size, int, (MPI_File fh, MPI_Offset size));
DARSHAN_FORWARD_DECL(PMPI_File_iread_at, int, (MPI_File fh, MPI_Offset offset, void *buf, int count, MPI_Datatype datatype, __D_MPI_REQUEST *request));
DARSHAN_FORWARD_DECL(PMPI_File_iread, int, (MPI_File fh, void  *buf, int  count, MPI_Datatype  datatype, __D_MPI_REQUEST  *request));
DARSHAN_FORWARD_DECL(PMPI_File_iread_shared, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype, __D_MPI_REQUEST *request));
DARSHAN_FORWARD_DECL(PMPI_File_iwrite_at, int, (MPI_File fh, MPI_Offset offset, void *buf, int count, MPI_Datatype datatype, __D_MPI_REQUEST *request));
DARSHAN_FORWARD_DECL(PMPI_File_iwrite, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype, __D_MPI_REQUEST *request));
DARSHAN_FORWARD_DECL(PMPI_File_iwrite_shared, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype, __D_MPI_REQUEST *request));
DARSHAN_FORWARD_DECL(PMPI_File_open, int, (MPI_Comm comm, char *filename, int amode, MPI_Info info, MPI_File *fh));
DARSHAN_FORWARD_DECL(PMPI_File_read_all_begin, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype));
DARSHAN_FORWARD_DECL(PMPI_File_read_all, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype, MPI_Status *status));
DARSHAN_FORWARD_DECL(PMPI_File_read_at_all, int, (MPI_File fh, MPI_Offset offset, void *buf, int count, MPI_Datatype datatype, MPI_Status *status));
DARSHAN_FORWARD_DECL(PMPI_File_read_at_all_begin, int, (MPI_File fh, MPI_Offset offset, void *buf, int count, MPI_Datatype datatype));
DARSHAN_FORWARD_DECL(PMPI_File_read_at, int, (MPI_File fh, MPI_Offset offset, void *buf, int count, MPI_Datatype datatype, MPI_Status *status));
DARSHAN_FORWARD_DECL(PMPI_File_read, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype, MPI_Status *status));
DARSHAN_FORWARD_DECL(PMPI_File_read_ordered_begin, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype));
DARSHAN_FORWARD_DECL(PMPI_File_read_ordered, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype, MPI_Status *status));
DARSHAN_FORWARD_DECL(PMPI_File_read_shared, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype, MPI_Status *status));
DARSHAN_FORWARD_DECL(PMPI_File_set_view, int, (MPI_File fh, MPI_Offset disp, MPI_Datatype etype, MPI_Datatype filetype, char *datarep, MPI_Info info));
DARSHAN_FORWARD_DECL(PMPI_File_sync, int, (MPI_File fh));
DARSHAN_FORWARD_DECL(PMPI_File_write_all_begin, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype));
DARSHAN_FORWARD_DECL(PMPI_File_write_all, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype, MPI_Status *status));
DARSHAN_FORWARD_DECL(PMPI_File_write_at_all_begin, int, (MPI_File fh, MPI_Offset offset, void *buf, int count, MPI_Datatype datatype));
DARSHAN_FORWARD_DECL(PMPI_File_write_at_all, int, (MPI_File fh, MPI_Offset offset, void *buf, int count, MPI_Datatype datatype, MPI_Status *status));
DARSHAN_FORWARD_DECL(PMPI_File_write_at, int, (MPI_File fh, MPI_Offset offset, void *buf, int count, MPI_Datatype datatype, MPI_Status *status));
DARSHAN_FORWARD_DECL(PMPI_File_write, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype, MPI_Status *status));
DARSHAN_FORWARD_DECL(PMPI_File_write_ordered_begin, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype));
DARSHAN_FORWARD_DECL(PMPI_File_write_ordered, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype, MPI_Status *status));
DARSHAN_FORWARD_DECL(PMPI_File_write_shared, int, (MPI_File fh, void *buf, int count, MPI_Datatype datatype, MPI_Status *status));

/* initialization / finalization */
DARSHAN_FORWARD_DECL(PMPI_Finalize, int, (void));
DARSHAN_FORWARD_DECL(PMPI_Init, int, (int *argc, char ***argv));
DARSHAN_FORWARD_DECL(PMPI_Init_thread, int, (int *argc, char ***argv, int required, int *provided));

/* routines darshan calls for its own bookkeeping */
DARSHAN_FORWARD_DECL(PMPI_Wtime, double, (void));
DARSHAN_FORWARD_DECL(PMPI_Allreduce, int, (void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm));
DARSHAN_FORWARD_DECL(PMPI_Bcast, int, (void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm));
DARSHAN_FORWARD_DECL(PMPI_Comm_rank, int, (MPI_Comm comm, int *rank));
DARSHAN_FORWARD_DECL(PMPI_Comm_size, int, (MPI_Comm comm, int *size));
DARSHAN_FORWARD_DECL(PMPI_Scan, int, (void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm));
DARSHAN_FORWARD_DECL(PMPI_Type_commit, int, (MPI_Datatype *datatype));
DARSHAN_FORWARD_DECL(PMPI_Type_contiguous, int, (int count, MPI_Datatype oldtype, MPI_Datatype *newtype));
DARSHAN_FORWARD_DECL(PMPI_Type_extent, int, (MPI_Datatype datatype, MPI_Aint *extent));
DARSHAN_FORWARD_DECL(PMPI_Type_free, int, (MPI_Datatype *datatype));
DARSHAN_FORWARD_DECL(PMPI_Type_hindexed, int, (int count, int *array_of_blocklengths, MPI_Aint *array_of_displacements, MPI_Datatype oldtype, MPI_Datatype *newtype));
DARSHAN_FORWARD_DECL(PMPI_Op_create, int, (MPI_User_function *function, int commute, MPI_Op *op));
DARSHAN_FORWARD_DECL(PMPI_Op_free, int, (MPI_Op *op));
DARSHAN_FORWARD_DECL(PMPI_Reduce, int, (void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm));
DARSHAN_FORWARD_DECL(PMPI_Type_get_envelope, int, (MPI_Datatype datatype, int *num_integers, int *num_addresses, int *num_datatypes, int *combiner));
DARSHAN_FORWARD_DECL(PMPI_Type_size, int, (MPI_Datatype datatype, int *size));

/* Populate every __real_PMPI_* function pointer by looking the symbol up
 * with MAP_OR_FAIL().  A symbol that cannot be found is reported on stderr
 * by the macro and its pointer is left NULL; this function itself does not
 * stop or report failure to the caller.
 */
void resolve_mpi_symbols (void)
{
    /*
     * Overloaded functions
     */
    MAP_OR_FAIL(PMPI_File_close);
    MAP_OR_FAIL(PMPI_File_set_size);
    MAP_OR_FAIL(PMPI_File_iread_at);
    MAP_OR_FAIL(PMPI_File_iread);
    MAP_OR_FAIL(PMPI_File_iread_shared);
    MAP_OR_FAIL(PMPI_File_iwrite_at);
    MAP_OR_FAIL(PMPI_File_iwrite);
    MAP_OR_FAIL(PMPI_File_iwrite_shared);
    MAP_OR_FAIL(PMPI_File_open);
    MAP_OR_FAIL(PMPI_File_read_all_begin);
    MAP_OR_FAIL(PMPI_File_read_all);
    MAP_OR_FAIL(PMPI_File_read_at_all_begin);
    MAP_OR_FAIL(PMPI_File_read_at_all);
    MAP_OR_FAIL(PMPI_File_read_at);
    MAP_OR_FAIL(PMPI_File_read);
    MAP_OR_FAIL(PMPI_File_read_ordered_begin);
    MAP_OR_FAIL(PMPI_File_read_ordered);
    MAP_OR_FAIL(PMPI_File_read_shared);
    MAP_OR_FAIL(PMPI_File_set_view);
    MAP_OR_FAIL(PMPI_File_sync);
    MAP_OR_FAIL(PMPI_File_write_all_begin);
    MAP_OR_FAIL(PMPI_File_write_all);
    MAP_OR_FAIL(PMPI_File_write_at_all_begin);
    MAP_OR_FAIL(PMPI_File_write_at_all);
    MAP_OR_FAIL(PMPI_File_write_at);
    MAP_OR_FAIL(PMPI_File_write);
    MAP_OR_FAIL(PMPI_File_write_ordered_begin);
    MAP_OR_FAIL(PMPI_File_write_ordered);
    MAP_OR_FAIL(PMPI_File_write_shared);
    MAP_OR_FAIL(PMPI_Finalize);
    MAP_OR_FAIL(PMPI_Init);
    MAP_OR_FAIL(PMPI_Init_thread);

    /*
     * These functions are not intercepted but are used
     * by darshan itself.
     */
    MAP_OR_FAIL(PMPI_Wtime);
    MAP_OR_FAIL(PMPI_Allreduce);
    MAP_OR_FAIL(PMPI_Bcast);
    MAP_OR_FAIL(PMPI_Comm_rank);
    MAP_OR_FAIL(PMPI_Comm_size);
    MAP_OR_FAIL(PMPI_Scan);
    MAP_OR_FAIL(PMPI_Type_commit);
    MAP_OR_FAIL(PMPI_Type_contiguous);
    MAP_OR_FAIL(PMPI_Type_extent);
    MAP_OR_FAIL(PMPI_Type_free);
    MAP_OR_FAIL(PMPI_Type_size);
    MAP_OR_FAIL(PMPI_Type_hindexed);
    MAP_OR_FAIL(PMPI_Op_create);
    MAP_OR_FAIL(PMPI_Op_free);
    MAP_OR_FAIL(PMPI_Reduce);
    MAP_OR_FAIL(PMPI_Type_get_envelope);

    return;
}

#else

#define DARSHAN_MPI_CALL(func) func

#endif

163
164
165
166
167
/* program name string supplied by the C runtime; used when naming the log */
extern char *__progname;

/* upper bound on how many separate memory segments one process contributes
 * to the log (sizes the lengths[] / pointers[] arrays in darshan_shutdown)
 */
#define CP_MAX_MEM_SEGMENTS 8

168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
/* Older MPI headers may lack some MPI_COMBINER_* macros.  Any that is not
 * defined as a macro is mapped onto the corresponding CP_COMBINER_* value,
 * an arbitrary stand-in whose only purpose is to let the case labels in
 * CP_DATATYPE_INC() compile.
 */
#if !defined(MPI_COMBINER_NAMED)
#define MPI_COMBINER_NAMED CP_COMBINER_NAMED
#endif
#if !defined(MPI_COMBINER_DUP)
#define MPI_COMBINER_DUP CP_COMBINER_DUP
#endif
#if !defined(MPI_COMBINER_CONTIGUOUS)
#define MPI_COMBINER_CONTIGUOUS CP_COMBINER_CONTIGUOUS
#endif
#if !defined(MPI_COMBINER_VECTOR)
#define MPI_COMBINER_VECTOR CP_COMBINER_VECTOR
#endif
#if !defined(MPI_COMBINER_HVECTOR_INTEGER)
#define MPI_COMBINER_HVECTOR_INTEGER CP_COMBINER_HVECTOR_INTEGER
#endif
#if !defined(MPI_COMBINER_HVECTOR)
#define MPI_COMBINER_HVECTOR CP_COMBINER_HVECTOR
#endif
#if !defined(MPI_COMBINER_INDEXED)
#define MPI_COMBINER_INDEXED CP_COMBINER_INDEXED
#endif
#if !defined(MPI_COMBINER_HINDEXED_INTEGER)
#define MPI_COMBINER_HINDEXED_INTEGER CP_COMBINER_HINDEXED_INTEGER
#endif
#if !defined(MPI_COMBINER_HINDEXED)
#define MPI_COMBINER_HINDEXED CP_COMBINER_HINDEXED
#endif
#if !defined(MPI_COMBINER_INDEXED_BLOCK)
#define MPI_COMBINER_INDEXED_BLOCK CP_COMBINER_INDEXED_BLOCK
#endif
#if !defined(MPI_COMBINER_STRUCT_INTEGER)
#define MPI_COMBINER_STRUCT_INTEGER CP_COMBINER_STRUCT_INTEGER
#endif
#if !defined(MPI_COMBINER_STRUCT)
#define MPI_COMBINER_STRUCT CP_COMBINER_STRUCT
#endif
#if !defined(MPI_COMBINER_SUBARRAY)
#define MPI_COMBINER_SUBARRAY CP_COMBINER_SUBARRAY
#endif
#if !defined(MPI_COMBINER_DARRAY)
#define MPI_COMBINER_DARRAY CP_COMBINER_DARRAY
#endif
#if !defined(MPI_COMBINER_F90_REAL)
#define MPI_COMBINER_F90_REAL CP_COMBINER_F90_REAL
#endif
#if !defined(MPI_COMBINER_F90_COMPLEX)
#define MPI_COMBINER_F90_COMPLEX CP_COMBINER_F90_COMPLEX
#endif
#if !defined(MPI_COMBINER_F90_INTEGER)
#define MPI_COMBINER_F90_INTEGER CP_COMBINER_F90_INTEGER
#endif
#if !defined(MPI_COMBINER_RESIZED)
#define MPI_COMBINER_RESIZED CP_COMBINER_RESIZED
#endif

227
228
/* Ask MPI for the type envelope of __datatype and, if the query succeeds,
 * add one to the CP_COMBINER_* counter of __file that corresponds to the
 * reported combiner.  A failed query or a combiner with no case below
 * leaves the counters untouched.
 */
#define CP_DATATYPE_INC(__file, __datatype) do {\
    int num_integers, num_addresses, num_datatypes, combiner, ret; \
    ret = DARSHAN_MPI_CALL(PMPI_Type_get_envelope)(__datatype, &num_integers, \
        &num_addresses, &num_datatypes, &combiner); \
    if(ret == MPI_SUCCESS) { \
        switch(combiner) { \
            case MPI_COMBINER_NAMED:\
                CP_INC(__file,CP_COMBINER_NAMED,1); break; \
            case MPI_COMBINER_DUP:\
                CP_INC(__file,CP_COMBINER_DUP,1); break; \
            case MPI_COMBINER_CONTIGUOUS:\
                CP_INC(__file,CP_COMBINER_CONTIGUOUS,1); break; \
            case MPI_COMBINER_VECTOR:\
                CP_INC(__file,CP_COMBINER_VECTOR,1); break; \
            case MPI_COMBINER_HVECTOR_INTEGER:\
                CP_INC(__file,CP_COMBINER_HVECTOR_INTEGER,1); break; \
            case MPI_COMBINER_HVECTOR:\
                CP_INC(__file,CP_COMBINER_HVECTOR,1); break; \
            case MPI_COMBINER_INDEXED:\
                CP_INC(__file,CP_COMBINER_INDEXED,1); break; \
            case MPI_COMBINER_HINDEXED_INTEGER:\
                CP_INC(__file,CP_COMBINER_HINDEXED_INTEGER,1); break; \
            case MPI_COMBINER_HINDEXED:\
                CP_INC(__file,CP_COMBINER_HINDEXED,1); break; \
            case MPI_COMBINER_INDEXED_BLOCK:\
                CP_INC(__file,CP_COMBINER_INDEXED_BLOCK,1); break; \
            case MPI_COMBINER_STRUCT_INTEGER:\
                CP_INC(__file,CP_COMBINER_STRUCT_INTEGER,1); break; \
            case MPI_COMBINER_STRUCT:\
                CP_INC(__file,CP_COMBINER_STRUCT,1); break; \
            case MPI_COMBINER_SUBARRAY:\
                CP_INC(__file,CP_COMBINER_SUBARRAY,1); break; \
            case MPI_COMBINER_DARRAY:\
                CP_INC(__file,CP_COMBINER_DARRAY,1); break; \
            case MPI_COMBINER_F90_REAL:\
                CP_INC(__file,CP_COMBINER_F90_REAL,1); break; \
            case MPI_COMBINER_F90_COMPLEX:\
                CP_INC(__file,CP_COMBINER_F90_COMPLEX,1); break; \
            case MPI_COMBINER_F90_INTEGER:\
                CP_INC(__file,CP_COMBINER_F90_INTEGER,1); break; \
            case MPI_COMBINER_RESIZED:\
                CP_INC(__file,CP_COMBINER_RESIZED,1); break; \
            default:\
                break; \
        } \
    } \
} while(0)

/* Account for one completed MPI-IO write call.
 *
 *   __ret       return code of the MPI call; anything but MPI_SUCCESS is ignored
 *   __fh        MPI file handle, used to find the darshan file record
 *   __count     element count passed to the MPI call
 *   __datatype  MPI datatype passed to the MPI call
 *   __counter   counter to increment by one for this kind of write
 *   __tm1/__tm2 timestamps taken before/after the MPI call
 *
 * Does nothing when the handle has no darshan record.  Otherwise it updates
 * the size and extent histograms, the combiner counters, the cumulative MPI
 * write time, and the first-write / last-write timestamps.
 */
#define CP_RECORD_MPI_WRITE(__ret, __fh, __count, __datatype, __counter, __tm1, __tm2) do { \
    struct darshan_file_runtime* file; \
    int size = 0; \
    MPI_Aint extent = 0; \
    if((__ret) != MPI_SUCCESS) break; \
    file = darshan_file_by_fh(__fh); \
    if(!file) break; \
    DARSHAN_MPI_CALL(PMPI_Type_size)(__datatype, &size);  \
    size = size * (__count); \
    DARSHAN_MPI_CALL(PMPI_Type_extent)(__datatype, &extent); \
    CP_BUCKET_INC(file, CP_SIZE_WRITE_AGG_0_100, size); \
    CP_BUCKET_INC(file, CP_EXTENT_WRITE_0_100, extent); \
    CP_INC(file, __counter, 1); \
    CP_DATATYPE_INC(file, __datatype); \
    CP_F_INC(file, CP_F_MPI_WRITE_TIME, ((__tm2)-(__tm1))); \
    if(CP_F_VALUE(file, CP_F_WRITE_START_TIMESTAMP) == 0) { \
        CP_F_SET(file, CP_F_WRITE_START_TIMESTAMP, __tm1); \
    } \
    CP_F_SET(file, CP_F_WRITE_END_TIMESTAMP, __tm2); \
} while(0)

/* Account for one completed MPI-IO read call.
 *
 *   __ret       return code of the MPI call; anything but MPI_SUCCESS is ignored
 *   __fh        MPI file handle, used to find the darshan file record
 *   __count     element count passed to the MPI call
 *   __datatype  MPI datatype passed to the MPI call
 *   __counter   counter to increment by one for this kind of read
 *   __tm1/__tm2 timestamps taken before/after the MPI call
 *
 * Does nothing when the handle has no darshan record.  Otherwise it updates
 * the size and extent histograms, the combiner counters, the cumulative MPI
 * read time, and the first-read / last-read timestamps.
 */
#define CP_RECORD_MPI_READ(__ret, __fh, __count, __datatype, __counter, __tm1, __tm2) do { \
    struct darshan_file_runtime* file; \
    int size = 0; \
    MPI_Aint extent = 0; \
    if((__ret) != MPI_SUCCESS) break; \
    file = darshan_file_by_fh(__fh); \
    if(!file) break; \
    DARSHAN_MPI_CALL(PMPI_Type_size)(__datatype, &size);  \
    size = size * (__count); \
    DARSHAN_MPI_CALL(PMPI_Type_extent)(__datatype, &extent); \
    CP_BUCKET_INC(file, CP_SIZE_READ_AGG_0_100, size); \
    CP_BUCKET_INC(file, CP_EXTENT_READ_0_100, extent); \
    CP_INC(file, __counter, 1); \
    CP_DATATYPE_INC(file, __datatype); \
    CP_F_INC(file, CP_F_MPI_READ_TIME, ((__tm2)-(__tm1))); \
    if(CP_F_VALUE(file, CP_F_READ_START_TIMESTAMP) == 0) { \
        CP_F_SET(file, CP_F_READ_START_TIMESTAMP, __tm1); \
    } \
    CP_F_SET(file, CP_F_READ_END_TIMESTAMP, __tm2); \
} while(0)

static struct darshan_file_runtime* darshan_file_by_fh(MPI_File fh);
314
315
316
static void cp_log_construct_indices(struct darshan_job_runtime* final_job,
    int rank, int* inout_count, int* lengths, void** pointers, char*
    trailing_data);
317
static int cp_log_write(struct darshan_job_runtime* final_job, int rank, 
318
    char* logfile_name, int count, int* lengths, void** pointers, double start_log_time);
319
static int cp_log_reduction(struct darshan_job_runtime* final_job, int rank, 
320
    char* logfile_name, MPI_Offset* next_offset);
321
322
323
324
325
326
static void darshan_file_reduce(void* infile_v, 
    void* inoutfile_v, int *len, 
    MPI_Datatype *datatype);
static int cp_log_compress(struct darshan_job_runtime* final_job,
    int rank, int* inout_count, int* lengths, void** pointers);
static int file_compare(const void* a, const void* b);
Philip Carns's avatar
Philip Carns committed
327
static void darshan_mpi_initialize(int *argc, char ***argv);
328
static char*  darshan_get_exe_and_mounts(struct darshan_job_runtime* final_job);
329
330
331
332
333
334
335
static int darshan_file_variance(
    struct darshan_file *infile_array,
    struct darshan_file *outfile_array,
    int count, int rank);
static void pairwise_variance_reduce (
    void *invec, void *inoutvec, int *len, MPI_Datatype *dt);

336

337
338
339
340
341
/* capacity of each mount-point table below */
#define CP_MAX_MNTS 32

/* Mount-point hash / id tables.  The plain arrays hold this process's view;
 * the *_root arrays receive rank 0's copy, which darshan_shutdown()
 * broadcasts.  A zero hash marks the end of the used entries in the loops
 * that scan these tables.
 */
uint64_t mnt_hash_array[CP_MAX_MNTS] = {0};
int64_t mnt_id_array[CP_MAX_MNTS] = {0};
uint64_t mnt_hash_array_root[CP_MAX_MNTS] = {0};
int64_t mnt_id_array_root[CP_MAX_MNTS] = {0};

/* Pairs of (local id, rank 0 id) for mount points whose hash matches rank 0
 * but whose id differs; filled in and consumed by darshan_shutdown().
 */
struct
{
    int64_t mnt_id_local;
    int64_t mnt_id_root;
} mnt_mapping[CP_MAX_MNTS];
347

348
349
350
351
352
353
354
/* Triple of doubles carried through the variance reduction.
 * NOTE(review): presumably n = sample count, T = running sum and
 * S = sum of squared deviations, as combined by pairwise_variance_reduce()
 * -- confirm against that function.  Field order is kept as-is.
 */
struct variance_dt {
    double n;
    double T;
    double S;
};

355
356
357
358
int MPI_Init(int *argc, char ***argv)
{
    int ret;

359
360
361
362
363
#ifdef DARSHAN_PRELOAD
    resolve_mpi_symbols();
#endif

    ret = DARSHAN_MPI_CALL(PMPI_Init)(argc, argv);
364
365
366
367
368
    if(ret != MPI_SUCCESS)
    {
        return(ret);
    }

369
370
371
372
373
374
375
376
377
    darshan_mpi_initialize(argc, argv);

    return(ret);
}

int MPI_Init_thread (int *argc, char ***argv, int required, int *provided)
{
    int ret;

378
    ret = DARSHAN_MPI_CALL(PMPI_Init_thread)(argc, argv, required, provided);
379
380
381
382
383
384
385
386
387
388
    if (ret != MPI_SUCCESS)
    {
        return(ret);
    }

    darshan_mpi_initialize(argc, argv);

    return(ret);
}

Philip Carns's avatar
Philip Carns committed
389
static void darshan_mpi_initialize(int *argc, char ***argv)
390
391
392
393
{
    int nprocs;
    int rank;

394
395
    DARSHAN_MPI_CALL(PMPI_Comm_size)(MPI_COMM_WORLD, &nprocs);
    DARSHAN_MPI_CALL(PMPI_Comm_rank)(MPI_COMM_WORLD, &rank);
396
397
398
399
400
401
402
403
404
405
406

    if(argc && argv)
    {
        darshan_initialize(*argc, *argv, nprocs, rank);
    }
    else
    {
        /* we don't see argc and argv here in fortran */
        darshan_initialize(0, NULL, nprocs, rank);
    }

407
    return;
408
409
410
411
412
413
414
415
416
417
418
419
420
}

void darshan_shutdown(int timing_flag)
{
    int rank;
    char* logfile_name;
    struct darshan_job_runtime* final_job;
    double start_log_time = 0;
    int flags;
    int all_ret = 0;
    int local_ret = 0;
    MPI_Offset next_offset = 0;
    char* jobid_str;
421
422
    char* envjobid;
    char* logpath;
423
424
425
426
427
428
    int jobid;
    int index_count = 0;
    int lengths[CP_MAX_MEM_SEGMENTS];
    void* pointers[CP_MAX_MEM_SEGMENTS];
    int ret;
    double red1=0, red2=0, gz1=0, gz2=0, write1=0, write2=0, tm_end=0;
429
    double bcst1=0, bcst2=0, bcst3=0;
430
    int nprocs;
431
    char* trailing_data = NULL;
Philip Carns's avatar
Philip Carns committed
432
433
    int i, j;
    int map_index = 0;
434
    time_t start_time_tmp = 0;
435
436
    uint64_t logmod;
    char hname[HOST_NAME_MAX];
437
438
439
440
441

    CP_LOCK();
    if(!darshan_global_job)
    {
        CP_UNLOCK();
442
        return;
443
444
445
446
447
448
449
450
451
    }
    /* disable further tracing while hanging onto the data so that we can
     * write it out
     */
    final_job = darshan_global_job;
    darshan_global_job = NULL;
    flags = final_job->flags;
    CP_UNLOCK();

452
    start_log_time = DARSHAN_MPI_CALL(PMPI_Wtime)();
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498

    /* figure out which access sizes to log */
    darshan_walk_file_accesses(final_job);

    /* if the records have been condensed, then zero out fields that are no
     * longer valid for safety 
     */
    if(final_job->flags & CP_FLAG_CONDENSED && final_job->file_count)
    {
        CP_SET(&final_job->file_runtime_array[0], CP_MODE, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_CONSEC_READS, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_CONSEC_WRITES, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_SEQ_READS, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_SEQ_WRITES, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE1_STRIDE, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE2_STRIDE, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE3_STRIDE, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE4_STRIDE, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE1_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE2_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE3_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE4_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS1_ACCESS, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS2_ACCESS, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS3_ACCESS, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS4_ACCESS, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS1_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS2_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS3_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS4_COUNT, 0);
        
        CP_F_SET(&final_job->file_runtime_array[0], CP_F_OPEN_TIMESTAMP, 0);
        CP_F_SET(&final_job->file_runtime_array[0], CP_F_CLOSE_TIMESTAMP, 0);
        CP_F_SET(&final_job->file_runtime_array[0], CP_F_READ_START_TIMESTAMP, 0);
        CP_F_SET(&final_job->file_runtime_array[0], CP_F_READ_END_TIMESTAMP, 0);
        CP_F_SET(&final_job->file_runtime_array[0], CP_F_WRITE_START_TIMESTAMP, 0);
        CP_F_SET(&final_job->file_runtime_array[0], CP_F_WRITE_END_TIMESTAMP, 0);
    }

    logfile_name = malloc(PATH_MAX);
    if(!logfile_name)
    {
        darshan_finalize(final_job);
        return;
    }

499
    DARSHAN_MPI_CALL(PMPI_Comm_rank)(MPI_COMM_WORLD, &rank);
500

501
    /* collect information about command line and 
502
503
     * mounted file systems 
     */
504
505
506
507
    trailing_data = darshan_get_exe_and_mounts(final_job);

    /* broadcast mount point information from root */
    if(rank == 0)
508
    {
509
510
511
512
        memcpy(mnt_hash_array_root, mnt_hash_array,
            CP_MAX_MNTS*sizeof(uint64_t));
        memcpy(mnt_id_array_root, mnt_id_array,
            CP_MAX_MNTS*sizeof(int64_t));
513
    }
514

515
516
517
518
519
520
    bcst1=DARSHAN_MPI_CALL(PMPI_Wtime)();
    DARSHAN_MPI_CALL(PMPI_Bcast)(mnt_id_array_root,
        CP_MAX_MNTS*sizeof(int64_t), MPI_BYTE, 0, MPI_COMM_WORLD);
    DARSHAN_MPI_CALL(PMPI_Bcast)(mnt_hash_array_root,
        CP_MAX_MNTS*sizeof(uint64_t), MPI_BYTE, 0, MPI_COMM_WORLD);
    bcst2=DARSHAN_MPI_CALL(PMPI_Wtime)();
521

522
523
524
525
526
527
528
529
530
531
532
533
    /* identify any common mount points that have different device ids on
     * non-root processes
     */
    for(i=0; (i<CP_MAX_MNTS && mnt_hash_array_root[i] != 0); i++)
    {
        for(j=0; (j<CP_MAX_MNTS && mnt_hash_array[j] != 0); j++)
        {
            if(mnt_hash_array_root[i] == mnt_hash_array[j])
            {
                /* found a shared mount point */
                if(mnt_id_array_root[i] != mnt_id_array[j])
                {
Philip Carns's avatar
Philip Carns committed
534
535
536
537
538
539
                    /* mismatching ids; record correct mapping */
                    mnt_mapping[map_index].mnt_id_local =
                        mnt_id_array[j];
                    mnt_mapping[map_index].mnt_id_root = 
                        mnt_id_array_root[i];
                    map_index++;
540
541
542
543
544
                }
                break;
            }
        }
    }
Philip Carns's avatar
Philip Carns committed
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
 
    /* adjust affected file records */
    for(i=0; (i<final_job->file_count && map_index > 0); i++)
    {
        for(j=0; j<map_index; j++)
        {
            if(final_job->file_array[i].counters[CP_DEVICE] ==
                mnt_mapping[j].mnt_id_local)
            {
                final_job->file_array[i].counters[CP_DEVICE] =  
                    mnt_mapping[j].mnt_id_root;
                break;
            }
        }
    }
   
561
562
563
564
565
566
    /* construct log file name */
    if(rank == 0)
    {
        char cuser[L_cuserid] = {0};
        struct tm* my_tm;

567
568
569
570
571
572
573
574
575
576
577
578
579
580
        /* Use CP_JOBID_OVERRIDE for the env var or CP_JOBID */
        envjobid = getenv(CP_JOBID_OVERRIDE);
        if (!envjobid)
        {
            envjobid = CP_JOBID;
        }

        /* Use CP_LOG_PATH_OVERRIDE for the value or __CP_LOG_PATH */
        logpath = getenv(CP_LOG_PATH_OVERRIDE);
        if (!logpath)
        {
            logpath = __CP_LOG_PATH;
        }

581
        /* find a job id */
582
        jobid_str = getenv(envjobid);
583
584
585
586
587
588
589
590
591
592
593
594
        if(jobid_str)
        {
            /* in cobalt we can find it in env var */
            ret = sscanf(jobid_str, "%d", &jobid);
        }
        if(!jobid_str || ret != 1)
        {
            /* use pid as fall back */
            jobid = getpid();
        }

        /* break out time into something human readable */
595
596
        start_time_tmp += final_job->log_job.start_time;
        my_tm = localtime(&start_time_tmp);
597
598
599
600

        /* note: getpwuid() causes link errors for static binaries */
        cuserid(cuser);

601
602
603
604
        /* generate a random number to help differentiate the log */
        (void) gethostname(hname, sizeof(hname));
        logmod = darshan_hash((void*)hname,strlen(hname),0);

605
        ret = snprintf(logfile_name, PATH_MAX, 
Philip Carns's avatar
Philip Carns committed
606
            "%s/%d/%d/%d/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
607
            logpath, (my_tm->tm_year+1900), 
608
            (my_tm->tm_mon+1), my_tm->tm_mday, 
609
610
611
            cuser, __progname, jobid,
            (my_tm->tm_mon+1), 
            my_tm->tm_mday, 
612
            (my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
613
            logmod);
614
615
616
617
618
        if(ret == (PATH_MAX-1))
        {
            /* file name was too big; squish it down */
            snprintf(logfile_name, PATH_MAX,
                "%s/id%d.darshan_partial",
619
                logpath, jobid);
620
        }
621
622
623

        /* add jobid */
        final_job->log_job.jobid = (int64_t)jobid;
624
625
626
    }

    /* broadcast log file name */
627
628
629
    bcst3=DARSHAN_MPI_CALL(PMPI_Wtime)();
    DARSHAN_MPI_CALL(PMPI_Bcast)(logfile_name, PATH_MAX, MPI_CHAR, 0,
        MPI_COMM_WORLD);
630
631
632
633
634

    final_job->log_job.end_time = time(NULL);

    /* reduce records for shared files */
    if(timing_flag)
635
        red1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
636
637
    local_ret = cp_log_reduction(final_job, rank, logfile_name, 
        &next_offset);
638
    if(timing_flag)
639
640
        red2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
    DARSHAN_MPI_CALL(PMPI_Allreduce)(&local_ret, &all_ret, 1, MPI_INT, MPI_LOR, 
641
642
643
644
645
646
        MPI_COMM_WORLD);

    if(all_ret == 0)
    {
        /* collect data to write from local process */
        cp_log_construct_indices(final_job, rank, &index_count, lengths, 
647
            pointers, trailing_data);
648
649
650
651
652
653
    }

    if(all_ret == 0)
    {
        /* compress data */
        if(timing_flag)
654
            gz1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
655
656
657
        local_ret = cp_log_compress(final_job, rank, &index_count, 
            lengths, pointers);
        if(timing_flag)
658
659
660
            gz2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
        DARSHAN_MPI_CALL(PMPI_Allreduce)(&local_ret, &all_ret, 1,
            MPI_INT, MPI_LOR, MPI_COMM_WORLD);
661
662
663
664
665
666
    }

    if(all_ret == 0)
    {
        /* actually write out log file */
        if(timing_flag)
667
            write1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
668
        local_ret = cp_log_write(final_job, rank, logfile_name, 
669
670
            index_count, lengths, pointers, start_log_time);
        if(timing_flag)
671
672
673
            write2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
        DARSHAN_MPI_CALL(PMPI_Allreduce)(&local_ret, &all_ret, 1,
            MPI_INT, MPI_LOR, MPI_COMM_WORLD);
674
675
676
677
678
679
680
681
682
683
    }

    /* if any process failed to write log, then delete the whole file so we
     * don't leave corrupted results
     */
    if(all_ret != 0 && rank == 0)
    {
        unlink(logfile_name);
    }

684
685
    if(trailing_data)
        free(trailing_data);
686
687
688
689
690
691
692
693
694
    free(logfile_name);
    darshan_finalize(final_job);
    
    if(timing_flag)
    {
        double red_tm, red_slowest;
        double gz_tm, gz_slowest;
        double write_tm, write_slowest;
        double all_tm, all_slowest;
695
        double bcst_tm, bcst_slowest;
696
        
697
        tm_end = DARSHAN_MPI_CALL(PMPI_Wtime)();
698

699
        bcst_tm=(bcst2-bcst1)+(red1-bcst3);
700
701
702
703
704
        red_tm = red2-red1;
        gz_tm = gz2-gz1;
        write_tm = write2-write1;
        all_tm = tm_end-start_log_time;

705
        DARSHAN_MPI_CALL(PMPI_Allreduce)(&red_tm, &red_slowest, 1,
706
            MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
707
        DARSHAN_MPI_CALL(PMPI_Allreduce)(&gz_tm, &gz_slowest, 1,
708
            MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
709
        DARSHAN_MPI_CALL(PMPI_Allreduce)(&write_tm, &write_slowest, 1,
710
            MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
711
        DARSHAN_MPI_CALL(PMPI_Allreduce)(&all_tm, &all_slowest, 1,
712
            MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
713
        DARSHAN_MPI_CALL(PMPI_Allreduce)(&bcst_tm, &bcst_slowest, 1,
714
            MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
715
716
717

        if(rank == 0)
        {
718
            DARSHAN_MPI_CALL(PMPI_Comm_size)(MPI_COMM_WORLD, &nprocs);
719
            printf("#<op>\t<nprocs>\t<time>\n");
720
            printf("bcst\t%d\t%f\n", nprocs, bcst_slowest);
721
722
723
724
725
726
727
728
729
730
731
732
733
734
            printf("reduce\t%d\t%f\n", nprocs, red_slowest);
            printf("gzip\t%d\t%f\n", nprocs, gz_slowest);
            printf("write\t%d\t%f\n", nprocs, write_slowest);
            printf("all\t%d\t%f\n", nprocs, all_slowest);
        }
    }

    return;
}

int MPI_Finalize(void)
{
    int ret;

735
736
737
738
    if(getenv("DARSHAN_INTERNAL_TIMING"))
        darshan_shutdown(1);
    else
        darshan_shutdown(0);
739

740
    ret = DARSHAN_MPI_CALL(PMPI_Finalize)();
741
742
743
744
745
746
747
748
749
750
751
752
753
754
    return(ret);
}

/* MPI_File_open()
 *
 * Wrapper around PMPI_File_open.  The underlying call is timed with
 * darshan_wtime().  On success the file record is looked up by name
 * (after stripping a ROMIO-style "prefix:" from the file name) and, if
 * the record exists and is not currently associated with an open handle,
 * the open is recorded:
 *   - the handle and access mode are stored
 *   - the elapsed time is added to CP_F_MPI_META_TIME
 *   - CP_F_OPEN_TIMESTAMP is set if it is still zero
 *   - CP_INDEP_OPENS or CP_COLL_OPENS is incremented depending on whether
 *     the communicator has exactly one member
 *   - CP_HINTS is incremented when an info object was supplied
 *   - the record is pushed onto the head of its fh_table hash chain
 *
 * returns the return code of PMPI_File_open unchanged
 */
int MPI_File_open(MPI_Comm comm, char *filename, int amode, MPI_Info info, MPI_File *fh) 
{
    int ret;
    struct darshan_file_runtime* file;
    char* tmp;
    int comm_size;
    int hash_index;
    uint64_t tmp_hash;
    double tm1, tm2;

    tm1 = darshan_wtime();
    ret = DARSHAN_MPI_CALL(PMPI_File_open)(comm, filename, amode, info, fh);
    tm2 = darshan_wtime();

    if(ret == MPI_SUCCESS)
    {
        CP_LOCK();

        /* use ROMIO approach to strip prefix if present */
        /* strip off prefix if there is one, but only skip prefixes
         * if they are greater than length one to allow for windows
         * drive specifications (e.g. c:\...) 
         */
        tmp = strchr(filename, ':');
        /* strchr() returns NULL when there is no ':'; test for that
         * explicitly rather than relying on a relational comparison
         * against a null pointer
         */
        if (tmp != NULL && tmp > filename + 1) {
            filename = tmp + 1;
        }

        file = darshan_file_by_name(filename);
        /* TODO: handle the case of multiple concurrent opens */
        if(file && (file->fh == MPI_FILE_NULL))
        {
            file->fh = *fh;
            CP_SET(file, CP_MODE, amode);
            CP_F_INC(file, CP_F_MPI_META_TIME, (tm2-tm1));
            /* only the first open of this record sets the timestamp */
            if(CP_F_VALUE(file, CP_F_OPEN_TIMESTAMP) == 0)
            {
                CP_F_SET(file, CP_F_OPEN_TIMESTAMP,
                    DARSHAN_MPI_CALL(PMPI_Wtime)());
            }
            DARSHAN_MPI_CALL(PMPI_Comm_size)(comm, &comm_size);
            if(comm_size == 1)
            {
                CP_INC(file, CP_INDEP_OPENS, 1);
            }
            else
            {
                CP_INC(file, CP_COLL_OPENS, 1);
            }
            if(info != MPI_INFO_NULL)
            {
                CP_INC(file, CP_HINTS, 1);
            }

            /* hash the handle value and insert at the head of that
             * bucket's doubly linked list
             */
            tmp_hash = darshan_hash((void*)fh, sizeof(*fh), 0);
            hash_index = tmp_hash & CP_HASH_MASK;
            file->fh_prev = NULL;
            file->fh_next = darshan_global_job->fh_table[hash_index];
            if(file->fh_next)
                file->fh_next->fh_prev = file;
            darshan_global_job->fh_table[hash_index] = file;
        }
        CP_UNLOCK();
    }

    return(ret);
}

/* MPI_File_close()
 *
 * Wrapper around PMPI_File_close.  The call is timed with darshan_wtime()
 * and, if a file record is found for the handle, the record is marked as
 * closed (fh reset to MPI_FILE_NULL), CP_F_CLOSE_TIMESTAMP is set, the
 * elapsed time is added to CP_F_MPI_META_TIME, and the record is unlinked
 * from its fh_table hash chain.  The record is then remembered as the
 * most recently used file.
 *
 * Note that the bookkeeping is performed regardless of the return code.
 *
 * returns the return code of PMPI_File_close unchanged
 */
int MPI_File_close(MPI_File *fh) 
{
    struct darshan_file_runtime* file;
    uint64_t tmp_hash;
    int hash_index;
    /* copy of the handle taken before the close call; presumably needed
     * because the close may reset *fh -- the lookup below uses the copy
     */
    MPI_File saved_fh = *fh;
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_close)(fh);
    double t_end = darshan_wtime();

    CP_LOCK();
    file = darshan_file_by_fh(saved_fh);
    if(file)
    {
        file->fh = MPI_FILE_NULL;
        CP_F_SET(file, CP_F_CLOSE_TIMESTAMP, DARSHAN_MPI_CALL(PMPI_Wtime)());
        CP_F_INC(file, CP_F_MPI_META_TIME, (t_end - t_start));

        /* unlink from the doubly linked hash chain */
        if(file->fh_prev)
        {
            file->fh_prev->fh_next = file->fh_next;
        }
        else
        {
            /* head of fh hash table list: the bucket must point at our
             * successor instead
             */
            tmp_hash = darshan_hash((void*)&saved_fh, sizeof(saved_fh), 0);
            hash_index = tmp_hash & CP_HASH_MASK;
            darshan_global_job->fh_table[hash_index] = file->fh_next;
        }
        if(file->fh_next)
        {
            file->fh_next->fh_prev = file->fh_prev;
        }

        file->fh_prev = NULL;
        file->fh_next = NULL;
        /* in case we open it again, or hit posix calls */
        darshan_global_job->darshan_mru_file = file;
    }
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_sync()
 *
 * Wrapper around PMPI_File_sync.  On success, and if a file record is
 * found for the handle, the elapsed time is added to CP_F_MPI_WRITE_TIME
 * and CP_SYNCS is incremented.
 *
 * returns the return code of PMPI_File_sync unchanged
 */
int MPI_File_sync(MPI_File fh)
{
    struct darshan_file_runtime* file;
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_sync)(fh);
    double t_end = darshan_wtime();

    /* failed calls are not recorded */
    if(rc != MPI_SUCCESS)
    {
        return(rc);
    }

    CP_LOCK();
    file = darshan_file_by_fh(fh);
    if(file)
    {
        CP_F_INC(file, CP_F_MPI_WRITE_TIME, (t_end - t_start));
        CP_INC(file, CP_SYNCS, 1);
    }
    CP_UNLOCK();

    return(rc);
}


/* MPI_File_set_view()
 *
 * Wrapper around PMPI_File_set_view.  On success, and if a file record
 * is found for the handle, CP_VIEWS is incremented and the file type is
 * passed to CP_DATATYPE_INC.  When an info object was supplied, CP_HINTS
 * is incremented and the elapsed time is added to CP_F_MPI_META_TIME.
 *
 * returns the return code of PMPI_File_set_view unchanged
 */
int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype, 
    MPI_Datatype filetype, char *datarep, MPI_Info info)
{
    struct darshan_file_runtime* file;
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_set_view)(fh, disp, etype,
        filetype, datarep, info);
    double t_end = darshan_wtime();

    /* failed calls are not recorded */
    if(rc != MPI_SUCCESS)
    {
        return(rc);
    }

    CP_LOCK();
    file = darshan_file_by_fh(fh);
    if(file)
    {
        CP_INC(file, CP_VIEWS, 1);
        if(info != MPI_INFO_NULL)
        {
            /* NOTE(review): meta time is only accumulated when hints are
             * present; confirm whether it was meant to be counted for
             * every set_view call
             */
            CP_F_INC(file, CP_F_MPI_META_TIME, (t_end - t_start));
            CP_INC(file, CP_HINTS, 1);
        }
        CP_DATATYPE_INC(file, filetype);
    }
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_read()
 *
 * Wrapper around PMPI_File_read.  The call is timed with darshan_wtime()
 * and the return code, handle, count, datatype and both timestamps are
 * handed to CP_RECORD_MPI_READ with the CP_INDEP_READS counter while the
 * CP lock is held.
 *
 * returns the return code of PMPI_File_read unchanged
 */
int MPI_File_read(MPI_File fh, void *buf, int count, 
    MPI_Datatype datatype, MPI_Status *status)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read)(fh, buf, count, datatype, status);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_INDEP_READS, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_read_at()
 *
 * Wrapper around PMPI_File_read_at.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_READ with
 * the CP_INDEP_READS counter while the CP lock is held.
 *
 * returns the return code of PMPI_File_read_at unchanged
 */
int MPI_File_read_at(MPI_File fh, MPI_Offset offset, void *buf,
    int count, MPI_Datatype datatype, MPI_Status *status)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_at)(fh, offset, buf,
        count, datatype, status);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_INDEP_READS, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_read_at_all()
 *
 * Wrapper around PMPI_File_read_at_all.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_READ with
 * the CP_COLL_READS counter while the CP lock is held.
 *
 * returns the return code of PMPI_File_read_at_all unchanged
 */
int MPI_File_read_at_all(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, MPI_Status * status)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_at_all)(fh, offset, buf,
        count, datatype, status);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_COLL_READS, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_read_all()
 *
 * Wrapper around PMPI_File_read_all.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_READ with
 * the CP_COLL_READS counter while the CP lock is held.
 *
 * returns the return code of PMPI_File_read_all unchanged
 */
int MPI_File_read_all(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_all)(fh, buf, count,
        datatype, status);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_COLL_READS, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_read_shared()
 *
 * Wrapper around PMPI_File_read_shared.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_READ with
 * the CP_INDEP_READS counter while the CP lock is held.
 *
 * returns the return code of PMPI_File_read_shared unchanged
 */
int MPI_File_read_shared(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_shared)(fh, buf, count,
        datatype, status);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_INDEP_READS, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_read_ordered()
 *
 * Wrapper around PMPI_File_read_ordered.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_READ with
 * the CP_COLL_READS counter while the CP lock is held.
 *
 * returns the return code of PMPI_File_read_ordered unchanged
 */
int MPI_File_read_ordered(MPI_File fh, void * buf, int count, 
    MPI_Datatype datatype, MPI_Status * status)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_ordered)(fh, buf, count,
        datatype, status);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_COLL_READS, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_read_at_all_begin()
 *
 * Wrapper around PMPI_File_read_at_all_begin.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_READ with
 * the CP_SPLIT_READS counter while the CP lock is held.  The measured
 * interval is the duration of the _begin call itself.
 *
 * returns the return code of PMPI_File_read_at_all_begin unchanged
 */
int MPI_File_read_at_all_begin(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_at_all_begin)(fh, offset, buf,
        count, datatype);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_SPLIT_READS, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_read_all_begin()
 *
 * Wrapper around PMPI_File_read_all_begin.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_READ with
 * the CP_SPLIT_READS counter while the CP lock is held.  The measured
 * interval is the duration of the _begin call itself.
 *
 * returns the return code of PMPI_File_read_all_begin unchanged
 */
int MPI_File_read_all_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_all_begin)(fh, buf, count, datatype);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_SPLIT_READS, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_read_ordered_begin()
 *
 * Wrapper around PMPI_File_read_ordered_begin.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_READ with
 * the CP_SPLIT_READS counter while the CP lock is held.  The measured
 * interval is the duration of the _begin call itself.
 *
 * returns the return code of PMPI_File_read_ordered_begin unchanged
 */
int MPI_File_read_ordered_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_ordered_begin)(fh, buf, count,
        datatype);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_SPLIT_READS, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_iread_at()
 *
 * Wrapper around PMPI_File_iread_at.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_READ with
 * the CP_NB_READS counter while the CP lock is held.  The measured
 * interval is the duration of the PMPI_File_iread_at call itself.
 *
 * returns the return code of PMPI_File_iread_at unchanged
 */
int MPI_File_iread_at(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, __D_MPI_REQUEST *request)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_iread_at)(fh, offset, buf, count,
        datatype, request);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_NB_READS, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

Philip Carns's avatar
Philip Carns committed
1064
/* MPI_File_iread()
 *
 * Wrapper around PMPI_File_iread.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_READ with
 * the CP_NB_READS counter while the CP lock is held.  The measured
 * interval is the duration of the PMPI_File_iread call itself.
 *
 * returns the return code of PMPI_File_iread unchanged
 */
int MPI_File_iread(MPI_File fh, void * buf, int count, MPI_Datatype datatype, __D_MPI_REQUEST * request)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_iread)(fh, buf, count, datatype, request);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_NB_READS, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

int MPI_File_iread_shared(MPI_File fh, void * buf, int count,
Philip Carns's avatar
Philip Carns committed
1079
    MPI_Datatype datatype, __D_MPI_REQUEST * request)
1080
1081
1082
1083
1084
{
    int ret;
    double tm1, tm2;

    tm1 = darshan_wtime();
1085
1086
    ret = DARSHAN_MPI_CALL(PMPI_File_iread_shared)(fh, buf, count,
        datatype, request);
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
    tm2 = darshan_wtime();
    CP_LOCK();
    CP_RECORD_MPI_READ(ret, fh, count, datatype, CP_NB_READS, tm1, tm2);
    CP_UNLOCK();
    return(ret);
}


/* MPI_File_write()
 *
 * Wrapper around PMPI_File_write.  The call is timed with
 * darshan_wtime() and the return code, handle, count, datatype and both
 * timestamps are handed to CP_RECORD_MPI_WRITE with the CP_INDEP_WRITES
 * counter while the CP lock is held.
 *
 * returns the return code of PMPI_File_write unchanged
 */
int MPI_File_write(MPI_File fh, void *buf, int count, 
    MPI_Datatype datatype, MPI_Status *status)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write)(fh, buf, count, datatype, status);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_WRITE(rc, fh, count, datatype, CP_INDEP_WRITES, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_write_at()
 *
 * Wrapper around PMPI_File_write_at.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_WRITE with
 * the CP_INDEP_WRITES counter while the CP lock is held.
 *
 * returns the return code of PMPI_File_write_at unchanged
 */
int MPI_File_write_at(MPI_File fh, MPI_Offset offset, void *buf,
    int count, MPI_Datatype datatype, MPI_Status *status)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_at)(fh, offset, buf,
        count, datatype, status);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_WRITE(rc, fh, count, datatype, CP_INDEP_WRITES, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_write_at_all()
 *
 * Wrapper around PMPI_File_write_at_all.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_WRITE with
 * the CP_COLL_WRITES counter while the CP lock is held.
 *
 * returns the return code of PMPI_File_write_at_all unchanged
 */
int MPI_File_write_at_all(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, MPI_Status * status)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_at_all)(fh, offset, buf,
        count, datatype, status);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_WRITE(rc, fh, count, datatype, CP_COLL_WRITES, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_write_all()
 *
 * Wrapper around PMPI_File_write_all.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_WRITE with
 * the CP_COLL_WRITES counter while the CP lock is held.
 *
 * returns the return code of PMPI_File_write_all unchanged
 */
int MPI_File_write_all(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_all)(fh, buf, count,
        datatype, status);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_WRITE(rc, fh, count, datatype, CP_COLL_WRITES, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_write_shared()
 *
 * Wrapper around PMPI_File_write_shared.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_WRITE with
 * the CP_INDEP_WRITES counter while the CP lock is held.
 *
 * returns the return code of PMPI_File_write_shared unchanged
 */
int MPI_File_write_shared(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_shared)(fh, buf, count,
        datatype, status);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_WRITE(rc, fh, count, datatype, CP_INDEP_WRITES, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_write_ordered()
 *
 * Wrapper around PMPI_File_write_ordered.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_WRITE with
 * the CP_COLL_WRITES counter while the CP lock is held.
 *
 * returns the return code of PMPI_File_write_ordered unchanged
 */
int MPI_File_write_ordered(MPI_File fh, void * buf, int count, 
    MPI_Datatype datatype, MPI_Status * status)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_ordered)(fh, buf, count,
        datatype, status);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_WRITE(rc, fh, count, datatype, CP_COLL_WRITES, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_write_at_all_begin()
 *
 * Wrapper around PMPI_File_write_at_all_begin.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_WRITE with
 * the CP_SPLIT_WRITES counter while the CP lock is held.  The measured
 * interval is the duration of the _begin call itself.
 *
 * returns the return code of PMPI_File_write_at_all_begin unchanged
 */
int MPI_File_write_at_all_begin(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_at_all_begin)(fh, offset,
        buf, count, datatype);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_WRITE(rc, fh, count, datatype, CP_SPLIT_WRITES, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_write_all_begin()
 *
 * Wrapper around PMPI_File_write_all_begin.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_WRITE with
 * the CP_SPLIT_WRITES counter while the CP lock is held.  The measured
 * interval is the duration of the _begin call itself.
 *
 * returns the return code of PMPI_File_write_all_begin unchanged
 */
int MPI_File_write_all_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_all_begin)(fh, buf, count, datatype);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_WRITE(rc, fh, count, datatype, CP_SPLIT_WRITES, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_write_ordered_begin()
 *
 * Wrapper around PMPI_File_write_ordered_begin.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_WRITE with
 * the CP_SPLIT_WRITES counter while the CP lock is held.  The measured
 * interval is the duration of the _begin call itself.
 *
 * returns the return code of PMPI_File_write_ordered_begin unchanged
 */
int MPI_File_write_ordered_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_ordered_begin)(fh, buf, count,
        datatype);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_WRITE(rc, fh, count, datatype, CP_SPLIT_WRITES, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

/* MPI_File_iwrite_at()
 *
 * Wrapper around PMPI_File_iwrite_at.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_WRITE with
 * the CP_NB_WRITES counter while the CP lock is held.  The measured
 * interval is the duration of the PMPI_File_iwrite_at call itself.
 *
 * returns the return code of PMPI_File_iwrite_at unchanged
 */
int MPI_File_iwrite_at(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, __D_MPI_REQUEST *request)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_iwrite_at)(fh, offset, buf,
        count, datatype, request);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_WRITE(rc, fh, count, datatype, CP_NB_WRITES, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

Philip Carns's avatar
Philip Carns committed
1249
/* MPI_File_iwrite()
 *
 * Wrapper around PMPI_File_iwrite.  The call is timed with
 * darshan_wtime() and the outcome is handed to CP_RECORD_MPI_WRITE with
 * the CP_NB_WRITES counter while the CP lock is held.  The measured
 * interval is the duration of the PMPI_File_iwrite call itself.
 *
 * returns the return code of PMPI_File_iwrite unchanged
 */
int MPI_File_iwrite(MPI_File fh, void * buf, int count, MPI_Datatype datatype, __D_MPI_REQUEST * request)
{
    double t_start = darshan_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_iwrite)(fh, buf, count, datatype, request);
    double t_end = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_WRITE(rc, fh, count, datatype, CP_NB_WRITES, t_start, t_end);
    CP_UNLOCK();

    return(rc);
}

int MPI_File_iwrite_shared(MPI_File fh, void * buf, int count,
Philip Carns's avatar
Philip Carns committed
1264
    MPI_Datatype datatype, __D_MPI_REQUEST * request)
1265
1266
1267
1268
1269
{
    int ret;
    double tm1, tm2;

    tm1 = darshan_wtime();
1270
1271
    ret = DARSHAN_MPI_CALL(PMPI_File_iwrite_shared)(fh, buf, count,
        datatype, request);
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
    tm2 = darshan_wtime();
    CP_LOCK();
    CP_RECORD_MPI_WRITE(ret, fh, count, datatype, CP_NB_WRITES, tm1, tm2);
    CP_UNLOCK();
    return(ret);
}

/* darshan_file_by_fh()
 *
 * Find the runtime file record associated with an MPI file handle.
 *
 * Lookup order:
 *   1. no global job state            -> NULL
 *   2. data already condensed         -> first entry of file_runtime_array
 *   3. most recently used record      -> returned if its handle matches
 *   4. fh_table hash chain            -> first record whose handle matches;
 *                                        it also becomes the new MRU record
 *
 * returns the matching record, or NULL if none is found
 */
static struct darshan_file_runtime* darshan_file_by_fh(MPI_File fh)
{
    struct darshan_file_runtime* candidate;
    struct darshan_file_runtime* mru;
    uint64_t fh_hash = 0;
    int bucket;

    if(!darshan_global_job)
    {
        return(NULL);
    }

    /* if we have already condensed the data, then just hand the first file
     * back
     */
    if(darshan_global_job->flags & CP_FLAG_CONDENSED)
    {
        return(&darshan_global_job->file_runtime_array[0]);
    }

    /* fast path: same handle as the last successful lookup */
    mru = darshan_global_job->darshan_mru_file;
    if(mru && mru->fh == fh)
    {
        return(mru);
    }

    /* slow path: walk the hash chain selected by the handle's hash */
    fh_hash = darshan_hash((void*)(&fh), sizeof(fh), 0);
    bucket = fh_hash & CP_HASH_MASK;
    for(candidate = darshan_global_job->fh_table[bucket];
        candidate;
        candidate = candidate->fh_next)
    {
        if(candidate->fh == fh)
        {
            darshan_global_job->darshan_mru_file = candidate;
            return(candidate);
        }
    }

    return(NULL);
}

/* cp_log_reduction()
 *
 * Identify shared files and reduce them to one log entry
 *
 * Steps:
 *   1. rank 0 broadcasts the hashes of the files it has in its log
 *   2. every rank marks which of those hashes it also has
 *   3. a logical-AND allreduce yields the set of files present on all ranks
 *   4. matching local records get their fastest/slowest fields seeded and
 *      their rank set to -1
 *   5. the array is sorted so the shared records are contiguous at the end,
 *      then reduced to rank 0 with the darshan_file_reduce operation and
 *      passed to darshan_file_variance
 *   6. rank 0 overwrites its shared records with the reduced ones; all
 *      other ranks drop them by shrinking file_count
 *
 * final_job:    job whose file_array/file_count are examined and modified
 * rank:         rank of the calling process
 * logfile_name: not referenced by this function
 * next_offset:  not referenced by this function
 *
 * returns 0 on success, -1 on failure
 */
static int cp_log_reduction(struct darshan_job_runtime* final_job, int rank, 
    char* logfile_name, MPI_Offset* next_offset)
{
    /* TODO: these need to be allocated differently now, too big */
    uint64_t hash_array[CP_MAX_FILES] = {0};
    int mask_array[CP_MAX_FILES] = {0};
    int all_mask_array[CP_MAX_FILES] = {0};
    int ret;
    int result = -1;
    int i;
    int j;
    MPI_Op reduce_op;
    MPI_Datatype rtype;
    struct darshan_file* tmp_array = NULL;
    struct darshan_file* rec;
    int shared_count = 0;

    /* register a reduction operation */
    ret = DARSHAN_MPI_CALL(PMPI_Op_create)(darshan_file_reduce, 1, &reduce_op); 
    if(ret != 0)
    {
        /* nothing has been acquired yet, so no cleanup is needed */
        return(-1);
    }

    /* construct a datatype for a file record.  This is serving no purpose
     * except to make sure we can do a reduction on proper boundaries
     */
    DARSHAN_MPI_CALL(PMPI_Type_contiguous)(sizeof(struct darshan_file),
        MPI_BYTE, &rtype); 
    DARSHAN_MPI_CALL(PMPI_Type_commit)(&rtype); 

    /* gather list of files that root process has opened */
    if(rank == 0)
    {
        for(i=0; i<final_job->file_count; i++)
        {
            hash_array[i] = final_job->file_array[i].hash;
        }
    }

    /* broadcast list of files to all other processes */
    ret = DARSHAN_MPI_CALL(PMPI_Bcast)(hash_array,
        (CP_MAX_FILES * sizeof(uint64_t)), 
        MPI_BYTE, 0, MPI_COMM_WORLD);
    if(ret != 0)
    {
        goto out;
    }

    /* everyone looks to see if they have also opened that same file;
     * a zero hash terminates the list
     */
    for(i=0; (i<CP_MAX_FILES && hash_array[i] != 0); i++)
    {
        for(j=0; j<final_job->file_count; j++)
        {
            if(hash_array[i] && final_job->file_array[j].hash == hash_array[i])
            {
                /* we opened that file too */
                mask_array[i] = 1;
                break;
            }
        }
    }

    /* now allreduce so that everyone agrees on which files are shared */
    ret = DARSHAN_MPI_CALL(PMPI_Allreduce)(mask_array, all_mask_array,
        CP_MAX_FILES, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
    if(ret != 0)
    {
        goto out;
    }

    /* walk through mask array counting entries and marking corresponding
     * files with a rank of -1
     */
    for(i=0; i<CP_MAX_FILES; i++)
    {
        if(all_mask_array[i])
        {
            shared_count++;
            for(j=0; j<final_job->file_count; j++)
            {
                if(final_job->file_array[j].hash == hash_array[i])
                {
                    rec = &final_job->file_array[j];

                    /*
                     * Initialize fastest/slowest info prior
                     * to the reduction.
                     */
                    rec->counters[CP_FASTEST_RANK] = rec->rank;
                    rec->counters[CP_FASTEST_RANK_BYTES] =
                      rec->counters[CP_BYTES_READ] +
                      rec->counters[CP_BYTES_WRITTEN];
                    rec->fcounters[CP_F_FASTEST_RANK_TIME] =
                      rec->fcounters[CP_F_POSIX_META_TIME] +
                      rec->fcounters[CP_F_POSIX_READ_TIME] +
                      rec->fcounters[CP_F_POSIX_WRITE_TIME];

                    rec->counters[CP_SLOWEST_RANK] = rec->rank;
                    rec->counters[CP_SLOWEST_RANK_BYTES] =
                      rec->counters[CP_BYTES_READ] +
                      rec->counters[CP_BYTES_WRITTEN];
                    rec->fcounters[CP_F_SLOWEST_RANK_TIME] =
                      rec->fcounters[CP_F_POSIX_META_TIME] +
                      rec->fcounters[CP_F_POSIX_READ_TIME] +
                      rec->fcounters[CP_F_POSIX_WRITE_TIME];

                    /* must come after the seeding above, which reads
                     * the original rank
                     */
                    rec->rank = -1;
                    break;
                }
            }
        }
    }

    if(shared_count)
    {
        if(rank == 0)
        {
            /* root proc needs to allocate memory to store reduction */
            tmp_array = malloc(shared_count*sizeof(struct darshan_file));
            if(!tmp_array)
            {
                /* TODO: think more about how to handle errors like this */
                goto out;
            }
        }

        /* sort the array of files descending by rank so that we get all of the 
         * shared files (marked by rank -1) in a contiguous portion at end 
         * of the array
         */
        qsort(final_job->file_array, final_job->file_count, 
            sizeof(struct darshan_file), file_compare);

        ret = DARSHAN_MPI_CALL(PMPI_Reduce)(
            &final_job->file_array[final_job->file_count-shared_count], 
            tmp_array, shared_count, rtype, reduce_op, 0, MPI_COMM_WORLD);
        if(ret != 0)
        {
            goto out;
        }

        ret = darshan_file_variance(
            &final_job->file_array[final_job->file_count-shared_count],
            tmp_array, shared_count, rank);
        if (ret)
        {
            goto out;
        }

        if(rank == 0)
        {
            /* root replaces local files with shared ones */
            memcpy(&final_job->file_array[final_job->file_count-shared_count],
                tmp_array, shared_count*sizeof(struct darshan_file));
        }
        else
        {
            /* everyone else simply discards those file records */
            final_job->file_count -= shared_count;
        }
    }

    result = 0;

out:
    /* single exit path: tmp_array is only non-NULL on rank 0, and is
     * released here on success as well as on every error after its
     * allocation
     */
    free(tmp_array);
    DARSHAN_MPI_CALL(PMPI_Op_free)(&reduce_op);
    DARSHAN_MPI_CALL(PMPI_Type_free)(&rtype);

    return(result);
}

/* TODO: should we use more of the CP macros here? */
static void darshan_file_reduce(void* infile_v, 
    void* inoutfile_v, int *len, 
    MPI_Datatype *datatype)
{
    struct darshan_file tmp_file;
    struct darshan_file* infile = infile_v;
    struct darshan_file* inoutfile = inoutfile_v;
    struct darshan_file_runtime tmp_runtime;
    int i;
    int j;
    int k;

    for(i=0; i<*len; i++)
    {
        memset(&tmp_file, 0, sizeof(tmp_file));

        tmp_file.hash = infile->hash;
        tmp_file.rank = -1; /* indicates shared across all procs */

        /* sum */
        for(j=CP_INDEP_OPENS; j<=CP_VIEWS; j++)
        {
            tmp_file.counters[j] = infile->counters[j] + 
                inoutfile->counters[j];
        }

        /* pick one */
        tmp_file.counters[CP_MODE] = infile->counters[CP_MODE];


        /* sum */
        for(j=CP_BYTES_READ; j<=CP_BYTES_WRITTEN; j++)
        {
            tmp_file.counters[j] = infile->counters[j] + 
                inoutfile->counters[j];
        }

        /* max */
        for(j=CP_MAX_BYTE_READ; j<=CP_MAX_BYTE_WRITTEN; j++)
        {
            tmp_file.counters[j] = (
                (infile->counters[j] > inoutfile->counters[j]) ? 
                infile->counters[j] :
                inoutfile->counters[j]);
        }

        /* sum */
        for(j=CP_CONSEC_READS; j<=CP_MEM_NOT_ALIGNED; j++)
        {
            tmp_file.counters[j] = infile->counters[j] + 
                inoutfile->counters[j];
        }

        /* pick one */
        tmp_file.counters[CP_MEM_ALIGNMENT] = infile->counters[CP_MEM_ALIGNMENT];
        /* sum */
        for(j=CP_FILE_NOT_ALIGNED; j<=CP_FILE_NOT_ALIGNED; j++)
        {
            tmp_file.counters[j] = infile->counters[j] + 
                inoutfile->counters[j];
        }

        /* pick one */
        tmp_file.counters[CP_FILE_ALIGNMENT] = infile->counters[CP_FILE_ALIGNMENT];
        
1572
1573
        /* skip CP_MAX_*_TIME_SIZE; handled in floating point section */

1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
        /* sum */
        for(j=CP_SIZE_READ_0_100; j<=CP_EXTENT_WRITE_1G_PLUS; j++)
        {
            tmp_file.counters[j] = infile->counters[j] + 
                inoutfile->counters[j];
        }

        /* pick the 4 most common strides out of the 8 we have to choose from */

        /* first collapse any duplicates */
        for(j=CP_STRIDE1_STRIDE; j<=CP_STRIDE4_STRIDE; j++)
        {
            for(k=CP_STRIDE1_STRIDE; k<=CP_STRIDE4_STRIDE; k++)
            {
                if(infile->counters[j] == inoutfile->counters[k])
                {
                    infile->counters[j+4] += inoutfile->counters[k+4];
                    inoutfile->counters[k] = 0;
                    inoutfile->counters[k+4] = 0;
                }
            }
        }

        /* placeholder so we can re-use macros */
        tmp_runtime.log_file = &tmp_file;
        /* first set */
        for(j=CP_STRIDE1_STRIDE; j<=CP_STRIDE4_STRIDE; j++)
        {
            CP_COUNTER_INC(&tmp_runtime, infile->counters[j],
                infile->counters[j+4], 1, CP_STRIDE1_STRIDE, CP_STRIDE1_COUNT);
        }
        /* second set */
        for(j=CP_STRIDE1_STRIDE; j<=CP_STRIDE4_STRIDE; j++)
        {
            CP_COUNTER_INC(&tmp_runtime, inoutfile->counters[j],
                inoutfile->counters[j+4], 1, CP_STRIDE1_STRIDE, CP_STRIDE1_COUNT);
        }

        /* TODO: subroutine so we don't duplicate so much */
        /* same for access counts */

        /* first collapse any duplicates */
        for(j=CP_ACCESS1_ACCESS; j<=CP_ACCESS4_ACCESS; j++)
        {
            for(k=CP_ACCESS1_ACCESS; k<=CP_ACCESS4_ACCESS; k++)
            {
                if(infile->counters[j] == inoutfile->counters[k])
                {
                    infile->counters[j+4] += inoutfile->counters[k+4];
                    inoutfile->counters[k] = 0;
                    inoutfile->counters[k+4] = 0;
                }
            }
        }

        /* placeholder so we can re-use macros */
        tmp_runtime.log_file = &tmp_file;
        /* first set */
        for(j=CP_ACCESS1_ACCESS; j<=CP_ACCESS4_ACCESS; j++)
        {
            CP_COUNTER_INC(&tmp_runtime, infile->counters[j],
                infile->counters[j+4], 1, CP_ACCESS1_ACCESS, CP_ACCESS1_COUNT);
        }
        /* second set */
        for(j=CP_ACCESS1_ACCESS; j<=CP_ACCESS4_ACCESS; j++)
        {
            CP_COUNTER_INC(&tmp_runtime, inoutfile->counters[j],
                inoutfile->counters[j+4], 1, CP_ACCESS1_ACCESS, CP_ACCESS1_COUNT);
        }

        /* min */
        for(j=CP_F_OPEN_TIMESTAMP; j<=CP_F_WRITE_START_TIMESTAMP; j++)
        {
            if(infile->fcounters[j] > inoutfile->fcounters[j])
                tmp_file.fcounters[j] = inoutfile->fcounters[j];
            else
                tmp_file.fcounters[j] = infile->fcounters[j];
        }

        /* max */
        for(j=CP_F_CLOSE_TIMESTAMP; j<=CP_F_WRITE_END_TIMESTAMP; j++)
        {
            if(infile->fcounters[j] > inoutfile->fcounters[j])
                tmp_file.fcounters[j] = infile->fcounters[j];
            else
                tmp_file.fcounters[j] = inoutfile->fcounters[j];
        }

        /* sum */
        for(j=CP_F_POSIX_READ_TIME; j<=CP_F_MPI_WRITE_TIME; j++)
        {
            tmp_file.fcounters[j] = infile->fcounters[j] + 
                inoutfile->fcounters[j];
        }

1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
        /* max (special case) */
        if(infile->fcounters[CP_F_MAX_WRITE_TIME] > 
            inoutfile->fcounters[CP_F_MAX_WRITE_TIME])
        {
            tmp_file.fcounters[CP_F_MAX_WRITE_TIME] = 
                infile->fcounters[CP_F_MAX_WRITE_TIME];
            tmp_file.counters[CP_MAX_WRITE_TIME_SIZE] = 
                infile->counters[CP_MAX_WRITE_TIME_SIZE];
        }
        else
        {
            tmp_file.fcounters[CP_F_MAX_WRITE_TIME] = 
                inoutfile->fcounters[CP_F_MAX_WRITE_TIME];
            tmp_file.counters[CP_MAX_WRITE_TIME_SIZE] = 
                inoutfile->counters[CP_MAX_WRITE_TIME_SIZE];
        }

        if(infile->fcounters[CP_F_MAX_READ_TIME] > 
            inoutfile->fcounters[CP_F_MAX_READ_TIME])
        {
            tmp_file.fcounters[CP_F_MAX_READ_TIME] = 
                infile->fcounters[CP_F_MAX_READ_TIME];
            tmp_file.counters[CP_MAX_READ_TIME_SIZE] = 
                infile->counters[CP_MAX_READ_TIME_SIZE];
        }
        else
        {
            tmp_file.fcounters[CP_F_MAX_READ_TIME] = 
                inoutfile->fcounters[CP_F_MAX_READ_TIME];
            tmp_file.counters[CP_MAX_READ_TIME_SIZE] = 
                inoutfile->counters[CP_MAX_READ_TIME_SIZE];
        }

1702
        /* min */
1703
1704
        if(infile->fcounters[CP_F_FASTEST_RANK_TIME] <
           inoutfile->fcounters[CP_F_FASTEST_RANK_TIME])
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
        {
            tmp_file.counters[CP_FASTEST_RANK] =
                infile->counters[CP_FASTEST_RANK];
            tmp_file.counters[CP_FASTEST_RANK_BYTES] = 
                infile->counters[CP_FASTEST_RANK_BYTES];
            tmp_file.fcounters[CP_F_FASTEST_RANK_TIME] =
                infile->fcounters[CP_F_FASTEST_RANK_TIME];
        }
        else
        {
            tmp_file.counters[CP_FASTEST_RANK] =
                inoutfile->counters[CP_FASTEST_RANK];
            tmp_file.counters[CP_FASTEST_RANK_BYTES] =
                inoutfile->counters[CP_FASTEST_RANK_BYTES];
            tmp_file.fcounters[CP_F_FASTEST_RANK_TIME] = 
                inoutfile->fcounters[CP_F_FASTEST_RANK_TIME];
        }

        /* max */
        if(infile->fcounters[CP_F_SLOWEST_RANK_TIME] >
           inoutfile->fcounters[CP_F_SLOWEST_RANK_TIME])
        {
            tmp_file.counters[CP_SLOWEST_RANK] =
                infile->counters[CP_SLOWEST_RANK];
            tmp_file.counters[CP_SLOWEST_RANK_BYTES] =
                infile->counters[CP_SLOWEST_RANK_BYTES];
            tmp_file.fcounters[CP_F_SLOWEST_RANK_TIME] = 
                infile->fcounters[CP_F_SLOWEST_RANK_TIME];
        }
        else
        {
            tmp_file.counters[CP_SLOWEST_RANK] =