darshan-mpiio.c 46.8 KB
Newer Older
Philip Carns's avatar
Philip Carns committed
1
/*
Shane Snyder's avatar
Shane Snyder committed
2
3
4
 * Copyright (C) 2015 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
Philip Carns's avatar
Philip Carns committed
5
6
 */

7
8
9
#define _XOPEN_SOURCE 500
#define _GNU_SOURCE

Philip Carns's avatar
Philip Carns committed
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#include "darshan-runtime-config.h"
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdarg.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#include <errno.h>
#include <search.h>
#include <assert.h>
#include <pthread.h>

#include "uthash.h"
26

Philip Carns's avatar
Philip Carns committed
27
#include "darshan.h"
Shane Snyder's avatar
Shane Snyder committed
28
#include "darshan-dynamic.h"
Philip Carns's avatar
Philip Carns committed
29

30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/* The mpiio_file_runtime structure maintains necessary runtime metadata
 * for the MPIIO file record (darshan_mpiio_file structure, defined in
 * darshan-mpiio-log-format.h) pointed to by 'file_record'. This metadata
 * assists with the instrumenting of specific statistics in the file record.
 * 'hlink' is a hash table link structure used to add/remove this record
 * from the hash table of MPIIO file records for this process. 
 *
 * RATIONALE: the MPIIO module needs to track some stateful, volatile 
 * information about each open file (like the current file offset, most recent 
 * access time, etc.) to aid in instrumentation, but this information can't be
 * stored in the darshan_mpiio_file struct because we don't want it to appear in
 * the final darshan log file.  We therefore associate a mpiio_file_runtime
 * struct with each darshan_mpiio_file struct in order to track this information.
 *
 * NOTE: There is a one-to-one mapping of mpiio_file_runtime structs to
 * darshan_mpiio_file structs.
 *
 * NOTE: The mpiio_file_runtime struct contains a pointer to a darshan_mpiio_file
 * struct (see the *file_record member) rather than simply embedding an entire
 * darshan_mpiio_file struct.  This is done so that all of the darshan_mpiio_file
 * structs can be kept contiguous in memory as a single array to simplify
 * reduction, compression, and storage.
 */
53
struct mpiio_file_runtime
Philip Carns's avatar
Philip Carns committed
54
55
{
    struct darshan_mpiio_file* file_record;
Shane Snyder's avatar
Shane Snyder committed
56
    enum darshan_io_type last_io_type;
57
58
59
    double last_meta_end;
    double last_read_end;
    double last_write_end;
60
61
    void *access_root;
    int access_count;
Philip Carns's avatar
Philip Carns committed
62
63
64
    UT_hash_handle hlink;
};

65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
/* The mpiio_file_runtime_ref structure is used to associate a MPIIO
 * file handle with an already existing MPIIO file record. This is
 * necessary as many MPIIO I/O functions take only a file handle as input,
 * but MPIIO file records are indexed by their full file paths (i.e., darshan
 * record identifiers for MPIIO files are created by hashing the file path).
 * In other words, this structure is necessary as it allows us to look up a
 * file record either by a pathname (mpiio_file_runtime) or by MPI file
 * handle (mpiio_file_runtime_ref), depending on which parameters are
 * available. This structure includes another hash table link, since separate
 * hashes are maintained for mpiio_file_runtime structures and mpiio_file_runtime_ref
 * structures.
 *
 * RATIONALE: In theory the file handle information could be included in the
 * mpiio_file_runtime struct rather than in a separate structure here.  The
 * reason we don't do that is to handle the potential for an MPI implementation
 * to produce a new file handle instance each time MPI_File_open() is called on a
 * file.  Thus there might be multiple file handles referring to the same
 * underlying record.
 *
 * NOTE: there are potentially multiple mpiio_file_runtime_ref structures
 * referring to a single mpiio_file_runtime structure.  Most of the time there is
 * only one, however.
 */
88
struct mpiio_file_runtime_ref
Philip Carns's avatar
Philip Carns committed
89
{
90
    struct mpiio_file_runtime* file;
91
    MPI_File fh;
Philip Carns's avatar
Philip Carns committed
92
93
94
    UT_hash_handle hlink;
};

95
96
97
98
/* The mpiio_runtime structure maintains necessary state for storing
 * MPI-IO file records and for coordinating with darshan-core at 
 * shutdown time.
 */
Philip Carns's avatar
Philip Carns committed
99
100
struct mpiio_runtime
{
    /* array of per-file runtime state */
    struct mpiio_file_runtime* file_runtime_array;
    /* array of log-format file records; kept as one contiguous array */
    struct darshan_mpiio_file* file_record_array;
    /* NOTE(review): presumably the next free slot in the arrays above --
     * confirm against the record allocation code
     */
    int file_array_ndx;
    /* uthash head for mpiio_file_runtime entries */
    struct mpiio_file_runtime* file_hash;
    /* uthash head for mpiio_file_runtime_ref entries */
    struct mpiio_file_runtime_ref* fh_hash;
};

/* module state; stays NULL until mpiio_runtime_initialize() allocates it */
static struct mpiio_runtime *mpiio_runtime = NULL;
/* recursive mutex taken by MPIIO_LOCK()/MPIIO_UNLOCK() around all wrapper bookkeeping */
static pthread_mutex_t mpiio_runtime_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
/* when nonzero, mpiio_runtime_initialize() returns without doing anything */
static int instrumentation_disabled = 0;
/* passed by address to darshan_core_register_module(); -1 until then */
static int my_rank = -1;

Philip Carns's avatar
Philip Carns committed
113
static void mpiio_runtime_initialize(void);
114
static struct mpiio_file_runtime* mpiio_file_by_name(const char *name);
115
116
117
static struct mpiio_file_runtime* mpiio_file_by_name_setfh(const char* name, MPI_File fh);
static struct mpiio_file_runtime* mpiio_file_by_fh(MPI_File fh);
static void mpiio_file_close_fh(MPI_File fh);
118
static int mpiio_record_compare(const void* a, const void* b);
Philip Carns's avatar
Philip Carns committed
119
120
static void mpiio_record_reduction_op(void* infile_v, void* inoutfile_v,
    int *len, MPI_Datatype *datatype);
121
122
123
static void mpiio_shared_record_variance(MPI_Comm mod_comm,
    struct darshan_mpiio_file *inrec_array, struct darshan_mpiio_file *outrec_array,
    int shared_rec_count);
124
125
126
127

static void mpiio_begin_shutdown(void);
static void mpiio_get_output_data(MPI_Comm mod_comm, darshan_record_id *shared_recs,
    int shared_rec_count, void **mpiio_buf, int *mpiio_buf_sz);
128
129
130
131
132
static void mpiio_shutdown(void);

/* acquire/release mpiio_runtime_mutex; every wrapper brackets its record
 * updates with this pair
 */
#define MPIIO_LOCK() pthread_mutex_lock(&mpiio_runtime_mutex)
#define MPIIO_UNLOCK() pthread_mutex_unlock(&mpiio_runtime_mutex)

133
134
135
/* MPIIO_RECORD_READ: fold one read-type MPI-IO call into the record of the
 * file referenced by __fh.
 *
 *   __ret       return code of the PMPI call; nothing is recorded unless it
 *               is MPI_SUCCESS
 *   __fh        MPI file handle used to look up the record; nothing is
 *               recorded if no record is found
 *   __count     element count passed to the MPI call
 *   __datatype  MPI datatype of each element (size queried via PMPI_Type_size)
 *   __counter   index of the operation counter to increment
 *   __tm1/__tm2 timestamps taken immediately before/after the PMPI call
 *
 * The macro declares the locals 'file', 'size', '__nbytes' and '__elapsed',
 * so callers must not pass arguments spelled with those names.  It uses
 * 'break' to leave its own do/while(0) early.
 *
 * The byte count is computed in long long so that count * type size does not
 * overflow int for large transfers, and every macro argument is parenthesized
 * where it appears inside an expression.
 */
#define MPIIO_RECORD_READ(__ret, __fh, __count, __datatype, __counter, __tm1, __tm2) do { \
    struct mpiio_file_runtime* file; \
    int size = 0; \
    long long __nbytes; \
    double __elapsed = (__tm2)-(__tm1); \
    if((__ret) != MPI_SUCCESS) break; \
    file = mpiio_file_by_fh(__fh); \
    if(!file) break; \
    DARSHAN_MPI_CALL(PMPI_Type_size)((__datatype), &size);  \
    __nbytes = (long long)size * (__count); \
    DARSHAN_BUCKET_INC(&(file->file_record->counters[MPIIO_SIZE_READ_AGG_0_100]), __nbytes); \
    darshan_common_val_counter(&file->access_root, &file->access_count, __nbytes, \
        &(file->file_record->counters[MPIIO_ACCESS1_ACCESS]), \
        &(file->file_record->counters[MPIIO_ACCESS1_COUNT])); \
    file->file_record->counters[MPIIO_BYTES_READ] += __nbytes; \
    file->file_record->counters[(__counter)] += 1; \
    if(file->last_io_type == DARSHAN_IO_WRITE) \
        file->file_record->counters[MPIIO_RW_SWITCHES] += 1; \
    file->last_io_type = DARSHAN_IO_READ; \
    if(file->file_record->fcounters[MPIIO_F_READ_START_TIMESTAMP] == 0) \
        file->file_record->fcounters[MPIIO_F_READ_START_TIMESTAMP] = (__tm1); \
    file->file_record->fcounters[MPIIO_F_READ_END_TIMESTAMP] = (__tm2); \
    if(file->file_record->fcounters[MPIIO_F_MAX_READ_TIME] < __elapsed) { \
        file->file_record->fcounters[MPIIO_F_MAX_READ_TIME] = __elapsed; \
        file->file_record->counters[MPIIO_MAX_READ_TIME_SIZE] = __nbytes; } \
    DARSHAN_TIMER_INC_NO_OVERLAP(file->file_record->fcounters[MPIIO_F_READ_TIME], \
        (__tm1), (__tm2), file->last_read_end); \
} while(0)
160

161
162
163
/* MPIIO_RECORD_WRITE: fold one write-type MPI-IO call into the record of the
 * file referenced by __fh.
 *
 *   __ret       return code of the PMPI call; nothing is recorded unless it
 *               is MPI_SUCCESS
 *   __fh        MPI file handle used to look up the record; nothing is
 *               recorded if no record is found
 *   __count     element count passed to the MPI call
 *   __datatype  MPI datatype of each element (size queried via PMPI_Type_size)
 *   __counter   index of the operation counter to increment
 *   __tm1/__tm2 timestamps taken immediately before/after the PMPI call
 *
 * The macro declares the locals 'file', 'size', '__nbytes' and '__elapsed',
 * so callers must not pass arguments spelled with those names.  It uses
 * 'break' to leave its own do/while(0) early.
 *
 * The byte count is computed in long long so that count * type size does not
 * overflow int for large transfers, and every macro argument is parenthesized
 * where it appears inside an expression.
 */
#define MPIIO_RECORD_WRITE(__ret, __fh, __count, __datatype, __counter, __tm1, __tm2) do { \
    struct mpiio_file_runtime* file; \
    int size = 0; \
    long long __nbytes; \
    double __elapsed = (__tm2)-(__tm1); \
    if((__ret) != MPI_SUCCESS) break; \
    file = mpiio_file_by_fh(__fh); \
    if(!file) break; \
    DARSHAN_MPI_CALL(PMPI_Type_size)((__datatype), &size);  \
    __nbytes = (long long)size * (__count); \
    DARSHAN_BUCKET_INC(&(file->file_record->counters[MPIIO_SIZE_WRITE_AGG_0_100]), __nbytes); \
    darshan_common_val_counter(&file->access_root, &file->access_count, __nbytes, \
        &(file->file_record->counters[MPIIO_ACCESS1_ACCESS]), \
        &(file->file_record->counters[MPIIO_ACCESS1_COUNT])); \
    file->file_record->counters[MPIIO_BYTES_WRITTEN] += __nbytes; \
    file->file_record->counters[(__counter)] += 1; \
    if(file->last_io_type == DARSHAN_IO_READ) \
        file->file_record->counters[MPIIO_RW_SWITCHES] += 1; \
    file->last_io_type = DARSHAN_IO_WRITE; \
    if(file->file_record->fcounters[MPIIO_F_WRITE_START_TIMESTAMP] == 0) \
        file->file_record->fcounters[MPIIO_F_WRITE_START_TIMESTAMP] = (__tm1); \
    file->file_record->fcounters[MPIIO_F_WRITE_END_TIMESTAMP] = (__tm2); \
    if(file->file_record->fcounters[MPIIO_F_MAX_WRITE_TIME] < __elapsed) { \
        file->file_record->fcounters[MPIIO_F_MAX_WRITE_TIME] = __elapsed; \
        file->file_record->counters[MPIIO_MAX_WRITE_TIME_SIZE] = __nbytes; } \
    DARSHAN_TIMER_INC_NO_OVERLAP(file->file_record->fcounters[MPIIO_F_WRITE_TIME], \
        (__tm1), (__tm2), file->last_write_end); \
} while(0)

189
190
191
/**********************************************************
 *        Wrappers for MPI-IO functions of interest       * 
 **********************************************************/
Philip Carns's avatar
Philip Carns committed
192

Philip Carns's avatar
Philip Carns committed
193
194
195
196
197
198
199
/* Wrapper for MPI_File_open(): forwards to PMPI_File_open and, when the open
 * succeeds, associates the new file handle with the record for 'filename'
 * and updates that record's mode, open, hint and metadata-time counters.
 *
 * Returns whatever PMPI_File_open returned; nothing is recorded on failure
 * or when no record can be obtained for the file.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_open(MPI_Comm comm, const char *filename, int amode, MPI_Info info, MPI_File *fh) 
#else
int MPI_File_open(MPI_Comm comm, char *filename, int amode, MPI_Info info, MPI_File *fh) 
#endif
{
    int ret;
    struct mpiio_file_runtime* file;
    char* tmp;
    int comm_size;
    double tm1, tm2;

    tm1 = darshan_core_wtime();
    ret = DARSHAN_MPI_CALL(PMPI_File_open)(comm, filename, amode, info, fh);
    tm2 = darshan_core_wtime();

    if(ret == MPI_SUCCESS)
    {
        MPIIO_LOCK();
        mpiio_runtime_initialize();

        /* use ROMIO approach to strip prefix if present */
        /* strip off prefix if there is one, but only skip prefixes
         * if they are greater than length one to allow for windows
         * drive specifications (e.g. c:\...) 
         */
        tmp = strchr(filename, ':');
        /* strchr() returns NULL when there is no ':'; test for that
         * explicitly rather than relying on a relational comparison
         * against a null pointer
         */
        if (tmp != NULL && tmp > filename + 1) {
            filename = tmp + 1;
        }

        file = mpiio_file_by_name_setfh(filename, (*fh));
        if(file)
        {
            file->file_record->counters[MPIIO_MODE] = amode;
            /* an open on a single-process communicator is counted as
             * independent, anything larger as collective
             */
            DARSHAN_MPI_CALL(PMPI_Comm_size)(comm, &comm_size);
            if(comm_size == 1)
            {
                file->file_record->counters[MPIIO_INDEP_OPENS] += 1;
            }
            else
            {
                file->file_record->counters[MPIIO_COLL_OPENS] += 1;
            }
            if(info != MPI_INFO_NULL)
            {
                file->file_record->counters[MPIIO_HINTS] += 1;
            }
            /* only the first open sets the open timestamp */
            if(file->file_record->fcounters[MPIIO_F_OPEN_TIMESTAMP] == 0)
                file->file_record->fcounters[MPIIO_F_OPEN_TIMESTAMP] = tm1;
            DARSHAN_TIMER_INC_NO_OVERLAP(
                file->file_record->fcounters[MPIIO_F_META_TIME],
                tm1, tm2, file->last_meta_end);
        }

        MPIIO_UNLOCK();
    }

    return(ret);
}

254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
/* Wrapper for MPI_File_read(): times the PMPI call and records it as an
 * independent read (MPIIO_INDEP_READS) under the module lock.
 * Returns the PMPI call's return code unchanged.
 */
int MPI_File_read(MPI_File fh, void *buf, int count,
    MPI_Datatype datatype, MPI_Status *status)
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_read)(fh, buf, count, datatype, status);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_INDEP_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write(): times the PMPI call and records it as an
 * independent write (MPIIO_INDEP_WRITES) under the module lock.
 * Returns the PMPI call's return code unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write(MPI_File fh, const void *buf, int count,
    MPI_Datatype datatype, MPI_Status *status)
#else
int MPI_File_write(MPI_File fh, void *buf, int count,
    MPI_Datatype datatype, MPI_Status *status)
#endif
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_write)(fh, buf, count, datatype, status);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_INDEP_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_at(): times the PMPI call and records it as an
 * independent read (MPIIO_INDEP_READS) under the module lock.
 * Returns the PMPI call's return code unchanged.
 */
int MPI_File_read_at(MPI_File fh, MPI_Offset offset, void *buf,
    int count, MPI_Datatype datatype, MPI_Status *status)
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_read_at)(fh, offset, buf,
        count, datatype, status);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_INDEP_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_at(): times the PMPI call and records it as an
 * independent write (MPIIO_INDEP_WRITES) under the module lock.
 * Returns the PMPI call's return code unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_at(MPI_File fh, MPI_Offset offset, const void *buf,
    int count, MPI_Datatype datatype, MPI_Status *status)
#else
int MPI_File_write_at(MPI_File fh, MPI_Offset offset, void *buf,
    int count, MPI_Datatype datatype, MPI_Status *status)
#endif
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_write_at)(fh, offset, buf,
        count, datatype, status);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_INDEP_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_all(): times the PMPI call and records it as a
 * collective read (MPIIO_COLL_READS) under the module lock.
 * Returns the PMPI call's return code unchanged.
 */
int MPI_File_read_all(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_read_all)(fh, buf, count,
        datatype, status);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_COLL_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_all(): times the PMPI call and records it as a
 * collective write (MPIIO_COLL_WRITES) under the module lock.
 * Returns the PMPI call's return code unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_all(MPI_File fh, const void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
#else
int MPI_File_write_all(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
#endif
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_write_all)(fh, buf, count,
        datatype, status);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_COLL_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_at_all(): times the PMPI call and records it as
 * a collective read (MPIIO_COLL_READS) under the module lock.
 * Returns the PMPI call's return code unchanged.
 */
int MPI_File_read_at_all(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, MPI_Status * status)
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_read_at_all)(fh, offset, buf,
        count, datatype, status);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_COLL_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_at_all(): times the PMPI call and records it as
 * a collective write (MPIIO_COLL_WRITES) under the module lock.
 * Returns the PMPI call's return code unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_at_all(MPI_File fh, MPI_Offset offset, const void * buf,
    int count, MPI_Datatype datatype, MPI_Status * status)
#else
int MPI_File_write_at_all(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, MPI_Status * status)
#endif
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_write_at_all)(fh, offset, buf,
        count, datatype, status);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_COLL_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
/* Wrapper for MPI_File_read_shared(): times the PMPI call and records it as
 * an independent read (MPIIO_INDEP_READS) under the module lock.
 * Returns the PMPI call's return code unchanged.
 */
int MPI_File_read_shared(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_read_shared)(fh, buf, count,
        datatype, status);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_INDEP_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_shared(): times the PMPI call and records it as
 * an independent write (MPIIO_INDEP_WRITES) under the module lock.
 * Returns the PMPI call's return code unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_shared(MPI_File fh, const void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
#else
int MPI_File_write_shared(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
#endif
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_write_shared)(fh, buf, count,
        datatype, status);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_INDEP_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_ordered(): times the PMPI call and records it as
 * a collective read (MPIIO_COLL_READS) under the module lock.
 * Returns the PMPI call's return code unchanged.
 */
int MPI_File_read_ordered(MPI_File fh, void * buf, int count,
    MPI_Datatype datatype, MPI_Status * status)
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_read_ordered)(fh, buf, count,
        datatype, status);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_COLL_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_ordered(): times the PMPI call and records it
 * as a collective write (MPIIO_COLL_WRITES) under the module lock.
 * Returns the PMPI call's return code unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_ordered(MPI_File fh, const void * buf, int count,
    MPI_Datatype datatype, MPI_Status * status)
#else
int MPI_File_write_ordered(MPI_File fh, void * buf, int count,
    MPI_Datatype datatype, MPI_Status * status)
#endif
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_write_ordered)(fh, buf, count,
         datatype, status);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_COLL_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_all_begin(): times the PMPI call and records it
 * as a split-collective read (MPIIO_SPLIT_READS) under the module lock.
 * Only the duration of the begin call itself is measured.
 * Returns the PMPI call's return code unchanged.
 */
int MPI_File_read_all_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_read_all_begin)(fh, buf, count, datatype);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_SPLIT_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_all_begin(): times the PMPI call and records it
 * as a split-collective write (MPIIO_SPLIT_WRITES) under the module lock.
 * Only the duration of the begin call itself is measured.
 * Returns the PMPI call's return code unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_all_begin(MPI_File fh, const void * buf, int count, MPI_Datatype datatype)
#else
int MPI_File_write_all_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
#endif
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_write_all_begin)(fh, buf, count, datatype);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_SPLIT_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_at_all_begin(): times the PMPI call and records
 * it as a split-collective read (MPIIO_SPLIT_READS) under the module lock.
 * Only the duration of the begin call itself is measured.
 * Returns the PMPI call's return code unchanged.
 */
int MPI_File_read_at_all_begin(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype)
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_read_at_all_begin)(fh, offset, buf,
        count, datatype);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_SPLIT_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_at_all_begin(): times the PMPI call and records
 * it as a split-collective write (MPIIO_SPLIT_WRITES) under the module lock.
 * Only the duration of the begin call itself is measured.
 * Returns the PMPI call's return code unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_at_all_begin(MPI_File fh, MPI_Offset offset, const void * buf,
    int count, MPI_Datatype datatype)
#else
int MPI_File_write_at_all_begin(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype)
#endif
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_write_at_all_begin)(fh, offset,
        buf, count, datatype);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_SPLIT_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_ordered_begin(): times the PMPI call and records
 * it as a split-collective read (MPIIO_SPLIT_READS) under the module lock.
 * Only the duration of the begin call itself is measured.
 * Returns the PMPI call's return code unchanged.
 */
int MPI_File_read_ordered_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_read_ordered_begin)(fh, buf, count,
        datatype);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_SPLIT_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_ordered_begin(): times the PMPI call and
 * records it as a split-collective write (MPIIO_SPLIT_WRITES) under the
 * module lock.  Only the duration of the begin call itself is measured.
 * Returns the PMPI call's return code unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_ordered_begin(MPI_File fh, const void * buf, int count, MPI_Datatype datatype)
#else
int MPI_File_write_ordered_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
#endif
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_write_ordered_begin)(fh, buf, count,
        datatype);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_SPLIT_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_iread(): times the PMPI call and records it as a
 * nonblocking read (MPIIO_NB_READS) under the module lock.  Only the
 * duration of the initiating call is measured.
 * Returns the PMPI call's return code unchanged.
 */
int MPI_File_iread(MPI_File fh, void * buf, int count, MPI_Datatype datatype, __D_MPI_REQUEST * request)
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_iread)(fh, buf, count, datatype, request);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_NB_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_iwrite(): times the PMPI call and records it as a
 * nonblocking write (MPIIO_NB_WRITES) under the module lock.  Only the
 * duration of the initiating call is measured.
 * Returns the PMPI call's return code unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_iwrite(MPI_File fh, const void * buf, int count,
    MPI_Datatype datatype, __D_MPI_REQUEST * request)
#else
int MPI_File_iwrite(MPI_File fh, void * buf, int count,
    MPI_Datatype datatype, __D_MPI_REQUEST * request)
#endif
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_iwrite)(fh, buf, count, datatype, request);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_NB_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_iread_at(): times the PMPI call and records it as a
 * nonblocking read (MPIIO_NB_READS) under the module lock.  Only the
 * duration of the initiating call is measured.
 * Returns the PMPI call's return code unchanged.
 */
int MPI_File_iread_at(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, __D_MPI_REQUEST *request)
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_iread_at)(fh, offset, buf, count,
        datatype, request);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_NB_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_iwrite_at(): times the PMPI call and records it as a
 * nonblocking write (MPIIO_NB_WRITES) under the module lock.  Only the
 * duration of the initiating call is measured.
 * Returns the PMPI call's return code unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_iwrite_at(MPI_File fh, MPI_Offset offset, const void * buf,
    int count, MPI_Datatype datatype, __D_MPI_REQUEST *request)
#else
int MPI_File_iwrite_at(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, __D_MPI_REQUEST *request)
#endif
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_iwrite_at)(fh, offset, buf,
        count, datatype, request);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_NB_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_iread_shared(): times the PMPI call and records it as
 * a nonblocking read (MPIIO_NB_READS) under the module lock.  Only the
 * duration of the initiating call is measured.
 * Returns the PMPI call's return code unchanged.
 */
int MPI_File_iread_shared(MPI_File fh, void * buf, int count,
    MPI_Datatype datatype, __D_MPI_REQUEST * request)
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_iread_shared)(fh, buf, count,
        datatype, request);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_NB_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_iwrite_shared(): times the PMPI call and records it
 * as a nonblocking write (MPIIO_NB_WRITES) under the module lock.  Only the
 * duration of the initiating call is measured.
 * Returns the PMPI call's return code unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_iwrite_shared(MPI_File fh, const void * buf, int count,
    MPI_Datatype datatype, __D_MPI_REQUEST * request)
#else
int MPI_File_iwrite_shared(MPI_File fh, void * buf, int count,
    MPI_Datatype datatype, __D_MPI_REQUEST * request)
#endif
{
    const double t_start = darshan_core_wtime();
    const int rc = DARSHAN_MPI_CALL(PMPI_File_iwrite_shared)(fh, buf, count,
        datatype, request);
    const double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_NB_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743

/* Wrapper for MPI_File_sync(): forwards to PMPI_File_sync and, on success,
 * increments MPIIO_SYNCS for the file's record and adds the elapsed time to
 * MPIIO_F_WRITE_TIME (sync time is accounted as write time).
 *
 * Returns the PMPI call's return code; nothing is recorded on failure or
 * when the handle has no associated record.
 */
int MPI_File_sync(MPI_File fh)
{
    int ret;
    struct mpiio_file_runtime* file;
    double tm1, tm2;

    tm1 = darshan_core_wtime();
    ret = DARSHAN_MPI_CALL(PMPI_File_sync)(fh);
    tm2 = darshan_core_wtime();

    if(ret == MPI_SUCCESS)
    {
        MPIIO_LOCK();
        mpiio_runtime_initialize();
        file = mpiio_file_by_fh(fh);
        if(file)
        {
            file->file_record->counters[MPIIO_SYNCS] += 1;
            DARSHAN_TIMER_INC_NO_OVERLAP(
                file->file_record->fcounters[MPIIO_F_WRITE_TIME],
                tm1, tm2, file->last_write_end);
        }
        MPIIO_UNLOCK();
    }

    return(ret);
}

/* Wrapper for MPI_File_set_view(): forwards to PMPI_File_set_view and, on
 * success, increments MPIIO_VIEWS for the file's record, increments
 * MPIIO_HINTS when an info object was supplied, and adds the elapsed time
 * to MPIIO_F_META_TIME.
 *
 * Returns the PMPI call's return code; nothing is recorded on failure or
 * when the handle has no associated record.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype,
    MPI_Datatype filetype, const char *datarep, MPI_Info info)
#else
int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype,
    MPI_Datatype filetype, char *datarep, MPI_Info info)
#endif
{
    int ret;
    struct mpiio_file_runtime* file;
    double tm1, tm2;

    tm1 = darshan_core_wtime();
    ret = DARSHAN_MPI_CALL(PMPI_File_set_view)(fh, disp, etype, filetype,
        datarep, info);
    tm2 = darshan_core_wtime();

    if(ret == MPI_SUCCESS)
    {
        MPIIO_LOCK();
        mpiio_runtime_initialize();
        file = mpiio_file_by_fh(fh);
        if(file)
        {
            file->file_record->counters[MPIIO_VIEWS] += 1;
            if(info != MPI_INFO_NULL)
            {
                file->file_record->counters[MPIIO_HINTS] += 1;
            }
            /* charge the call to metadata time whether or not hints were
             * passed, matching how open and close account for their time
             */
            DARSHAN_TIMER_INC_NO_OVERLAP(
                file->file_record->fcounters[MPIIO_F_META_TIME],
                tm1, tm2, file->last_meta_end);
        }
        MPIIO_UNLOCK();
    }

    return(ret);
}

794
795
796
797
/* Wrapper for MPI_File_close(): forwards to PMPI_File_close, then stamps
 * MPIIO_F_CLOSE_TIMESTAMP, adds the elapsed time to MPIIO_F_META_TIME and
 * drops the file handle reference for the closed handle.
 *
 * The record is updated regardless of the PMPI return code, as long as the
 * handle has an associated record.  Returns the PMPI call's return code.
 */
int MPI_File_close(MPI_File *fh)
{
    int ret;
    struct mpiio_file_runtime* file;
    /* copy the handle before the PMPI call so the lookup below still uses
     * the value the application passed in
     */
    MPI_File tmp_fh = *fh;
    double tm1, tm2;

    tm1 = darshan_core_wtime();
    ret = DARSHAN_MPI_CALL(PMPI_File_close)(fh);
    tm2 = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    file = mpiio_file_by_fh(tmp_fh);
    if(file)
    {
        file->file_record->fcounters[MPIIO_F_CLOSE_TIMESTAMP] =
            darshan_core_wtime();
        DARSHAN_TIMER_INC_NO_OVERLAP(
            file->file_record->fcounters[MPIIO_F_META_TIME],
            tm1, tm2, file->last_meta_end);
        mpiio_file_close_fh(tmp_fh);
    }
    MPIIO_UNLOCK();

    return(ret);
}

/***********************************************************
 * Internal functions for manipulating MPI-IO module state *
 ***********************************************************/

/* initialize data structures and register with darshan-core component */
Philip Carns's avatar
Philip Carns committed
827
828
829
830
static void mpiio_runtime_initialize()
{
    struct darshan_module_funcs mpiio_mod_fns =
    {
831
        .begin_shutdown = &mpiio_begin_shutdown,
Philip Carns's avatar
Philip Carns committed
832
833
834
        .get_output_data = &mpiio_get_output_data,
        .shutdown = &mpiio_shutdown
    };
835
836
837
    void *mpiio_buf;
    int mpiio_buf_size;
    int file_array_size;
Philip Carns's avatar
Philip Carns committed
838
839
840
841
842

    /* don't do anything if already initialized or instrumenation is disabled */
    if(mpiio_runtime || instrumentation_disabled)
        return;

843
844
845
    /* try and store the default number of records for this module */
    mpiio_buf_size = DARSHAN_DEF_MOD_REC_COUNT * sizeof(struct darshan_mpiio_file);

Philip Carns's avatar
Philip Carns committed
846
847
848
849
    /* register the mpiio module with darshan core */
    darshan_core_register_module(
        DARSHAN_MPIIO_MOD,
        &mpiio_mod_fns,
850
851
        &mpiio_buf_size,
        &mpiio_buf,
852
        &my_rank,
853
        NULL);
Philip Carns's avatar
Philip Carns committed
854

855
856
    /* return if darshan-core does not provide enough module memory */
    if(mpiio_buf_size < sizeof(struct darshan_mpiio_file))
857
858
    {
        darshan_core_unregister_module(DARSHAN_MPIIO_MOD);
Philip Carns's avatar
Philip Carns committed
859
        return;
860
    }
Philip Carns's avatar
Philip Carns committed
861
862
863

    mpiio_runtime = malloc(sizeof(*mpiio_runtime));
    if(!mpiio_runtime)
864
865
    {
        darshan_core_unregister_module(DARSHAN_MPIIO_MOD);
Philip Carns's avatar
Philip Carns committed
866
        return;
867
    }
Philip Carns's avatar
Philip Carns committed
868
869
    memset(mpiio_runtime, 0, sizeof(*mpiio_runtime));

870
871
872
873
    /* set number of trackable files for the MPIIO module according to the
     * amount of memory returned by darshan-core
     */
    file_array_size = mpiio_buf_size / sizeof(struct darshan_mpiio_file);
Philip Carns's avatar
Philip Carns committed
874
875
    mpiio_runtime->file_array_ndx = 0;

876
877
878
    /* store pointer to MPIIO record buffer given by darshan-core */
    mpiio_runtime->file_record_array = (struct darshan_mpiio_file *)mpiio_buf;

Philip Carns's avatar
Philip Carns committed
879
    /* allocate array of runtime file records */
880
    mpiio_runtime->file_runtime_array = malloc(file_array_size *
881
                                               sizeof(struct mpiio_file_runtime));
882
    if(!mpiio_runtime->file_runtime_array)
Philip Carns's avatar
Philip Carns committed
883
    {
884
885
886
        free(mpiio_runtime);
        mpiio_runtime = NULL;
        darshan_core_unregister_module(DARSHAN_MPIIO_MOD);
Philip Carns's avatar
Philip Carns committed
887
888
        return;
    }
889
    memset(mpiio_runtime->file_runtime_array, 0, file_array_size *
890
           sizeof(struct mpiio_file_runtime));
Philip Carns's avatar
Philip Carns committed
891
892
893
894

    return;
}

895
896
897
898
/* get a MPIIO file record for the given file path */
static struct mpiio_file_runtime* mpiio_file_by_name(const char *name)
{
    struct mpiio_file_runtime *file = NULL;
899
    struct darshan_mpiio_file *file_rec;
900
901
    char *newname = NULL;
    darshan_record_id file_id;
902
    int ret;
903
904
905
906
907
908
909
910

    if(!mpiio_runtime || instrumentation_disabled)
        return(NULL);

    newname = darshan_clean_file_path(name);
    if(!newname)
        newname = (char*)name;

911
912
    /* lookup the unique id for this filename */
    darshan_core_lookup_record(
913
        newname,
914
        &file_id);
915

916
917
    /* search the hash table for this file record, and return if found */
    HASH_FIND(hlink, mpiio_runtime->file_hash, &file_id, sizeof(darshan_record_id), file);
918
    if(!file)
919
    {
920
        /* register the record with the darshan core component */
921
        ret = darshan_core_register_record(file_id, newname, DARSHAN_MPIIO_MOD,
922
923
924
925
926
927
928
929
            sizeof(struct darshan_mpiio_file), NULL);
        if(ret == 1)
        {
            /* register was successful */
            file = &(mpiio_runtime->file_runtime_array[mpiio_runtime->file_array_ndx]);
            file->file_record =
                &(mpiio_runtime->file_record_array[mpiio_runtime->file_array_ndx]);
            file_rec = file->file_record;
930

931
932
            file_rec->base_rec.id = file_id;
            file_rec->base_rec.rank = my_rank;
933

934
935
936
937
938
939
            /* add new record to file hash table */
            HASH_ADD(hlink, mpiio_runtime->file_hash, file_record->base_rec.id,
                sizeof(darshan_record_id), file);
            mpiio_runtime->file_array_ndx++;
        }
    }
940
941
942
943
944
945
946
947
948

    if(newname != name)
        free(newname);
    return(file);
}

/* get an MPIIO file record for the given file path, and also create a
 * reference structure using the corresponding file handle
 */
949
static struct mpiio_file_runtime* mpiio_file_by_name_setfh(const char* name, MPI_File fh)
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
{
    struct mpiio_file_runtime* file;
    struct mpiio_file_runtime_ref* ref;

    if(!mpiio_runtime || instrumentation_disabled)
        return(NULL);

    /* find file record by name first */
    file = mpiio_file_by_name(name);

    if(!file)
        return(NULL);

    /* search hash table for existing file ref for this fh */
    HASH_FIND(hlink, mpiio_runtime->fh_hash, &fh, sizeof(fh), ref);
    if(ref)
    {
        /* we have a reference.  Make sure it points to the correct file
         * and return it
         */
        ref->file = file;
        return(file);
    }

    /* if we hit this point, then we don't have a reference for this fh
     * in the table yet.  Add it.
     */
    ref = malloc(sizeof(*ref));
    if(!ref)
        return(NULL);
    memset(ref, 0, sizeof(*ref));

    ref->file = file;
    ref->fh = fh;    
    HASH_ADD(hlink, mpiio_runtime->fh_hash, fh, sizeof(fh), ref);

    return(file);
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
}

/* Return the MPIIO runtime record referenced by the given file handle, or
 * NULL when the module is inactive or the handle is not being tracked.
 */
static struct mpiio_file_runtime* mpiio_file_by_fh(MPI_File fh)
{
    struct mpiio_file_runtime_ref *fh_ref;

    if(instrumentation_disabled || !mpiio_runtime)
        return(NULL);

    /* the handle value itself is the hash key */
    HASH_FIND(hlink, mpiio_runtime->fh_hash, &fh, sizeof(fh), fh_ref);

    return(fh_ref ? fh_ref->file : NULL);
}

/* Remove and free the reference entry for the given file handle, if one
 * exists.  The file record the reference pointed to is left untouched.
 */
static void mpiio_file_close_fh(MPI_File fh)
{
    struct mpiio_file_runtime_ref *fh_ref;

    if(instrumentation_disabled || !mpiio_runtime)
        return;

    /* look the handle up; nothing to do when it is not tracked */
    HASH_FIND(hlink, mpiio_runtime->fh_hash, &fh, sizeof(fh), fh_ref);
    if(fh_ref == NULL)
        return;

    /* unlink the reference from the table before releasing it */
    HASH_DELETE(hlink, mpiio_runtime->fh_hash, fh_ref);
    free(fh_ref);
}

1025
/* compare function for sorting file records by descending rank */
1026
static int mpiio_record_compare(const void* a_p, const void* b_p)
Philip Carns's avatar
Philip Carns committed
1027
{
1028
1029
    const struct darshan_mpiio_file* a = a_p;
    const struct darshan_mpiio_file* b = b_p;
Philip Carns's avatar
Philip Carns committed
1030

1031
    if(a->base_rec.rank < b->base_rec.rank)
1032
        return 1;
1033
    if(a->base_rec.rank > b->base_rec.rank)
1034
        return -1;
Philip Carns's avatar
Philip Carns committed
1035

1036
1037
    return 0;
}
Philip Carns's avatar
Philip Carns committed
1038

1039
1040
1041
1042
1043
/* Reduction operator (registered through PMPI_Op_create) that merges MPIIO
 * file records element-wise.
 *
 * infile_v    - array of *len incoming records; note that the common access
 *               size counters of these records are modified while
 *               duplicates are collapsed
 * inoutfile_v - array of *len records; each is combined with the matching
 *               incoming record and overwritten with the merged result
 * len         - number of records in each array
 * datatype    - unused
 *
 * Every merged record gets rank -1 and the id of the incoming record.
 */
static void mpiio_record_reduction_op(
    void* infile_v,
    void* inoutfile_v,
    int *len,
    MPI_Datatype *datatype)
{
    struct darshan_mpiio_file tmp_file;
    struct darshan_mpiio_file *infile = infile_v;
    struct darshan_mpiio_file *inoutfile = inoutfile_v;
    int i, j, k;

    (void)datatype;

    assert(mpiio_runtime);

    for(i=0; i<*len; i++)
    {
        memset(&tmp_file, 0, sizeof(struct darshan_mpiio_file));
        tmp_file.base_rec.id = infile->base_rec.id;
        tmp_file.base_rec.rank = -1;

        /* sum */
        for(j=MPIIO_INDEP_OPENS; j<=MPIIO_VIEWS; j++)
        {
            tmp_file.counters[j] = infile->counters[j] + inoutfile->counters[j];
        }

        /* the mode is taken from the incoming record as-is */
        tmp_file.counters[MPIIO_MODE] = infile->counters[MPIIO_MODE];

        /* sum */
        for(j=MPIIO_BYTES_READ; j<=MPIIO_RW_SWITCHES; j++)
        {
            tmp_file.counters[j] = infile->counters[j] + inoutfile->counters[j];
        }

        /* skip MPIIO_MAX_*_TIME_SIZE; handled in floating point section */

        /* sum */
        for(j=MPIIO_SIZE_READ_AGG_0_100; j<=MPIIO_SIZE_WRITE_AGG_1G_PLUS; j++)
        {
            tmp_file.counters[j] = infile->counters[j] + inoutfile->counters[j];
        }

        /* first collapse any duplicates: when both records list the same
         * access size, fold the inout count into the incoming record and
         * clear the inout entry (the count for an access slot is stored 4
         * counters after the access value itself)
         */
        for(j=MPIIO_ACCESS1_ACCESS; j<=MPIIO_ACCESS4_ACCESS; j++)
        {
            for(k=MPIIO_ACCESS1_ACCESS; k<=MPIIO_ACCESS4_ACCESS; k++)
            {
                if(infile->counters[j] == inoutfile->counters[k])
                {
                    infile->counters[j+4] += inoutfile->counters[k+4];
                    inoutfile->counters[k] = 0;
                    inoutfile->counters[k+4] = 0;
                }
            }
        }

        /* first set */
        for(j=MPIIO_ACCESS1_ACCESS; j<=MPIIO_ACCESS4_ACCESS; j++)
        {
            DARSHAN_COMMON_VAL_COUNTER_INC(&(tmp_file.counters[MPIIO_ACCESS1_ACCESS]),
                &(tmp_file.counters[MPIIO_ACCESS1_COUNT]), infile->counters[j],
                infile->counters[j+4], 0);
        }

        /* second set */
        for(j=MPIIO_ACCESS1_ACCESS; j<=MPIIO_ACCESS4_ACCESS; j++)
        {
            DARSHAN_COMMON_VAL_COUNTER_INC(&(tmp_file.counters[MPIIO_ACCESS1_ACCESS]),
                &(tmp_file.counters[MPIIO_ACCESS1_COUNT]), inoutfile->counters[j],
                inoutfile->counters[j+4], 0);
        }

        /* min non-zero (if available) value: take the incoming value when
         * it is the smaller non-zero one or when the inout value is zero;
         * otherwise take the inout value.  (A zero incoming value must not
         * win over a non-zero inout value.)
         */
        for(j=MPIIO_F_OPEN_TIMESTAMP; j<=MPIIO_F_WRITE_START_TIMESTAMP; j++)
        {
            if((infile->fcounters[j] < inoutfile->fcounters[j] &&
                infile->fcounters[j] > 0) || inoutfile->fcounters[j] == 0)
                tmp_file.fcounters[j] = infile->fcounters[j];
            else
                tmp_file.fcounters[j] = inoutfile->fcounters[j];
        }

        /* max */
        for(j=MPIIO_F_READ_END_TIMESTAMP; j<= MPIIO_F_CLOSE_TIMESTAMP; j++)
        {
            if(infile->fcounters[j] > inoutfile->fcounters[j])
                tmp_file.fcounters[j] = infile->fcounters[j];
            else
                tmp_file.fcounters[j] = inoutfile->fcounters[j];
        }

        /* sum */
        for(j=MPIIO_F_READ_TIME; j<=MPIIO_F_META_TIME; j++)
        {
            tmp_file.fcounters[j] = infile->fcounters[j] + inoutfile->fcounters[j];
        }

        /* max (special case): the size counter travels with the time */
        if(infile->fcounters[MPIIO_F_MAX_READ_TIME] >
            inoutfile->fcounters[MPIIO_F_MAX_READ_TIME])
        {
            tmp_file.fcounters[MPIIO_F_MAX_READ_TIME] =
                infile->fcounters[MPIIO_F_MAX_READ_TIME];
            tmp_file.counters[MPIIO_MAX_READ_TIME_SIZE] =
                infile->counters[MPIIO_MAX_READ_TIME_SIZE];
        }
        else
        {
            tmp_file.fcounters[MPIIO_F_MAX_READ_TIME] =
                inoutfile->fcounters[MPIIO_F_MAX_READ_TIME];
            tmp_file.counters[MPIIO_MAX_READ_TIME_SIZE] =
                inoutfile->counters[MPIIO_MAX_READ_TIME_SIZE];
        }

        if(infile->fcounters[MPIIO_F_MAX_WRITE_TIME] >
            inoutfile->fcounters[MPIIO_F_MAX_WRITE_TIME])
        {
            tmp_file.fcounters[MPIIO_F_MAX_WRITE_TIME] =
                infile->fcounters[MPIIO_F_MAX_WRITE_TIME];
            tmp_file.counters[MPIIO_MAX_WRITE_TIME_SIZE] =
                infile->counters[MPIIO_MAX_WRITE_TIME_SIZE];
        }
        else
        {
            tmp_file.fcounters[MPIIO_F_MAX_WRITE_TIME] =
                inoutfile->fcounters[MPIIO_F_MAX_WRITE_TIME];
            tmp_file.counters[MPIIO_MAX_WRITE_TIME_SIZE] =
                inoutfile->counters[MPIIO_MAX_WRITE_TIME_SIZE];
        }

        /* min (zeroes are ok here; some procs don't do I/O) */
        if(infile->fcounters[MPIIO_F_FASTEST_RANK_TIME] <
            inoutfile->fcounters[MPIIO_F_FASTEST_RANK_TIME])
        {
            tmp_file.counters[MPIIO_FASTEST_RANK] =
                infile->counters[MPIIO_FASTEST_RANK];
            tmp_file.counters[MPIIO_FASTEST_RANK_BYTES] =
                infile->counters[MPIIO_FASTEST_RANK_BYTES];
            tmp_file.fcounters[MPIIO_F_FASTEST_RANK_TIME] =
                infile->fcounters[MPIIO_F_FASTEST_RANK_TIME];
        }
        else
        {
            tmp_file.counters[MPIIO_FASTEST_RANK] =
                inoutfile->counters[MPIIO_FASTEST_RANK];
            tmp_file.counters[MPIIO_FASTEST_RANK_BYTES] =
                inoutfile->counters[MPIIO_FASTEST_RANK_BYTES];
            tmp_file.fcounters[MPIIO_F_FASTEST_RANK_TIME] =
                inoutfile->fcounters[MPIIO_F_FASTEST_RANK_TIME];
        }

        /* max */
        if(infile->fcounters[MPIIO_F_SLOWEST_RANK_TIME] >
           inoutfile->fcounters[MPIIO_F_SLOWEST_RANK_TIME])
        {
            tmp_file.counters[MPIIO_SLOWEST_RANK] =
                infile->counters[MPIIO_SLOWEST_RANK];
            tmp_file.counters[MPIIO_SLOWEST_RANK_BYTES] =
                infile->counters[MPIIO_SLOWEST_RANK_BYTES];
            tmp_file.fcounters[MPIIO_F_SLOWEST_RANK_TIME] =
                infile->fcounters[MPIIO_F_SLOWEST_RANK_TIME];
        }
        else
        {
            tmp_file.counters[MPIIO_SLOWEST_RANK] =
                inoutfile->counters[MPIIO_SLOWEST_RANK];
            tmp_file.counters[MPIIO_SLOWEST_RANK_BYTES] =
                inoutfile->counters[MPIIO_SLOWEST_RANK_BYTES];
            tmp_file.fcounters[MPIIO_F_SLOWEST_RANK_TIME] =
                inoutfile->fcounters[MPIIO_F_SLOWEST_RANK_TIME];
        }

        /* store the merged record and advance to the next pair */
        *inoutfile = tmp_file;
        inoutfile++;
        infile++;
    }

    return;
}

1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
/* Compute per-record variance values for shared files across the ranks of
 * 'mod_comm' and store them in the reduced records on rank 0.
 *
 * mod_comm         - communicator used for the reductions
 * inrec_array      - this rank's shared records (reduction input)
 * outrec_array     - records that receive the results; only written on
 *                    rank 0
 * shared_rec_count - number of records in each array
 *
 * Two reductions are performed with darshan_variance_reduce: one over the
 * total I/O time (read + write + meta) and one over the total bytes moved
 * (read + written).  For each, rank 0 stores S / n of the reduced value in
 * MPIIO_F_VARIANCE_RANK_TIME and MPIIO_F_VARIANCE_RANK_BYTES respectively.
 */
static void mpiio_shared_record_variance(MPI_Comm mod_comm,
    struct darshan_mpiio_file *inrec_array, struct darshan_mpiio_file *outrec_array,
    int shared_rec_count)
{
    MPI_Datatype var_dt;
    MPI_Op var_op;
    int i;
    struct darshan_variance_dt *var_send_buf = NULL;
    struct darshan_variance_dt *var_recv_buf = NULL;

    /* opaque byte-blob datatype sized to one variance element */
    DARSHAN_MPI_CALL(PMPI_Type_contiguous)(sizeof(struct darshan_variance_dt),
        MPI_BYTE, &var_dt);
    DARSHAN_MPI_CALL(PMPI_Type_commit)(&var_dt);

    DARSHAN_MPI_CALL(PMPI_Op_create)(darshan_variance_reduce, 1, &var_op);

    /* On allocation failure, bail out through the common cleanup path so
     * the datatype, the op and any buffer already obtained are released.
     * NOTE(review): a rank that bails out here does not take part in the
     * PMPI_Reduce calls below -- confirm how the other ranks cope with that.
     */
    var_send_buf = malloc(shared_rec_count * sizeof(struct darshan_variance_dt));
    if(!var_send_buf)
        goto cleanup;

    /* only the root of the reduction needs a receive buffer */
    if(my_rank == 0)
    {
        var_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_variance_dt));
        if(!var_recv_buf)
            goto cleanup;
    }

    /* get total i/o time variances for shared records */

    for(i=0; i<shared_rec_count; i++)
    {
        var_send_buf[i].n = 1;
        var_send_buf[i].S = 0;
        var_send_buf[i].T = inrec_array[i].fcounters[MPIIO_F_READ_TIME] +
                            inrec_array[i].fcounters[MPIIO_F_WRITE_TIME] +
                            inrec_array[i].fcounters[MPIIO_F_META_TIME];
    }

    DARSHAN_MPI_CALL(PMPI_Reduce)(var_send_buf, var_recv_buf, shared_rec_count,
        var_dt, var_op, 0, mod_comm);

    if(my_rank == 0)
    {
        for(i=0; i<shared_rec_count; i++)
        {
            outrec_array[i].fcounters[MPIIO_F_VARIANCE_RANK_TIME] =
                (var_recv_buf[i].S / var_recv_buf[i].n);
        }
    }

    /* get total bytes moved variances for shared records */

    for(i=0; i<shared_rec_count; i++)
    {
        var_send_buf[i].n = 1;
        var_send_buf[i].S = 0;
        var_send_buf[i].T = (double)
                            inrec_array[i].counters[MPIIO_BYTES_READ] +
                            inrec_array[i].counters[MPIIO_BYTES_WRITTEN];
    }

    DARSHAN_MPI_CALL(PMPI_Reduce)(var_send_buf, var_recv_buf, shared_rec_count,
        var_dt, var_op, 0, mod_comm);

    if(my_rank == 0)
    {
        for(i=0; i<shared_rec_count; i++)
        {
            outrec_array[i].fcounters[MPIIO_F_VARIANCE_RANK_BYTES] =
                (var_recv_buf[i].S / var_recv_buf[i].n);
        }
    }

cleanup:
    DARSHAN_MPI_CALL(PMPI_Type_free)(&var_dt);
    DARSHAN_MPI_CALL(PMPI_Op_free)(&var_op);
    free(var_send_buf);
    free(var_recv_buf);

    return;
}

1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
/**************************************************************************
 * Functions exported by MPI-IO module for coordinating with darshan-core *
 **************************************************************************/

/* Shutdown hook handed to darshan-core via the module function table:
 * turns off MPIIO instrumentation.  The module must have been initialized.
 */
static void mpiio_begin_shutdown()
{
    assert(mpiio_runtime);

    /* flip the flag under the module lock so that no further
     * instrumentation takes place while Darshan shuts down
     */
    MPIIO_LOCK();
    instrumentation_disabled = 1;
    MPIIO_UNLOCK();
}

1315
static void mpiio_get_output_data(
1316
1317
1318
1319
1320
    MPI_Comm mod_comm,
    darshan_record_id *shared_recs,
    int shared_rec_count,
    void **mpiio_buf,
    int *mpiio_buf_sz)
1321
{
1322
1323
1324
1325
1326
1327
1328
1329
1330
    struct mpiio_file_runtime *file;
    struct mpiio_file_runtime* tmp;
    int i;
    double mpiio_time;
    void *red_send_buf = NULL;
    void *red_recv_buf = NULL;
    MPI_Datatype red_type;
    MPI_Op red_op;

1331
1332
    assert(mpiio_runtime);

1333
1334
    /* go through and set the 4 most common access sizes for MPI-IO */
    for(i = 0; i < mpiio_runtime->file_array_ndx; i++)
1335
    {
1336
1337
        tmp = &(mpiio_runtime->file_runtime_array[i]);

1338
1339
1340
1341
1342
#ifndef __DARSHAN_ENABLE_MMAP_LOGS
        /* walk common counters to get 4 most common -- only if mmap
         * feature is disabled (mmap updates counters on the go)
         */

1343
1344
1345
1346
        /* common access sizes */
        darshan_walk_common_vals(tmp->access_root,
            &(tmp->file_record->counters[MPIIO_ACCESS1_ACCESS]),
            &(tmp->file_record->counters[MPIIO_ACCESS1_COUNT]));
1347
1348
1349
#endif

        tdestroy(tmp->access_root, free);
1350
    }
1351

1352
1353
1354
1355
1356
    /* if there are globally shared files, do a shared file reduction */
    /* NOTE: the shared file reduction is also skipped if the 
     * DARSHAN_DISABLE_SHARED_REDUCTION environment variable is set.
     */
    if(shared_rec_count && !getenv("DARSHAN_DISABLE_SHARED_REDUCTION"))
1357
    {
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
        /* necessary initialization of shared records */
        for(i = 0; i < shared_rec_count; i++)
        {
            HASH_FIND(hlink, mpiio_runtime->file_hash, &shared_recs[i],
                sizeof(darshan_record_id), file);
            assert(file);

            mpiio_time =
                file->file_record->fcounters[MPIIO_F_READ_TIME] +
                file->file_record->fcounters[MPIIO_F_WRITE_TIME] +
                file->file_record->fcounters[MPIIO_F_META_TIME];

            /* initialize fastest/slowest info prior to the reduction */
            file->file_record->counters[MPIIO_FASTEST_RANK] =
1372
                file->file_record->base_rec.rank;
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
            file->file_record->counters[MPIIO_FASTEST_RANK_BYTES] =
                file->file_record->counters[MPIIO_BYTES_READ] +
                file->file_record->counters[MPIIO_BYTES_WRITTEN];
            file->file_record->fcounters[MPIIO_F_FASTEST_RANK_TIME] =
                mpiio_time;

            /* until reduction occurs, we assume that this rank is both
             * the fastest and slowest. It is up to the reduction operator
             * to find the true min and max.
             */
            file->file_record->counters[MPIIO_SLOWEST_RANK] =
                file->file_record->counters[MPIIO_FASTEST_RANK];
            file->file_record->counters[MPIIO_SLOWEST_RANK_BYTES] =
                file->file_record->counters[MPIIO_FASTEST_RANK_BYTES];
            file->file_record->fcounters[MPIIO_F_SLOWEST_RANK_TIME] =
                file->file_record->fcounters[MPIIO_F_FASTEST_RANK_TIME];

1390
            file->file_record->base_rec.rank = -1;
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
        }

        /* sort the array of files descending by rank so that we get all of the 
         * shared files (marked by rank -1) in a contiguous portion at end 
         * of the array
         */
        qsort(mpiio_runtime->file_record_array, mpiio_runtime->file_array_ndx,
            sizeof(struct darshan_mpiio_file), mpiio_record_compare);

        /* make *send_buf point to the shared files at the end of sorted array */
        red_send_buf =
Shane Snyder's avatar
Shane Snyder committed
1402
            &(mpiio_runtime->file_record_array[mpiio_runtime->file_array_ndx-shared_rec_count]);
1403
1404
1405
1406
1407
1408

        /* allocate memory for the reduction output on rank 0 */
        if(my_rank == 0)
        {
            red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_mpiio_file));
            if(!red_recv_buf)
1409
            {
1410
                return;
1411
            }
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
        }

        /* construct a datatype for a MPIIO file record.  This is serving no purpose
         * except to make sure we can do a reduction on proper boundaries
         */
        DARSHAN_MPI_CALL(PMPI_Type_contiguous)(sizeof(struct darshan_mpiio_file),
            MPI_BYTE, &red_type);
        DARSHAN_MPI_CALL(PMPI_Type_commit)(&red_type);

        /* register a MPIIO file record reduction operator */
        DARSHAN_MPI_CALL(PMPI_Op_create)(mpiio_record_reduction_op, 1, &red_op);

        /* reduce shared MPIIO file records */
        DARSHAN_MPI_CALL(PMPI_Reduce)(red_send_buf, red_recv_buf,
            shared_rec_count, red_type, red_op, 0, mod_comm);

1428
1429
1430
1431
        /* get the time and byte variances for shared files */
        mpiio_shared_record_variance(mod_comm, red_send_buf, red_recv_buf,
            shared_rec_count);

1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
        /* clean up reduction state */
        if(my_rank == 0)
        {
            int tmp_ndx = mpiio_runtime->file_array_ndx - shared_rec_count;
            memcpy(&(mpiio_runtime->file_record_array[tmp_ndx]), red_recv_buf,
                shared_rec_count * sizeof(struct darshan_mpiio_file));
            free(red_recv_buf);
        }
        else
        {
            mpiio_runtime->file_array_ndx -= shared_rec_count;
        }

        DARSHAN_MPI_CALL(PMPI_Type_free)(&red_type);
        DARSHAN_MPI_CALL(PMPI_Op_free)(&red_op);
1447
1448
    }

1449
1450
    *mpiio_buf = (void *)(mpiio_runtime->file_record_array);
    *mpiio_buf_sz = mpiio_runtime->file_array_ndx * sizeof(struct darshan_mpiio_file);
1451
1452
1453
1454
1455
1456
1457
1458

    return;
}

static void mpiio_shutdown()
{
    struct mpiio_file_runtime_ref *ref, *tmp;

1459
1460
    assert(mpiio_runtime);

1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
    HASH_ITER(hlink, mpiio_runtime->fh_hash, ref, tmp)
    {
        HASH_DELETE(hlink, mpiio_runtime->fh_hash, ref);
        free(ref);
    }

    HASH_CLEAR(hlink, mpiio_runtime->file_hash); /* these entries are freed all at once below */

    free(mpiio_runtime->file_runtime_array);
    free(mpiio_runtime);
    mpiio_runtime = NULL;

    return;
Philip Carns's avatar
Philip Carns committed
1474
1475
}

Philip Carns's avatar
Philip Carns committed
1476
1477
1478
1479
1480
1481
1482
1483
/*
 * Local variables:
 *  c-indent-level: 4
 *  c-basic-offset: 4
 * End:
 *
 * vim: ts=8 sts=4 sw=4 expandtab
 */