darshan-mpiio.c 46.3 KB
Newer Older
Philip Carns's avatar
Philip Carns committed
1
/*
 * Copyright (C) 2015 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
 */

#include "darshan-runtime-config.h"
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdarg.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#include <errno.h>
#include <search.h>
#include <assert.h>
#define __USE_GNU
#include <pthread.h>

#include "uthash.h"
24

Philip Carns's avatar
Philip Carns committed
25
#include "darshan.h"
Shane Snyder's avatar
Shane Snyder committed
26
#include "darshan-dynamic.h"
Philip Carns's avatar
Philip Carns committed
27

28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/* The mpiio_file_runtime structure maintains necessary runtime metadata
 * for the MPIIO file record (darshan_mpiio_file structure, defined in
 * darshan-mpiio-log-format.h) pointed to by 'file_record'. This metadata
 * assists with the instrumenting of specific statistics in the file record.
 * 'hlink' is a hash table link structure used to add/remove this record
 * from the hash table of MPIIO file records for this process. 
 *
 * RATIONALE: the MPIIO module needs to track some stateful, volatile 
 * information about each open file (like the current file offset, most recent 
 * access time, etc.) to aid in instrumentation, but this information can't be
 * stored in the darshan_mpiio_file struct because we don't want it to appear in
 * the final darshan log file.  We therefore associate a mpiio_file_runtime
 * struct with each darshan_mpiio_file struct in order to track this information.
  *
 * NOTE: There is a one-to-one mapping of mpiio_file_runtime structs to
 * darshan_mpiio_file structs.
 *
 * NOTE: The mpiio_file_runtime struct contains a pointer to a darshan_mpiio_file
 * struct (see the *file_record member) rather than simply embedding an entire
 * darshan_mpiio_file struct.  This is done so that all of the darshan_mpiio_file
 * structs can be kept contiguous in memory as a single array to simplify
 * reduction, compression, and storage.
 */
51
struct mpiio_file_runtime
Philip Carns's avatar
Philip Carns committed
52
53
{
    struct darshan_mpiio_file* file_record;
Shane Snyder's avatar
Shane Snyder committed
54
    enum darshan_io_type last_io_type;
55
56
57
    double last_meta_end;
    double last_read_end;
    double last_write_end;
58
59
    void *access_root;
    int access_count;
Philip Carns's avatar
Philip Carns committed
60
61
62
    UT_hash_handle hlink;
};

63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
/* The mpiio_file_runtime_ref structure is used to associate a MPIIO
 * file handle with an already existing MPIIO file record. This is
 * necessary as many MPIIO I/O functions take only a file handle as input,
 * but MPIIO file records are indexed by their full file paths (i.e., darshan
 * record identifiers for MPIIO files are created by hashing the file path).
 * In other words, this structure is necessary as it allows us to look up a
 * file record either by a pathname (mpiio_file_runtime) or by MPIIO file
 * descriptor (mpiio_file_runtime_ref), depending on which parameters are
 * available. This structure includes another hash table link, since separate
 * hashes are maintained for mpiio_file_runtime structures and mpiio_file_runtime_ref
 * structures.
 *
 * RATIONALE: In theory the file handle information could be included in the
 * mpiio_file_runtime struct rather than in a separate structure here.  The
 * reason we don't do that is to handle the potential for an MPI implementation
 * to produce a new file handle instance each time MPI_File_open() is called on a
 * file.  Thus there might be multiple file handles referring to the same
 * underlying record.
 *
 * NOTE: there are potentially multiple mpiio_file_runtime_ref structures
 * referring to a single mpiio_file_runtime structure.  Most of the time there is
 * only one, however.
 */
86
struct mpiio_file_runtime_ref
Philip Carns's avatar
Philip Carns committed
87
{
88
    struct mpiio_file_runtime* file;
89
    MPI_File fh;
Philip Carns's avatar
Philip Carns committed
90
91
92
    UT_hash_handle hlink;
};

93
94
95
96
/* The mpiio_runtime structure maintains necessary state for storing
 * MPI-IO file records and for coordinating with darshan-core at 
 * shutdown time.
 */
Philip Carns's avatar
Philip Carns committed
97
98
struct mpiio_runtime
{
    /* array of per-file runtime state */
    struct mpiio_file_runtime* file_runtime_array;
    /* array of MPIIO file records */
    struct darshan_mpiio_file* file_record_array;
    /* NOTE(review): presumably the index of the next unused array slot;
     * its use is not visible in this part of the file -- confirm
     */
    int file_array_ndx;
    /* hash table of mpiio_file_runtime structures */
    struct mpiio_file_runtime* file_hash;
    /* hash table of mpiio_file_runtime_ref structures */
    struct mpiio_file_runtime_ref* fh_hash;
};

/* module state; mpiio_runtime_initialize() returns immediately once this is non-NULL */
static struct mpiio_runtime *mpiio_runtime = NULL;
/* recursive mutex acquired/released by MPIIO_LOCK()/MPIIO_UNLOCK() */
static pthread_mutex_t mpiio_runtime_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
/* when non-zero, mpiio_runtime_initialize() does nothing */
static int instrumentation_disabled = 0;
/* its address is handed to darshan_core_register_module(); presumably
 * filled in with this process's rank -- TODO confirm
 */
static int my_rank = -1;

Philip Carns's avatar
Philip Carns committed
111
static void mpiio_runtime_initialize(void);
112
static struct mpiio_file_runtime* mpiio_file_by_name(const char *name);
113
114
115
static struct mpiio_file_runtime* mpiio_file_by_name_setfh(const char* name, MPI_File fh);
static struct mpiio_file_runtime* mpiio_file_by_fh(MPI_File fh);
static void mpiio_file_close_fh(MPI_File fh);
116
static int mpiio_record_compare(const void* a, const void* b);
Philip Carns's avatar
Philip Carns committed
117
118
static void mpiio_record_reduction_op(void* infile_v, void* inoutfile_v,
    int *len, MPI_Datatype *datatype);
119
120
121
static void mpiio_shared_record_variance(MPI_Comm mod_comm,
    struct darshan_mpiio_file *inrec_array, struct darshan_mpiio_file *outrec_array,
    int shared_rec_count);
122
123
124
125

static void mpiio_begin_shutdown(void);
static void mpiio_get_output_data(MPI_Comm mod_comm, darshan_record_id *shared_recs,
    int shared_rec_count, void **mpiio_buf, int *mpiio_buf_sz);
126
127
128
129
130
static void mpiio_shutdown(void);

/* acquire/release mpiio_runtime_mutex; the wrappers in this file bracket
 * their record updates with this pair
 */
#define MPIIO_LOCK() pthread_mutex_lock(&mpiio_runtime_mutex)
#define MPIIO_UNLOCK() pthread_mutex_unlock(&mpiio_runtime_mutex)

131
132
133
/* MPIIO_RECORD_READ: account for one read-type MPI-IO call in the record
 * associated with file handle __fh.
 *
 *   __ret       return code of the MPI call; nothing is recorded unless it
 *               is MPI_SUCCESS
 *   __fh        file handle used to look up the record (mpiio_file_by_fh);
 *               nothing is recorded if no record is found
 *   __count     element count passed to the MPI call
 *   __datatype  datatype passed to the MPI call; its size (PMPI_Type_size)
 *               times __count is the access size in bytes
 *   __counter   index of the operation counter to increment
 *   __tm1/__tm2 timestamps taken before/after the MPI call
 *
 * The byte count is computed in a long long so that type size * count
 * cannot overflow an int for very large accesses. Arguments are
 * parenthesized where they take part in arithmetic or comparisons. The
 * macro declares locals named 'file', 'type_size', 'size' and '__elapsed';
 * argument expressions must not refer to variables with those names.
 */
#define MPIIO_RECORD_READ(__ret, __fh, __count, __datatype, __counter, __tm1, __tm2) do { \
    struct mpiio_file_runtime* file; \
    int type_size = 0; \
    long long size; \
    double __elapsed = (__tm2)-(__tm1); \
    if((__ret) != MPI_SUCCESS) break; \
    file = mpiio_file_by_fh(__fh); \
    if(!file) break; \
    DARSHAN_MPI_CALL(PMPI_Type_size)(__datatype, &type_size); \
    size = (long long)type_size * (__count); \
    DARSHAN_BUCKET_INC(&(file->file_record->counters[MPIIO_SIZE_READ_AGG_0_100]), size); \
    darshan_common_val_counter(&file->access_root, &file->access_count, size); \
    file->file_record->counters[MPIIO_BYTES_READ] += size; \
    file->file_record->counters[__counter] += 1; \
    if(file->last_io_type == DARSHAN_IO_WRITE) \
        file->file_record->counters[MPIIO_RW_SWITCHES] += 1; \
    file->last_io_type = DARSHAN_IO_READ; \
    if(file->file_record->fcounters[MPIIO_F_READ_START_TIMESTAMP] == 0) \
        file->file_record->fcounters[MPIIO_F_READ_START_TIMESTAMP] = (__tm1); \
    file->file_record->fcounters[MPIIO_F_READ_END_TIMESTAMP] = (__tm2); \
    if(file->file_record->fcounters[MPIIO_F_MAX_READ_TIME] < __elapsed) { \
        file->file_record->fcounters[MPIIO_F_MAX_READ_TIME] = __elapsed; \
        file->file_record->counters[MPIIO_MAX_READ_TIME_SIZE] = size; } \
    DARSHAN_TIMER_INC_NO_OVERLAP(file->file_record->fcounters[MPIIO_F_READ_TIME], __tm1, __tm2, file->last_read_end); \
} while(0)
155

156
157
158
/* MPIIO_RECORD_WRITE: account for one write-type MPI-IO call in the record
 * associated with file handle __fh.
 *
 *   __ret       return code of the MPI call; nothing is recorded unless it
 *               is MPI_SUCCESS
 *   __fh        file handle used to look up the record (mpiio_file_by_fh);
 *               nothing is recorded if no record is found
 *   __count     element count passed to the MPI call
 *   __datatype  datatype passed to the MPI call; its size (PMPI_Type_size)
 *               times __count is the access size in bytes
 *   __counter   index of the operation counter to increment
 *   __tm1/__tm2 timestamps taken before/after the MPI call
 *
 * The byte count is computed in a long long so that type size * count
 * cannot overflow an int for very large accesses. Arguments are
 * parenthesized where they take part in arithmetic or comparisons. The
 * macro declares locals named 'file', 'type_size', 'size' and '__elapsed';
 * argument expressions must not refer to variables with those names.
 */
#define MPIIO_RECORD_WRITE(__ret, __fh, __count, __datatype, __counter, __tm1, __tm2) do { \
    struct mpiio_file_runtime* file; \
    int type_size = 0; \
    long long size; \
    double __elapsed = (__tm2)-(__tm1); \
    if((__ret) != MPI_SUCCESS) break; \
    file = mpiio_file_by_fh(__fh); \
    if(!file) break; \
    DARSHAN_MPI_CALL(PMPI_Type_size)(__datatype, &type_size); \
    size = (long long)type_size * (__count); \
    DARSHAN_BUCKET_INC(&(file->file_record->counters[MPIIO_SIZE_WRITE_AGG_0_100]), size); \
    darshan_common_val_counter(&file->access_root, &file->access_count, size); \
    file->file_record->counters[MPIIO_BYTES_WRITTEN] += size; \
    file->file_record->counters[__counter] += 1; \
    if(file->last_io_type == DARSHAN_IO_READ) \
        file->file_record->counters[MPIIO_RW_SWITCHES] += 1; \
    file->last_io_type = DARSHAN_IO_WRITE; \
    if(file->file_record->fcounters[MPIIO_F_WRITE_START_TIMESTAMP] == 0) \
        file->file_record->fcounters[MPIIO_F_WRITE_START_TIMESTAMP] = (__tm1); \
    file->file_record->fcounters[MPIIO_F_WRITE_END_TIMESTAMP] = (__tm2); \
    if(file->file_record->fcounters[MPIIO_F_MAX_WRITE_TIME] < __elapsed) { \
        file->file_record->fcounters[MPIIO_F_MAX_WRITE_TIME] = __elapsed; \
        file->file_record->counters[MPIIO_MAX_WRITE_TIME_SIZE] = size; } \
    DARSHAN_TIMER_INC_NO_OVERLAP(file->file_record->fcounters[MPIIO_F_WRITE_TIME], __tm1, __tm2, file->last_write_end); \
} while(0)

181
182
183
/**********************************************************
 *        Wrappers for MPI-IO functions of interest       * 
 **********************************************************/
Philip Carns's avatar
Philip Carns committed
184

Philip Carns's avatar
Philip Carns committed
185
186
187
188
189
190
191
/* Wrapper for MPI_File_open(): invokes the underlying routine through
 * DARSHAN_MPI_CALL(PMPI_File_open), timing it with darshan_core_wtime().
 * On success, and if a record can be obtained for the (prefix-stripped)
 * file name, it stores the access mode, counts the open as independent
 * (communicator size 1) or collective, counts hints when info is not
 * MPI_INFO_NULL, sets the open timestamp if it is still zero, and adds the
 * call's duration to the metadata timer. Returns the PMPI call's result.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_open(MPI_Comm comm, const char *filename, int amode, MPI_Info info, MPI_File *fh) 
#else
int MPI_File_open(MPI_Comm comm, char *filename, int amode, MPI_Info info, MPI_File *fh) 
#endif
{
    int ret;
    struct mpiio_file_runtime* file;
    char* tmp;
    int comm_size;
    double tm1, tm2;

    tm1 = darshan_core_wtime();
    ret = DARSHAN_MPI_CALL(PMPI_File_open)(comm, filename, amode, info, fh);
    tm2 = darshan_core_wtime();

    if(ret == MPI_SUCCESS)
    {
        MPIIO_LOCK();
        mpiio_runtime_initialize();

        /* use ROMIO approach to strip prefix if present */
        /* strip off prefix if there is one, but only skip prefixes
         * if they are greater than length one to allow for windows
         * drive specifications (e.g. c:\...) 
         */
        tmp = strchr(filename, ':');
        /* check for "no colon found" explicitly rather than relying on a
         * relational comparison between a null pointer and filename
         */
        if(tmp != NULL && tmp > filename + 1) {
            filename = tmp + 1;
        }

        file = mpiio_file_by_name_setfh(filename, (*fh));
        if(file)
        {
            file->file_record->counters[MPIIO_MODE] = amode;
            DARSHAN_MPI_CALL(PMPI_Comm_size)(comm, &comm_size);
            if(comm_size == 1)
            {
                file->file_record->counters[MPIIO_INDEP_OPENS] += 1;
            }
            else
            {
                file->file_record->counters[MPIIO_COLL_OPENS] += 1;
            }
            if(info != MPI_INFO_NULL)
            {
                file->file_record->counters[MPIIO_HINTS] += 1;
            }
            /* only the first open of a record sets the open timestamp */
            if(file->file_record->fcounters[MPIIO_F_OPEN_TIMESTAMP] == 0)
                file->file_record->fcounters[MPIIO_F_OPEN_TIMESTAMP] = tm1;
            DARSHAN_TIMER_INC_NO_OVERLAP(
                file->file_record->fcounters[MPIIO_F_META_TIME],
                tm1, tm2, file->last_meta_end);
        }

        MPIIO_UNLOCK();
    }

    return(ret);
}

246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
/* Wrapper for MPI_File_read(): times the underlying call made through
 * DARSHAN_MPI_CALL(PMPI_File_read) and, under the module lock, records it
 * with MPIIO_RECORD_READ against MPIIO_INDEP_READS. The result of the
 * underlying call is returned unchanged.
 */
int MPI_File_read(MPI_File fh, void *buf, int count,
    MPI_Datatype datatype, MPI_Status *status)
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read)(fh, buf, count, datatype, status);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_INDEP_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write(): times the underlying call made through
 * DARSHAN_MPI_CALL(PMPI_File_write) and, under the module lock, records it
 * with MPIIO_RECORD_WRITE against MPIIO_INDEP_WRITES. The result of the
 * underlying call is returned unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write(MPI_File fh, const void *buf, int count,
    MPI_Datatype datatype, MPI_Status *status)
#else
int MPI_File_write(MPI_File fh, void *buf, int count,
    MPI_Datatype datatype, MPI_Status *status)
#endif
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write)(fh, buf, count, datatype, status);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_INDEP_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_at(): times the underlying call made through
 * DARSHAN_MPI_CALL(PMPI_File_read_at) and, under the module lock, records
 * it with MPIIO_RECORD_READ against MPIIO_INDEP_READS. The result of the
 * underlying call is returned unchanged.
 */
int MPI_File_read_at(MPI_File fh, MPI_Offset offset, void *buf,
    int count, MPI_Datatype datatype, MPI_Status *status)
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_at)(fh, offset, buf,
        count, datatype, status);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_INDEP_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_at(): times the underlying call made through
 * DARSHAN_MPI_CALL(PMPI_File_write_at) and, under the module lock, records
 * it with MPIIO_RECORD_WRITE against MPIIO_INDEP_WRITES. The result of the
 * underlying call is returned unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_at(MPI_File fh, MPI_Offset offset, const void *buf,
    int count, MPI_Datatype datatype, MPI_Status *status)
#else
int MPI_File_write_at(MPI_File fh, MPI_Offset offset, void *buf,
    int count, MPI_Datatype datatype, MPI_Status *status)
#endif
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_at)(fh, offset, buf,
        count, datatype, status);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_INDEP_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_all(): times the underlying call made through
 * DARSHAN_MPI_CALL(PMPI_File_read_all) and, under the module lock, records
 * it with MPIIO_RECORD_READ against MPIIO_COLL_READS. The result of the
 * underlying call is returned unchanged.
 */
int MPI_File_read_all(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_all)(fh, buf, count,
        datatype, status);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_COLL_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_all(): times the underlying call made through
 * DARSHAN_MPI_CALL(PMPI_File_write_all) and, under the module lock, records
 * it with MPIIO_RECORD_WRITE against MPIIO_COLL_WRITES. The result of the
 * underlying call is returned unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_all(MPI_File fh, const void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
#else
int MPI_File_write_all(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
#endif
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_all)(fh, buf, count,
        datatype, status);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_COLL_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_at_all(): times the underlying call made
 * through DARSHAN_MPI_CALL(PMPI_File_read_at_all) and, under the module
 * lock, records it with MPIIO_RECORD_READ against MPIIO_COLL_READS. The
 * result of the underlying call is returned unchanged.
 */
int MPI_File_read_at_all(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, MPI_Status * status)
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_at_all)(fh, offset, buf,
        count, datatype, status);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_COLL_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_at_all(): times the underlying call made
 * through DARSHAN_MPI_CALL(PMPI_File_write_at_all) and, under the module
 * lock, records it with MPIIO_RECORD_WRITE against MPIIO_COLL_WRITES. The
 * result of the underlying call is returned unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_at_all(MPI_File fh, MPI_Offset offset, const void * buf,
    int count, MPI_Datatype datatype, MPI_Status * status)
#else
int MPI_File_write_at_all(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, MPI_Status * status)
#endif
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_at_all)(fh, offset, buf,
        count, datatype, status);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_COLL_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
/* Wrapper for MPI_File_read_shared(): times the underlying call made
 * through DARSHAN_MPI_CALL(PMPI_File_read_shared) and, under the module
 * lock, records it with MPIIO_RECORD_READ against MPIIO_INDEP_READS. The
 * result of the underlying call is returned unchanged.
 */
int MPI_File_read_shared(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_shared)(fh, buf, count,
        datatype, status);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_INDEP_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_shared(): times the underlying call made
 * through DARSHAN_MPI_CALL(PMPI_File_write_shared) and, under the module
 * lock, records it with MPIIO_RECORD_WRITE against MPIIO_INDEP_WRITES. The
 * result of the underlying call is returned unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_shared(MPI_File fh, const void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
#else
int MPI_File_write_shared(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
#endif
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_shared)(fh, buf, count,
        datatype, status);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_INDEP_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_ordered(): times the underlying call made
 * through DARSHAN_MPI_CALL(PMPI_File_read_ordered) and, under the module
 * lock, records it with MPIIO_RECORD_READ against MPIIO_COLL_READS. The
 * result of the underlying call is returned unchanged.
 */
int MPI_File_read_ordered(MPI_File fh, void * buf, int count,
    MPI_Datatype datatype, MPI_Status * status)
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_ordered)(fh, buf, count,
        datatype, status);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_COLL_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_ordered(): times the underlying call made
 * through DARSHAN_MPI_CALL(PMPI_File_write_ordered) and, under the module
 * lock, records it with MPIIO_RECORD_WRITE against MPIIO_COLL_WRITES. The
 * result of the underlying call is returned unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_ordered(MPI_File fh, const void * buf, int count,
    MPI_Datatype datatype, MPI_Status * status)
#else
int MPI_File_write_ordered(MPI_File fh, void * buf, int count,
    MPI_Datatype datatype, MPI_Status * status)
#endif
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_ordered)(fh, buf, count,
         datatype, status);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_COLL_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_all_begin(): times the underlying call made
 * through DARSHAN_MPI_CALL(PMPI_File_read_all_begin) and, under the module
 * lock, records it with MPIIO_RECORD_READ against MPIIO_SPLIT_READS. The
 * result of the underlying call is returned unchanged.
 */
int MPI_File_read_all_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_all_begin)(fh, buf, count, datatype);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_SPLIT_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_all_begin(): times the underlying call made
 * through DARSHAN_MPI_CALL(PMPI_File_write_all_begin) and, under the module
 * lock, records it with MPIIO_RECORD_WRITE against MPIIO_SPLIT_WRITES. The
 * result of the underlying call is returned unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_all_begin(MPI_File fh, const void * buf, int count, MPI_Datatype datatype)
#else
int MPI_File_write_all_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
#endif
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_all_begin)(fh, buf, count, datatype);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_SPLIT_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_at_all_begin(): times the underlying call made
 * through DARSHAN_MPI_CALL(PMPI_File_read_at_all_begin) and, under the
 * module lock, records it with MPIIO_RECORD_READ against
 * MPIIO_SPLIT_READS. The result of the underlying call is returned
 * unchanged.
 */
int MPI_File_read_at_all_begin(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype)
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_at_all_begin)(fh, offset, buf,
        count, datatype);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_SPLIT_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_at_all_begin(): times the underlying call made
 * through DARSHAN_MPI_CALL(PMPI_File_write_at_all_begin) and, under the
 * module lock, records it with MPIIO_RECORD_WRITE against
 * MPIIO_SPLIT_WRITES. The result of the underlying call is returned
 * unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_at_all_begin(MPI_File fh, MPI_Offset offset, const void * buf,
    int count, MPI_Datatype datatype)
#else
int MPI_File_write_at_all_begin(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype)
#endif
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_at_all_begin)(fh, offset,
        buf, count, datatype);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_SPLIT_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_read_ordered_begin(): times the underlying call made
 * through DARSHAN_MPI_CALL(PMPI_File_read_ordered_begin) and, under the
 * module lock, records it with MPIIO_RECORD_READ against
 * MPIIO_SPLIT_READS. The result of the underlying call is returned
 * unchanged.
 */
int MPI_File_read_ordered_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_read_ordered_begin)(fh, buf, count,
        datatype);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_SPLIT_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_write_ordered_begin(): times the underlying call
 * made through DARSHAN_MPI_CALL(PMPI_File_write_ordered_begin) and, under
 * the module lock, records it with MPIIO_RECORD_WRITE against
 * MPIIO_SPLIT_WRITES. The result of the underlying call is returned
 * unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_ordered_begin(MPI_File fh, const void * buf, int count, MPI_Datatype datatype)
#else
int MPI_File_write_ordered_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
#endif
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_write_ordered_begin)(fh, buf, count,
        datatype);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_SPLIT_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_iread(): times the underlying call made through
 * DARSHAN_MPI_CALL(PMPI_File_iread) and, under the module lock, records it
 * with MPIIO_RECORD_READ against MPIIO_NB_READS. The timestamps bracket
 * only the call that starts the operation. The result of the underlying
 * call is returned unchanged.
 */
int MPI_File_iread(MPI_File fh, void * buf, int count, MPI_Datatype datatype, __D_MPI_REQUEST * request)
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_iread)(fh, buf, count, datatype, request);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_NB_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_iwrite(): times the underlying call made through
 * DARSHAN_MPI_CALL(PMPI_File_iwrite) and, under the module lock, records it
 * with MPIIO_RECORD_WRITE against MPIIO_NB_WRITES. The timestamps bracket
 * only the call that starts the operation. The result of the underlying
 * call is returned unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_iwrite(MPI_File fh, const void * buf, int count,
    MPI_Datatype datatype, __D_MPI_REQUEST * request)
#else
int MPI_File_iwrite(MPI_File fh, void * buf, int count,
    MPI_Datatype datatype, __D_MPI_REQUEST * request)
#endif
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_iwrite)(fh, buf, count, datatype, request);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_NB_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_iread_at(): times the underlying call made through
 * DARSHAN_MPI_CALL(PMPI_File_iread_at) and, under the module lock, records
 * it with MPIIO_RECORD_READ against MPIIO_NB_READS. The timestamps bracket
 * only the call that starts the operation. The result of the underlying
 * call is returned unchanged.
 */
int MPI_File_iread_at(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, __D_MPI_REQUEST *request)
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_iread_at)(fh, offset, buf, count,
        datatype, request);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_NB_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_iwrite_at(): times the underlying call made through
 * DARSHAN_MPI_CALL(PMPI_File_iwrite_at) and, under the module lock, records
 * it with MPIIO_RECORD_WRITE against MPIIO_NB_WRITES. The timestamps
 * bracket only the call that starts the operation. The result of the
 * underlying call is returned unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_iwrite_at(MPI_File fh, MPI_Offset offset, const void * buf,
    int count, MPI_Datatype datatype, __D_MPI_REQUEST *request)
#else
int MPI_File_iwrite_at(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, __D_MPI_REQUEST *request)
#endif
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_iwrite_at)(fh, offset, buf,
        count, datatype, request);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_NB_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_iread_shared(): times the underlying call made
 * through DARSHAN_MPI_CALL(PMPI_File_iread_shared) and, under the module
 * lock, records it with MPIIO_RECORD_READ against MPIIO_NB_READS. The
 * timestamps bracket only the call that starts the operation. The result
 * of the underlying call is returned unchanged.
 */
int MPI_File_iread_shared(MPI_File fh, void * buf, int count,
    MPI_Datatype datatype, __D_MPI_REQUEST * request)
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_iread_shared)(fh, buf, count,
        datatype, request);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_READ(rc, fh, count, datatype, MPIIO_NB_READS, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}

/* Wrapper for MPI_File_iwrite_shared(): times the underlying call made
 * through DARSHAN_MPI_CALL(PMPI_File_iwrite_shared) and, under the module
 * lock, records it with MPIIO_RECORD_WRITE against MPIIO_NB_WRITES. The
 * timestamps bracket only the call that starts the operation. The result
 * of the underlying call is returned unchanged.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_iwrite_shared(MPI_File fh, const void * buf, int count,
    MPI_Datatype datatype, __D_MPI_REQUEST * request)
#else
int MPI_File_iwrite_shared(MPI_File fh, void * buf, int count,
    MPI_Datatype datatype, __D_MPI_REQUEST * request)
#endif
{
    double t_start = darshan_core_wtime();
    int rc = DARSHAN_MPI_CALL(PMPI_File_iwrite_shared)(fh, buf, count,
        datatype, request);
    double t_end = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    MPIIO_RECORD_WRITE(rc, fh, count, datatype, MPIIO_NB_WRITES, t_start, t_end);
    MPIIO_UNLOCK();

    return rc;
}
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735

/* Wrapper for MPI_File_sync(): invokes the underlying routine through
 * DARSHAN_MPI_CALL(PMPI_File_sync), timing it with darshan_core_wtime().
 * On success, and if a record is found for fh, it increments MPIIO_SYNCS
 * and adds the call's duration to the write timer (MPIIO_F_WRITE_TIME).
 * Returns the PMPI call's result.
 */
int MPI_File_sync(MPI_File fh)
{
    int ret;
    struct mpiio_file_runtime* file;
    double tm1, tm2;

    tm1 = darshan_core_wtime();
    ret = DARSHAN_MPI_CALL(PMPI_File_sync)(fh);
    tm2 = darshan_core_wtime();

    if(ret == MPI_SUCCESS)
    {
        MPIIO_LOCK();
        mpiio_runtime_initialize();
        file = mpiio_file_by_fh(fh);
        if(file)
        {
            file->file_record->counters[MPIIO_SYNCS] += 1;
            DARSHAN_TIMER_INC_NO_OVERLAP(
                file->file_record->fcounters[MPIIO_F_WRITE_TIME],
                tm1, tm2, file->last_write_end);
        }
        MPIIO_UNLOCK();
    }

    return(ret);
}

/* Wrapper for MPI_File_set_view(): invokes the underlying routine through
 * DARSHAN_MPI_CALL(PMPI_File_set_view), timing it with
 * darshan_core_wtime(). On success, and if a record is found for fh, it
 * increments MPIIO_VIEWS, increments MPIIO_HINTS when info is not
 * MPI_INFO_NULL, and adds the call's duration to the metadata timer
 * (MPIIO_F_META_TIME). Returns the PMPI call's result.
 */
#ifdef HAVE_MPIIO_CONST
int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype,
    MPI_Datatype filetype, const char *datarep, MPI_Info info)
#else
int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype,
    MPI_Datatype filetype, char *datarep, MPI_Info info)
#endif
{
    int ret;
    struct mpiio_file_runtime* file;
    double tm1, tm2;

    tm1 = darshan_core_wtime();
    ret = DARSHAN_MPI_CALL(PMPI_File_set_view)(fh, disp, etype, filetype,
        datarep, info);
    tm2 = darshan_core_wtime();

    if(ret == MPI_SUCCESS)
    {
        MPIIO_LOCK();
        mpiio_runtime_initialize();
        file = mpiio_file_by_fh(fh);
        if(file)
        {
            file->file_record->counters[MPIIO_VIEWS] += 1;
            if(info != MPI_INFO_NULL)
            {
                file->file_record->counters[MPIIO_HINTS] += 1;
            }
            /* account the time of every successful set_view as metadata
             * time, whether or not hints were supplied, matching how
             * MPI_File_open and MPI_File_close update this timer
             */
            DARSHAN_TIMER_INC_NO_OVERLAP(
                file->file_record->fcounters[MPIIO_F_META_TIME],
                tm1, tm2, file->last_meta_end);
        }
        MPIIO_UNLOCK();
    }

    return(ret);
}

786
787
788
789
/* Wrapper for MPI_File_close(): invokes the underlying routine through
 * DARSHAN_MPI_CALL(PMPI_File_close), timing it with darshan_core_wtime().
 * If a record is found for the handle, it sets the close timestamp, adds
 * the call's duration to the metadata timer, and drops the handle
 * reference with mpiio_file_close_fh(). Unlike the other wrappers, the
 * record is updated regardless of the PMPI return code. Returns the PMPI
 * call's result.
 */
int MPI_File_close(MPI_File *fh)
{
    int ret;
    struct mpiio_file_runtime* file;
    /* save the handle value before the call: *fh is passed by pointer to
     * PMPI_File_close, and the saved copy is what the lookup below uses
     */
    MPI_File tmp_fh = *fh;
    double tm1, tm2;

    tm1 = darshan_core_wtime();
    ret = DARSHAN_MPI_CALL(PMPI_File_close)(fh);
    tm2 = darshan_core_wtime();

    MPIIO_LOCK();
    mpiio_runtime_initialize();
    file = mpiio_file_by_fh(tmp_fh);
    if(file)
    {
        file->file_record->fcounters[MPIIO_F_CLOSE_TIMESTAMP] =
            darshan_core_wtime();
        DARSHAN_TIMER_INC_NO_OVERLAP(
            file->file_record->fcounters[MPIIO_F_META_TIME],
            tm1, tm2, file->last_meta_end);
        mpiio_file_close_fh(tmp_fh);
    }
    MPIIO_UNLOCK();

    return(ret);
}

/***********************************************************
 * Internal functions for manipulating MPI-IO module state *
 ***********************************************************/

/* initialize data structures and register with darshan-core component */
Philip Carns's avatar
Philip Carns committed
819
820
821
822
static void mpiio_runtime_initialize()
{
    struct darshan_module_funcs mpiio_mod_fns =
    {
823
        .begin_shutdown = &mpiio_begin_shutdown,
Philip Carns's avatar
Philip Carns committed
824
825
826
        .get_output_data = &mpiio_get_output_data,
        .shutdown = &mpiio_shutdown
    };
827
828
829
    void *mpiio_buf;
    int mpiio_buf_size;
    int file_array_size;
Philip Carns's avatar
Philip Carns committed
830
831
832
833
834

    /* don't do anything if already initialized or instrumenation is disabled */
    if(mpiio_runtime || instrumentation_disabled)
        return;

835
836
837
    /* try and store the default number of records for this module */
    mpiio_buf_size = DARSHAN_DEF_MOD_REC_COUNT * sizeof(struct darshan_mpiio_file);

Philip Carns's avatar
Philip Carns committed
838
839
840
841
    /* register the mpiio module with darshan core */
    darshan_core_register_module(
        DARSHAN_MPIIO_MOD,
        &mpiio_mod_fns,
842
843
        &mpiio_buf_size,
        &mpiio_buf,
844
        &my_rank,
845
        NULL);
Philip Carns's avatar
Philip Carns committed
846

847
848
    /* return if darshan-core does not provide enough module memory */
    if(mpiio_buf_size < sizeof(struct darshan_mpiio_file))
849
850
    {
        darshan_core_unregister_module(DARSHAN_MPIIO_MOD);
Philip Carns's avatar
Philip Carns committed
851
        return;
852
    }
Philip Carns's avatar
Philip Carns committed
853
854
855

    mpiio_runtime = malloc(sizeof(*mpiio_runtime));
    if(!mpiio_runtime)
856
857
    {
        darshan_core_unregister_module(DARSHAN_MPIIO_MOD);
Philip Carns's avatar
Philip Carns committed
858
        return;
859
    }
Philip Carns's avatar
Philip Carns committed
860
861
    memset(mpiio_runtime, 0, sizeof(*mpiio_runtime));

862
863
864
865
    /* set number of trackable files for the MPIIO module according to the
     * amount of memory returned by darshan-core
     */
    file_array_size = mpiio_buf_size / sizeof(struct darshan_mpiio_file);
Philip Carns's avatar
Philip Carns committed
866
867
    mpiio_runtime->file_array_ndx = 0;

868
869
870
    /* store pointer to MPIIO record buffer given by darshan-core */
    mpiio_runtime->file_record_array = (struct darshan_mpiio_file *)mpiio_buf;

Philip Carns's avatar
Philip Carns committed
871
    /* allocate array of runtime file records */
872
    mpiio_runtime->file_runtime_array = malloc(file_array_size *
873
                                               sizeof(struct mpiio_file_runtime));
874
    if(!mpiio_runtime->file_runtime_array)
Philip Carns's avatar
Philip Carns committed
875
    {
876
877
878
        free(mpiio_runtime);
        mpiio_runtime = NULL;
        darshan_core_unregister_module(DARSHAN_MPIIO_MOD);
Philip Carns's avatar
Philip Carns committed
879
880
        return;
    }
881
    memset(mpiio_runtime->file_runtime_array, 0, file_array_size *
882
           sizeof(struct mpiio_file_runtime));
Philip Carns's avatar
Philip Carns committed
883
884
885
886

    return;
}

887
888
889
890
/* get a MPIIO file record for the given file path */
static struct mpiio_file_runtime* mpiio_file_by_name(const char *name)
{
    struct mpiio_file_runtime *file = NULL;
891
    struct darshan_mpiio_file *file_rec;
892
893
    char *newname = NULL;
    darshan_record_id file_id;
894
    int ret;
895
896
897
898
899
900
901
902

    if(!mpiio_runtime || instrumentation_disabled)
        return(NULL);

    newname = darshan_clean_file_path(name);
    if(!newname)
        newname = (char*)name;

903
904
    /* lookup the unique id for this filename */
    darshan_core_lookup_record(
905
906
        (void*)newname,
        strlen(newname),
907
        &file_id);
908

909
910
    /* search the hash table for this file record, and return if found */
    HASH_FIND(hlink, mpiio_runtime->file_hash, &file_id, sizeof(darshan_record_id), file);
911
    if(!file)
912
    {
913
914
915
916
917
918
919
920
921
922
        /* register the record with the darshan core component */
        ret = darshan_core_register_record(file_id, (void *)newname, DARSHAN_MPIIO_MOD,
            sizeof(struct darshan_mpiio_file), NULL);
        if(ret == 1)
        {
            /* register was successful */
            file = &(mpiio_runtime->file_runtime_array[mpiio_runtime->file_array_ndx]);
            file->file_record =
                &(mpiio_runtime->file_record_array[mpiio_runtime->file_array_ndx]);
            file_rec = file->file_record;
923

924
925
            file_rec->base_rec.id = file_id;
            file_rec->base_rec.rank = my_rank;
926

927
928
929
930
931
932
            /* add new record to file hash table */
            HASH_ADD(hlink, mpiio_runtime->file_hash, file_record->base_rec.id,
                sizeof(darshan_record_id), file);
            mpiio_runtime->file_array_ndx++;
        }
    }
933
934
935
936
937
938
939
940
941

    if(newname != name)
        free(newname);
    return(file);
}

/* get an MPIIO file record for the given file path, and also create a
 * reference structure using the corresponding file handle
 */
942
static struct mpiio_file_runtime* mpiio_file_by_name_setfh(const char* name, MPI_File fh)
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
{
    struct mpiio_file_runtime* file;
    struct mpiio_file_runtime_ref* ref;

    if(!mpiio_runtime || instrumentation_disabled)
        return(NULL);

    /* find file record by name first */
    file = mpiio_file_by_name(name);

    if(!file)
        return(NULL);

    /* search hash table for existing file ref for this fh */
    HASH_FIND(hlink, mpiio_runtime->fh_hash, &fh, sizeof(fh), ref);
    if(ref)
    {
        /* we have a reference.  Make sure it points to the correct file
         * and return it
         */
        ref->file = file;
        return(file);
    }

    /* if we hit this point, then we don't have a reference for this fh
     * in the table yet.  Add it.
     */
    ref = malloc(sizeof(*ref));
    if(!ref)
        return(NULL);
    memset(ref, 0, sizeof(*ref));

    ref->file = file;
    ref->fh = fh;    
    HASH_ADD(hlink, mpiio_runtime->fh_hash, fh, sizeof(fh), ref);

    return(file);
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
}

/* get an MPIIO file record for the given file handle
 *
 * Returns the runtime record referenced by 'fh' in the file handle hash
 * table, or NULL when there is no such reference, the module is not
 * initialized, or instrumentation is disabled.
 */
static struct mpiio_file_runtime* mpiio_file_by_fh(MPI_File fh)
{
    struct mpiio_file_runtime_ref* fh_ref;

    if(!mpiio_runtime || instrumentation_disabled)
    {
        return(NULL);
    }

    /* look for a reference keyed by this file handle */
    HASH_FIND(hlink, mpiio_runtime->fh_hash, &fh, sizeof(fh), fh_ref);

    return(fh_ref ? fh_ref->file : NULL);
}

/* free up reference data structures for the given file handle
 *
 * Removes the reference for 'fh' from the file handle hash table and frees
 * it. Does nothing if no reference exists, the module is not initialized,
 * or instrumentation is disabled. The file record itself is not touched.
 */
static void mpiio_file_close_fh(MPI_File fh)
{
    struct mpiio_file_runtime_ref* ref;

    if(!mpiio_runtime || instrumentation_disabled)
        return;

    /* search hash table for this file handle */
    HASH_FIND(hlink, mpiio_runtime->fh_hash, &fh, sizeof(fh), ref);
    if(ref)
    {
        /* we have a reference, delete it */
        HASH_DELETE(hlink, mpiio_runtime->fh_hash, ref);
        free(ref);
    }

    return;
}

1018
/* compare function for sorting file records by descending rank */
1019
static int mpiio_record_compare(const void* a_p, const void* b_p)
Philip Carns's avatar
Philip Carns committed
1020
{
1021
1022
    const struct darshan_mpiio_file* a = a_p;
    const struct darshan_mpiio_file* b = b_p;
Philip Carns's avatar
Philip Carns committed
1023

1024
    if(a->base_rec.rank < b->base_rec.rank)
1025
        return 1;
1026
    if(a->base_rec.rank > b->base_rec.rank)
1027
        return -1;
Philip Carns's avatar
Philip Carns committed
1028

1029
1030
    return 0;
}
Philip Carns's avatar
Philip Carns committed
1031

1032
1033
1034
1035
1036
/* MPI user-defined reduction operator (registered via PMPI_Op_create in
 * mpiio_get_output_data) that merges records of shared files.
 *
 * infile_v:    array of *len incoming darshan_mpiio_file records
 * inoutfile_v: array of *len records; each element is overwritten with the
 *              combination of itself and the incoming record at the same
 *              index
 * len:         number of records in each array
 * datatype:    not used by this function
 *
 * NOTE: the common-access-size merge below also modifies the incoming
 * (infile) records in place.
 */
static void mpiio_record_reduction_op(
    void* infile_v,
    void* inoutfile_v,
    int *len,
    MPI_Datatype *datatype)
{
    struct darshan_mpiio_file tmp_file;
    struct darshan_mpiio_file *infile = infile_v;
    struct darshan_mpiio_file *inoutfile = inoutfile_v;
    int i, j, k;

    assert(mpiio_runtime);

    for(i=0; i<*len; i++)
    {
        /* build the merged record in a scratch copy; the merged record is
         * marked with rank -1
         */
        memset(&tmp_file, 0, sizeof(struct darshan_mpiio_file));
        tmp_file.base_rec.id = infile->base_rec.id;
        tmp_file.base_rec.rank = -1;

        /* sum */
        for(j=MPIIO_INDEP_OPENS; j<=MPIIO_VIEWS; j++)
        {
            tmp_file.counters[j] = infile->counters[j] + inoutfile->counters[j];
        }

        /* the mode is taken from the incoming record only */
        tmp_file.counters[MPIIO_MODE] = infile->counters[MPIIO_MODE];

        /* sum */
        for(j=MPIIO_BYTES_READ; j<=MPIIO_RW_SWITCHES; j++)
        {
            tmp_file.counters[j] = infile->counters[j] + inoutfile->counters[j];
        }

        /* skip MPIIO_MAX_*_TIME_SIZE; handled in floating point section */

        for(j=MPIIO_SIZE_READ_AGG_0_100; j<=MPIIO_SIZE_WRITE_AGG_1G_PLUS; j++)
        {
            tmp_file.counters[j] = infile->counters[j] + inoutfile->counters[j];
        }

        /* first collapse any duplicates: when both records list the same
         * access value, fold the inout count into the incoming record and
         * clear the inout entry.
         * NOTE(review): the "+4" offset assumes each MPIIO_ACCESSn_COUNT
         * counter sits 4 slots after its MPIIO_ACCESSn_ACCESS counter --
         * confirm against the counter enum.
         */
        for(j=MPIIO_ACCESS1_ACCESS; j<=MPIIO_ACCESS4_ACCESS; j++)
        {
            for(k=MPIIO_ACCESS1_ACCESS; k<=MPIIO_ACCESS4_ACCESS; k++)
            {
                if(infile->counters[j] == inoutfile->counters[k])
                {
                    infile->counters[j+4] += inoutfile->counters[k+4];
                    inoutfile->counters[k] = 0;
                    inoutfile->counters[k+4] = 0;
                }
            }
        }

        /* first set: feed the incoming record's access values/counts into
         * the merged common-value counters
         */
        for(j=MPIIO_ACCESS1_ACCESS; j<=MPIIO_ACCESS4_ACCESS; j++)
        {
            DARSHAN_COMMON_VAL_COUNTER_INC(&(tmp_file.counters[MPIIO_ACCESS1_ACCESS]),
                &(tmp_file.counters[MPIIO_ACCESS1_COUNT]), infile->counters[j],
                infile->counters[j+4]);
        }

        /* second set: same for what is left of the inout record */
        for(j=MPIIO_ACCESS1_ACCESS; j<=MPIIO_ACCESS4_ACCESS; j++)
        {
            DARSHAN_COMMON_VAL_COUNTER_INC(&(tmp_file.counters[MPIIO_ACCESS1_ACCESS]),
                &(tmp_file.counters[MPIIO_ACCESS1_COUNT]), inoutfile->counters[j],
                inoutfile->counters[j+4]);
        }

        /* min non-zero (if available) value */
        for(j=MPIIO_F_OPEN_TIMESTAMP; j<=MPIIO_F_WRITE_START_TIMESTAMP; j++)
        {
            if(infile->fcounters[j] > inoutfile->fcounters[j] && inoutfile->fcounters[j] > 0)
                tmp_file.fcounters[j] = inoutfile->fcounters[j];
            else
                tmp_file.fcounters[j] = infile->fcounters[j];
        }

        /* max */
        for(j=MPIIO_F_READ_END_TIMESTAMP; j<= MPIIO_F_CLOSE_TIMESTAMP; j++)
        {
            if(infile->fcounters[j] > inoutfile->fcounters[j])
                tmp_file.fcounters[j] = infile->fcounters[j];
            else
                tmp_file.fcounters[j] = inoutfile->fcounters[j];
        }

        /* sum */
        for(j=MPIIO_F_READ_TIME; j<=MPIIO_F_META_TIME; j++)
        {
            tmp_file.fcounters[j] = infile->fcounters[j] + inoutfile->fcounters[j];
        }

        /* max (special case): the size counter travels with whichever
         * record supplies the larger time
         */
        if(infile->fcounters[MPIIO_F_MAX_READ_TIME] >
            inoutfile->fcounters[MPIIO_F_MAX_READ_TIME])
        {
            tmp_file.fcounters[MPIIO_F_MAX_READ_TIME] =
                infile->fcounters[MPIIO_F_MAX_READ_TIME];
            tmp_file.counters[MPIIO_MAX_READ_TIME_SIZE] =
                infile->counters[MPIIO_MAX_READ_TIME_SIZE];
        }
        else
        {
            tmp_file.fcounters[MPIIO_F_MAX_READ_TIME] =
                inoutfile->fcounters[MPIIO_F_MAX_READ_TIME];
            tmp_file.counters[MPIIO_MAX_READ_TIME_SIZE] =
                inoutfile->counters[MPIIO_MAX_READ_TIME_SIZE];
        }

        if(infile->fcounters[MPIIO_F_MAX_WRITE_TIME] >
            inoutfile->fcounters[MPIIO_F_MAX_WRITE_TIME])
        {
            tmp_file.fcounters[MPIIO_F_MAX_WRITE_TIME] =
                infile->fcounters[MPIIO_F_MAX_WRITE_TIME];
            tmp_file.counters[MPIIO_MAX_WRITE_TIME_SIZE] =
                infile->counters[MPIIO_MAX_WRITE_TIME_SIZE];
        }
        else
        {
            tmp_file.fcounters[MPIIO_F_MAX_WRITE_TIME] =
                inoutfile->fcounters[MPIIO_F_MAX_WRITE_TIME];
            tmp_file.counters[MPIIO_MAX_WRITE_TIME_SIZE] =
                inoutfile->counters[MPIIO_MAX_WRITE_TIME_SIZE];
        }

        /* min (zeroes are ok here; some procs don't do I/O) */
        if(infile->fcounters[MPIIO_F_FASTEST_RANK_TIME] <
            inoutfile->fcounters[MPIIO_F_FASTEST_RANK_TIME])
        {
            tmp_file.counters[MPIIO_FASTEST_RANK] =
                infile->counters[MPIIO_FASTEST_RANK];
            tmp_file.counters[MPIIO_FASTEST_RANK_BYTES] =
                infile->counters[MPIIO_FASTEST_RANK_BYTES];
            tmp_file.fcounters[MPIIO_F_FASTEST_RANK_TIME] =
                infile->fcounters[MPIIO_F_FASTEST_RANK_TIME];
        }
        else
        {
            tmp_file.counters[MPIIO_FASTEST_RANK] =
                inoutfile->counters[MPIIO_FASTEST_RANK];
            tmp_file.counters[MPIIO_FASTEST_RANK_BYTES] =
                inoutfile->counters[MPIIO_FASTEST_RANK_BYTES];
            tmp_file.fcounters[MPIIO_F_FASTEST_RANK_TIME] =
                inoutfile->fcounters[MPIIO_F_FASTEST_RANK_TIME];
        }

        /* max */
        if(infile->fcounters[MPIIO_F_SLOWEST_RANK_TIME] >
           inoutfile->fcounters[MPIIO_F_SLOWEST_RANK_TIME])
        {
            tmp_file.counters[MPIIO_SLOWEST_RANK] =
                infile->counters[MPIIO_SLOWEST_RANK];
            tmp_file.counters[MPIIO_SLOWEST_RANK_BYTES] =
                infile->counters[MPIIO_SLOWEST_RANK_BYTES];
            tmp_file.fcounters[MPIIO_F_SLOWEST_RANK_TIME] =
                infile->fcounters[MPIIO_F_SLOWEST_RANK_TIME];
        }
        else
        {
            tmp_file.counters[MPIIO_SLOWEST_RANK] =
                inoutfile->counters[MPIIO_SLOWEST_RANK];
            tmp_file.counters[MPIIO_SLOWEST_RANK_BYTES] =
                inoutfile->counters[MPIIO_SLOWEST_RANK_BYTES];
            tmp_file.fcounters[MPIIO_F_SLOWEST_RANK_TIME] =
                inoutfile->fcounters[MPIIO_F_SLOWEST_RANK_TIME];
        }

        /* update pointers: publish the merged record and advance to the
         * next pair
         */
        *inoutfile = tmp_file;
        inoutfile++;
        infile++;
    }

    return;
}

1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
/* Compute per-record variances across ranks for shared file records.
 *
 * mod_comm:         communicator used for the reductions (root is rank 0)
 * inrec_array:      this rank's shared records (shared_rec_count entries)
 * outrec_array:     reduced records; only written on rank 0, where
 *                   MPIIO_F_VARIANCE_RANK_TIME and
 *                   MPIIO_F_VARIANCE_RANK_BYTES are filled in
 * shared_rec_count: number of shared records
 *
 * Two reductions are performed with darshan_variance_reduce as the
 * operator: one over read+write+meta time, one over bytes read+written.
 * All resources created here (datatype, op, buffers) are released on every
 * exit path.
 *
 * NOTE(review): the early exit on allocation failure skips the collective
 * PMPI_Reduce calls on this rank only, as in the original code -- confirm
 * callers/peers can tolerate that.
 */
static void mpiio_shared_record_variance(MPI_Comm mod_comm,
    struct darshan_mpiio_file *inrec_array, struct darshan_mpiio_file *outrec_array,
    int shared_rec_count)
{
    MPI_Datatype var_dt;
    MPI_Op var_op;
    int i;
    struct darshan_variance_dt *var_send_buf = NULL;
    struct darshan_variance_dt *var_recv_buf = NULL;

    /* opaque byte-sized datatype covering one darshan_variance_dt element */
    DARSHAN_MPI_CALL(PMPI_Type_contiguous)(sizeof(struct darshan_variance_dt),
        MPI_BYTE, &var_dt);
    DARSHAN_MPI_CALL(PMPI_Type_commit)(&var_dt);

    DARSHAN_MPI_CALL(PMPI_Op_create)(darshan_variance_reduce, 1, &var_op);

    var_send_buf = malloc(shared_rec_count * sizeof(struct darshan_variance_dt));
    if(!var_send_buf)
        goto cleanup;

    /* only the reduction root needs a receive buffer */
    if(my_rank == 0)
    {
        var_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_variance_dt));

        if(!var_recv_buf)
            goto cleanup;
    }

    /* get total i/o time variances for shared records */

    for(i=0; i<shared_rec_count; i++)
    {
        var_send_buf[i].n = 1;
        var_send_buf[i].S = 0;
        var_send_buf[i].T = inrec_array[i].fcounters[MPIIO_F_READ_TIME] +
                            inrec_array[i].fcounters[MPIIO_F_WRITE_TIME] +
                            inrec_array[i].fcounters[MPIIO_F_META_TIME];
    }

    DARSHAN_MPI_CALL(PMPI_Reduce)(var_send_buf, var_recv_buf, shared_rec_count,
        var_dt, var_op, 0, mod_comm);

    if(my_rank == 0)
    {
        for(i=0; i<shared_rec_count; i++)
        {
            outrec_array[i].fcounters[MPIIO_F_VARIANCE_RANK_TIME] =
                (var_recv_buf[i].S / var_recv_buf[i].n);
        }
    }

    /* get total bytes moved variances for shared records */

    for(i=0; i<shared_rec_count; i++)
    {
        var_send_buf[i].n = 1;
        var_send_buf[i].S = 0;
        var_send_buf[i].T = (double)
                            inrec_array[i].counters[MPIIO_BYTES_READ] +
                            inrec_array[i].counters[MPIIO_BYTES_WRITTEN];
    }

    DARSHAN_MPI_CALL(PMPI_Reduce)(var_send_buf, var_recv_buf, shared_rec_count,
        var_dt, var_op, 0, mod_comm);

    if(my_rank == 0)
    {
        for(i=0; i<shared_rec_count; i++)
        {
            outrec_array[i].fcounters[MPIIO_F_VARIANCE_RANK_BYTES] =
                (var_recv_buf[i].S / var_recv_buf[i].n);
        }
    }

cleanup:
    /* single exit path: release the MPI handles and both buffers
     * (free(NULL) is a no-op for buffers that were never allocated)
     */
    DARSHAN_MPI_CALL(PMPI_Type_free)(&var_dt);
    DARSHAN_MPI_CALL(PMPI_Op_free)(&var_op);
    free(var_send_buf);
    free(var_recv_buf);

    return;
}

1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
/**************************************************************************
 * Functions exported by MPI-IO module for coordinating with darshan-core *
 **************************************************************************/

/* Module callback (installed as .begin_shutdown in
 * mpiio_runtime_initialize): turns off MPI-IO instrumentation.
 * Requires that the module runtime has been initialized.
 */
static void mpiio_begin_shutdown()
{
    assert(mpiio_runtime);

    /* set the disable flag while holding the module lock so that no
     * further instrumentation happens while Darshan shuts down
     */
    MPIIO_LOCK();
    instrumentation_disabled = 1;
    MPIIO_UNLOCK();
}

1308
static void mpiio_get_output_data(
1309
1310
1311
1312
1313
    MPI_Comm mod_comm,
    darshan_record_id *shared_recs,
    int shared_rec_count,
    void **mpiio_buf,
    int *mpiio_buf_sz)
1314
{
1315
1316
1317
1318
1319
1320
1321
1322
1323
    struct mpiio_file_runtime *file;
    struct mpiio_file_runtime* tmp;
    int i;
    double mpiio_time;
    void *red_send_buf = NULL;
    void *red_recv_buf = NULL;
    MPI_Datatype red_type;
    MPI_Op red_op;

1324
1325
    assert(mpiio_runtime);

1326
1327
    /* go through and set the 4 most common access sizes for MPI-IO */
    for(i = 0; i < mpiio_runtime->file_array_ndx; i++)
1328
    {
1329
1330
1331
1332
1333
1334
        tmp = &(mpiio_runtime->file_runtime_array[i]);

        /* common access sizes */
        darshan_walk_common_vals(tmp->access_root,
            &(tmp->file_record->counters[MPIIO_ACCESS1_ACCESS]),
            &(tmp->file_record->counters[MPIIO_ACCESS1_COUNT]));
1335
    }
1336

1337
1338
1339
1340
1341
    /* if there are globally shared files, do a shared file reduction */
    /* NOTE: the shared file reduction is also skipped if the 
     * DARSHAN_DISABLE_SHARED_REDUCTION environment variable is set.
     */
    if(shared_rec_count && !getenv("DARSHAN_DISABLE_SHARED_REDUCTION"))
1342
    {
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
        /* necessary initialization of shared records */
        for(i = 0; i < shared_rec_count; i++)
        {
            HASH_FIND(hlink, mpiio_runtime->file_hash, &shared_recs[i],
                sizeof(darshan_record_id), file);
            assert(file);

            mpiio_time =
                file->file_record->fcounters[MPIIO_F_READ_TIME] +
                file->file_record->fcounters[MPIIO_F_WRITE_TIME] +
                file->file_record->fcounters[MPIIO_F_META_TIME];

            /* initialize fastest/slowest info prior to the reduction */
            file->file_record->counters[MPIIO_FASTEST_RANK] =
1357
                file->file_record->base_rec.rank;
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
            file->file_record->counters[MPIIO_FASTEST_RANK_BYTES] =
                file->file_record->counters[MPIIO_BYTES_READ] +
                file->file_record->counters[MPIIO_BYTES_WRITTEN];
            file->file_record->fcounters[MPIIO_F_FASTEST_RANK_TIME] =
                mpiio_time;

            /* until reduction occurs, we assume that this rank is both
             * the fastest and slowest. It is up to the reduction operator
             * to find the true min and max.
             */
            file->file_record->counters[MPIIO_SLOWEST_RANK] =
                file->file_record->counters[MPIIO_FASTEST_RANK];
            file->file_record->counters[MPIIO_SLOWEST_RANK_BYTES] =
                file->file_record->counters[MPIIO_FASTEST_RANK_BYTES];
            file->file_record->fcounters[MPIIO_F_SLOWEST_RANK_TIME] =
                file->file_record->fcounters[MPIIO_F_FASTEST_RANK_TIME];

1375
            file->file_record->base_rec.rank = -1;
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
        }

        /* sort the array of files descending by rank so that we get all of the 
         * shared files (marked by rank -1) in a contiguous portion at end 
         * of the array
         */
        qsort(mpiio_runtime->file_record_array, mpiio_runtime->file_array_ndx,
            sizeof(struct darshan_mpiio_file), mpiio_record_compare);

        /* make *send_buf point to the shared files at the end of sorted array */
        red_send_buf =
Shane Snyder's avatar
Shane Snyder committed
1387
            &(mpiio_runtime->file_record_array[mpiio_runtime->file_array_ndx-shared_rec_count]);
1388
1389
1390
1391
1392
1393

        /* allocate memory for the reduction output on rank 0 */
        if(my_rank == 0)
        {
            red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_mpiio_file));
            if(!red_recv_buf)
1394
            {
1395
                return;
1396
            }
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
        }

        /* construct a datatype for a MPIIO file record.  This is serving no purpose
         * except to make sure we can do a reduction on proper boundaries
         */
        DARSHAN_MPI_CALL(PMPI_Type_contiguous)(sizeof(struct darshan_mpiio_file),
            MPI_BYTE, &red_type);
        DARSHAN_MPI_CALL(PMPI_Type_commit)(&red_type);

        /* register a MPIIO file record reduction operator */
        DARSHAN_MPI_CALL(PMPI_Op_create)(mpiio_record_reduction_op, 1, &red_op);

        /* reduce shared MPIIO file records */
        DARSHAN_MPI_CALL(PMPI_Reduce)(red_send_buf, red_recv_buf,
            shared_rec_count, red_type, red_op, 0, mod_comm);

1413
1414
1415
1416
        /* get the time and byte variances for shared files */
        mpiio_shared_record_variance(mod_comm, red_send_buf, red_recv_buf,
            shared_rec_count);

1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
        /* clean up reduction state */
        if(my_rank == 0)
        {
            int tmp_ndx = mpiio_runtime->file_array_ndx - shared_rec_count;
            memcpy(&(mpiio_runtime->file_record_array[tmp_ndx]), red_recv_buf,
                shared_rec_count * sizeof(struct darshan_mpiio_file));
            free(red_recv_buf);
        }
        else
        {
            mpiio_runtime->file_array_ndx -= shared_rec_count;
        }

        DARSHAN_MPI_CALL(PMPI_Type_free)(&red_type);
        DARSHAN_MPI_CALL(PMPI_Op_free)(&red_op);
1432
1433
    }

1434
1435
    *mpiio_buf = (void *)(mpiio_runtime->file_record_array);
    *mpiio_buf_sz = mpiio_runtime->file_array_ndx * sizeof(struct darshan_mpiio_file);
1436
1437
1438
1439
1440
1441
1442
1443

    return;
}

static void mpiio_shutdown()
{
    struct mpiio_file_runtime_ref *ref, *tmp;

1444
1445
    assert(mpiio_runtime);

1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
    HASH_ITER(hlink, mpiio_runtime->fh_hash, ref, tmp)
    {
        HASH_DELETE(hlink, mpiio_runtime->fh_hash, ref);
        free(ref);
    }

    HASH_CLEAR(hlink, mpiio_runtime->file_hash); /* these entries are freed all at once below */

    free(mpiio_runtime->file_runtime_array);
    free(mpiio_runtime);
    mpiio_runtime = NULL;

    return;
Philip Carns's avatar
Philip Carns committed
1459
1460
}

Philip Carns's avatar
Philip Carns committed
1461
1462
1463
1464
1465
1466
1467
1468
/*
 * Local variables:
 *  c-indent-level: 4
 *  c-basic-offset: 4
 * End:
 *
 * vim: ts=8 sts=4 sw=4 expandtab
 */