/*
 * Copyright (C) 2015 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
 */

7
#define _XOPEN_SOURCE 500
8
#define _GNU_SOURCE
9

10
11
12
13
14
15
#include "darshan-runtime-config.h"

#include <stdio.h>
#ifdef HAVE_MNTENT_H
#include <mntent.h>
#endif
16
17
18
19
20
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include <pthread.h>
21
#include <fcntl.h>
22
23
#include <sys/types.h>
#include <sys/stat.h>
24
#include <sys/mman.h>
25
#include <sys/vfs.h>
26
#include <zlib.h>
27
#include <mpi.h>
28
#include <assert.h>
29

30
#include "uthash.h"
Shane Snyder's avatar
Shane Snyder committed
31
#include "darshan.h"
32
#include "darshan-core.h"
Shane Snyder's avatar
Shane Snyder committed
33
#include "darshan-dynamic.h"
34

35
extern char* __progname;
36
extern char* __progname_full;
37

38
/* internal variable declarations */
39
static struct darshan_core_runtime *darshan_core = NULL;
40
static pthread_mutex_t darshan_core_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
41
static int my_rank = -1;
42
static int nprocs = -1;
43
static int darshan_mem_alignment = 1;
44

45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/* paths prefixed with the following directories are not traced by darshan;
 * the list is terminated by a NULL entry so it can be walked without a
 * separate length
 */
char* darshan_path_exclusions[] = {
"/etc/",
"/dev/",
"/usr/",
"/bin/",
"/boot/",
"/lib/",
"/opt/",
"/sbin/",
"/sys/",
"/proc/",
NULL
};

60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/* the BG/Q module's init routine is only referenced when darshan is built
 * with DARSHAN_BGQ defined
 */
#ifdef DARSHAN_BGQ
extern void bgq_runtime_initialize();
#endif

/* array of init functions for modules which need to be statically
 * initialized by darshan at startup time
 *
 * the array always ends with a NULL entry, which marks the end of the list
 * (and is the only entry when no static modules are compiled in)
 */
void (*mod_static_init_fns[])(void) =
{
#ifdef DARSHAN_BGQ
    &bgq_runtime_initialize,
#endif
    NULL
};

Shane Snyder's avatar
Shane Snyder committed
75
76
77
/* acquire / release the mutex that guards the darshan core runtime state */
#define DARSHAN_CORE_LOCK() pthread_mutex_lock(&darshan_core_mutex)
#define DARSHAN_CORE_UNLOCK() pthread_mutex_unlock(&darshan_core_mutex)

78
79
80
81
82
83
/* FS mount information: capacity limits for the in-memory mount table */
#define DARSHAN_MAX_MNTS 64
#define DARSHAN_MAX_MNT_PATH 256
#define DARSHAN_MAX_MNT_TYPE 32

/* description of a single mounted file system */
struct mnt_data
{
    /* block size recorded for this mount */
    int block_size;
    /* mount point directory (NUL-terminated, possibly truncated) */
    char path[DARSHAN_MAX_MNT_PATH];
    /* file system type string (NUL-terminated, possibly truncated) */
    char type[DARSHAN_MAX_MNT_TYPE];
};

/* table of recorded mounts, and the number of entries currently in use */
static struct mnt_data mnt_data_array[DARSHAN_MAX_MNTS];
static int mnt_data_count = 0;

91
92
/* prototypes for internal helper functions */
static void darshan_log_record_hints_and_ver(
93
    struct darshan_core_runtime* core);
94
95
static void darshan_get_exe_and_mounts(
    struct darshan_core_runtime *core, int argc, char **argv);
96
97
static void darshan_block_size_from_path(
    const char *path, int *block_size);
98
99
static void darshan_get_logfile_name(
    char* logfile_name, int jobid, struct tm* start_tm);
100
static void darshan_get_shared_records(
101
102
    struct darshan_core_runtime *core, darshan_record_id **shared_recs,
    int *shared_rec_cnt);
103
static int darshan_log_open_all(
104
    char *logfile_name, MPI_File *log_fh);
105
static int darshan_deflate_buffer(
Shane Snyder's avatar
Shane Snyder committed
106
107
    void **pointers, int *lengths, int count, char *comp_buf,
    int *comp_buf_length);
108
static int darshan_log_write_record_hash(
109
    MPI_File log_fh, struct darshan_core_runtime *core,
110
111
112
    uint64_t *inout_off);
static int darshan_log_append_all(
    MPI_File log_fh, struct darshan_core_runtime *core, void *buf,
Shane Snyder's avatar
Shane Snyder committed
113
    int count, uint64_t *inout_off);
Shane Snyder's avatar
Shane Snyder committed
114
115
static void darshan_core_cleanup(
    struct darshan_core_runtime* core);
116

117
118
/* *********************************** */

Shane Snyder's avatar
Shane Snyder committed
119
void darshan_core_initialize(int argc, char **argv)
120
{
121
    struct darshan_core_runtime *init_core = NULL;
122
123
    int internal_timing_flag = 0;
    double init_start, init_time, init_max;
124
    char mmap_log_name[PATH_MAX];
125
126
127
    int mmap_fd;
    int mmap_size;
    int sys_page_size;
128
    char *envstr;
129
130
    char *jobid_str;
    int jobid;
131
132
    int ret;
    int tmpval;
133
    int i;
134
135

    DARSHAN_MPI_CALL(PMPI_Comm_size)(MPI_COMM_WORLD, &nprocs);
136
    DARSHAN_MPI_CALL(PMPI_Comm_rank)(MPI_COMM_WORLD, &my_rank);
137
138
139
140

    if(getenv("DARSHAN_INTERNAL_TIMING"))
        internal_timing_flag = 1;

141
    if(internal_timing_flag)
142
143
144
        init_start = DARSHAN_MPI_CALL(PMPI_Wtime)();

    /* setup darshan runtime if darshan is enabled and hasn't been initialized already */
145
    if(!getenv("DARSHAN_DISABLE") && !darshan_core)
146
    {
147
        #if (__DARSHAN_MEM_ALIGNMENT < 1)
148
149
            #error Darshan must be configured with a positive value for --with-mem-align
        #endif
150
        envstr = getenv(DARSHAN_MEM_ALIGNMENT_OVERRIDE);
151
152
153
154
155
156
157
158
159
160
161
        if(envstr)
        {
            ret = sscanf(envstr, "%d", &tmpval);
            /* silently ignore if the env variable is set poorly */
            if(ret == 1 && tmpval > 0)
            {
                darshan_mem_alignment = tmpval;
            }
        }
        else
        {
162
            darshan_mem_alignment = __DARSHAN_MEM_ALIGNMENT;
163
164
165
166
167
168
169
        }

        /* avoid floating point errors on faulty input */
        if (darshan_mem_alignment < 1)
        {
            darshan_mem_alignment = 1;
        }
170

171
172
173
        /* allocate structure to track darshan core runtime information */
        init_core = malloc(sizeof(*init_core));
        if(init_core)
174
        {
175
176
177
            memset(init_core, 0, sizeof(*init_core));
            init_core->wtime_offset = DARSHAN_MPI_CALL(PMPI_Wtime)();

178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
            /* Use DARSHAN_JOBID_OVERRIDE for the env var for __DARSHAN_JOBID */
            envstr = getenv(DARSHAN_JOBID_OVERRIDE);
            if(!envstr)
            {
                envstr = __DARSHAN_JOBID;
            }

            /* find a job id */
            jobid_str = getenv(envstr);
            if(jobid_str)
            {
                /* in cobalt we can find it in env var */
                ret = sscanf(jobid_str, "%d", &jobid);
            }
            if(!jobid_str || ret != 1)
            {
                /* use pid as fall back */
                jobid = getpid();
            }

198
199
            sys_page_size = sysconf(_SC_PAGESIZE);
            assert(sys_page_size > 0);
200

201
            /* XXX: MMAP */
202
203
            mmap_size = sizeof(struct darshan_header) + DARSHAN_JOB_RECORD_SIZE +
                DARSHAN_MOD_MEM_MAX;
204
205
            if(mmap_size % sys_page_size)
                mmap_size = ((mmap_size / sys_page_size) + 1) * sys_page_size;
206

207
208
209
210
211
212
213
            /* construct a unique temporary log file name for this process
             * to write mmap log data to
             */
            snprintf(mmap_log_name, PATH_MAX, "/tmp/darshan_job%d.%d",
                jobid, my_rank);

            /* create the temporary mmapped darshan log */
214
215
            mmap_fd = open(mmap_log_name, O_CREAT|O_RDWR|O_EXCL , 0644);
            if(mmap_fd < 0)
216
            {
217
218
219
220
                fprintf(stderr, "darshan library warning: "
                    "unable to create darshan log file %s\n", mmap_log_name);
                free(init_core);
                return;
221
222
            }

223
            /* TODO: what's more expensive? truncate or write zeros? perf test this call and later accesses */
224
225
226
227
228
229
230
231
232
233
234
235
236
237
            /* allocate the necessary space in the log file */
            ret = ftruncate(mmap_fd, mmap_size);
            if(ret < 0)
            {
                fprintf(stderr, "darshan library warning: "
                    "unable to allocate darshan log file %s\n", mmap_log_name);
                free(init_core);
                close(mmap_fd);
                unlink(mmap_log_name);
                return;
            }

            /* memory map buffers for getting at least some summary i/o data
             * into a log file if darshan does not shut down properly
238
             */
239
240
            void *mmap_p = mmap(NULL, mmap_size, PROT_WRITE, MAP_SHARED, mmap_fd, 0);
            if(mmap_p == MAP_FAILED)
241
            {
242
243
244
245
246
247
                fprintf(stderr, "darshan library warning: "
                    "unable to mmap darshan log file %s\n", mmap_log_name);
                free(init_core);
                close(mmap_fd);
                unlink(mmap_log_name);
                return;
248
249
            }

250
251
252
            /* close darshan log file (this does *not* unmap the log file) */
            close(mmap_fd);

253
            /* set the memory pointers for each log file region */
254
            init_core->log_hdr_p = (struct darshan_header *)mmap_p;
255
            init_core->log_job_p = (struct darshan_job *)
256
                ((char *)init_core->log_hdr_p + sizeof(struct darshan_header));
257
            init_core->log_exemnt_p = (char *)
258
259
260
261
262
                ((char *)init_core->log_job_p + sizeof(struct darshan_job));
            init_core->log_rec_p = (void *)
                ((char *)init_core->log_exemnt_p + DARSHAN_EXE_LEN + 1);
            init_core->log_mod_p = (void *)
                ((char *)init_core->log_rec_p + DARSHAN_RECORD_BUF_SIZE);
263
264
265
266
267
268
            /* XXX: MMAP */

            /* set known header fields for the log file */
            strcpy(init_core->log_hdr_p->version_string, DARSHAN_LOG_VERSION);
            init_core->log_hdr_p->magic_nr = DARSHAN_MAGIC_NR;
            init_core->log_hdr_p->comp_type = DARSHAN_NO_COMP;
269
270
            init_core->log_hdr_p->rec_map.off =
                sizeof(struct darshan_header) + DARSHAN_JOB_RECORD_SIZE;
271

272
273
274
275
276
            /* set known job-level metadata fields for the log file */
            init_core->log_job_p->uid = getuid();
            init_core->log_job_p->start_time = time(NULL);
            init_core->log_job_p->nprocs = nprocs;
            init_core->log_job_p->jobid = (int64_t)jobid;
277
278
279
280
281
282

            /* if we are using any hints to write the log file, then record those
             * hints with the darshan job information
             */
            darshan_log_record_hints_and_ver(init_core);

283
            /* collect information about command line and mounted file systems */
284
            darshan_get_exe_and_mounts(init_core, argc, argv);
285

Shane Snyder's avatar
Shane Snyder committed
286
            /* bootstrap any modules with static initialization routines */
287
288
289
290
291
292
293
294
            i = 0;
            while(mod_static_init_fns[i])
            {
                (*mod_static_init_fns[i])();
                i++;
            }

            darshan_core = init_core;
295
        }
296
297
    }

298
299
300
301
302
    if(internal_timing_flag)
    {
        init_time = DARSHAN_MPI_CALL(PMPI_Wtime)() - init_start;
        DARSHAN_MPI_CALL(PMPI_Reduce)(&init_time, &init_max, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
303
        if(my_rank == 0)
304
        {
305
306
            fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
            fprintf(stderr, "darshan:init\t%d\t%f\n", nprocs, init_max);
307
308
309
310
311
312
        }
    }

    return;
}

Shane Snyder's avatar
Shane Snyder committed
313
void darshan_core_shutdown()
314
{
315
    int i;
316
    struct darshan_core_runtime *final_core;
317
    int internal_timing_flag = 0;
318
    double start_log_time;
319
    double tm_end;
320
321
322
323

    if(getenv("DARSHAN_INTERNAL_TIMING"))
        internal_timing_flag = 1;

324
325
    if(internal_timing_flag)
        start_log_time = DARSHAN_MPI_CALL(PMPI_Wtime)();
326

Shane Snyder's avatar
Shane Snyder committed
327
    /* disable darhan-core while we shutdown */
328
    DARSHAN_CORE_LOCK();
329
    if(!darshan_core)
330
    {
331
        DARSHAN_CORE_UNLOCK();
332
333
        return;
    }
334
335
    final_core = darshan_core;
    darshan_core = NULL;
Shane Snyder's avatar
Shane Snyder committed
336
337
338
339
340

    for(i = 0; i < DARSHAN_MAX_MODS; i++)
    {
        if(final_core->mod_array[i])
        {
341
            final_core->mod_array[i]->mod_funcs.begin_shutdown();
Shane Snyder's avatar
Shane Snyder committed
342
343
        }
    }
344
    DARSHAN_CORE_UNLOCK();
345

346
    final_core->log_job_p->end_time = time(NULL);
347

348
    darshan_core_cleanup(final_core);
349

350
    if(internal_timing_flag)
351
    {
352
353
354
355
356
357
358
359
360
361
362
        double all_tm, all_slowest;

        tm_end = DARSHAN_MPI_CALL(PMPI_Wtime)();

        all_tm = tm_end - start_log_time;

        DARSHAN_MPI_CALL(PMPI_Reduce)(&all_tm, &all_slowest, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

        if(my_rank == 0)
        {
363
364
            fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
            fprintf(stderr, "darshan:core_shutdown\t%d\t%f\n", nprocs, all_slowest);
365
        }
366
    }
367

368
369
    return;
}
370

Shane Snyder's avatar
Shane Snyder committed
371
/* *********************************** */
372

373
/* record any hints used to write the darshan log in the job data */
374
static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* core)
375
376
{
    char* hints;
377
    char* job_hints;
378
379
380
381
382
383
    int meta_remain = 0;
    char* m;

    /* check environment variable to see if the default MPI file hints have
     * been overridden
     */
384
    hints = getenv(DARSHAN_LOG_HINTS_OVERRIDE);
385
386
    if(!hints)
    {
387
        hints = __DARSHAN_LOG_HINTS;
388
389
390
391
392
    }

    if(!hints || strlen(hints) < 1)
        return;

393
394
    job_hints = strdup(hints);
    if(!job_hints)
395
396
397
        return;

    meta_remain = DARSHAN_JOB_METADATA_LEN -
398
        strlen(core->log_job_p->metadata) - 1;
399
400
    if(meta_remain >= (strlen(PACKAGE_VERSION) + 9))
    {
401
        sprintf(core->log_job_p->metadata, "lib_ver=%s\n", PACKAGE_VERSION);
402
403
        meta_remain -= (strlen(PACKAGE_VERSION) + 9);
    }
404
    if(meta_remain >= (3 + strlen(job_hints)))
405
    {
406
        m = core->log_job_p->metadata + strlen(core->log_job_p->metadata);
407
        /* We have room to store the hints in the metadata portion of
408
         * the job structure.  We just prepend an h= to the hints list.  The
409
410
411
         * metadata parser will ignore = characters that appear in the value
         * portion of the metadata key/value pair.
         */
412
        sprintf(m, "h=%s\n", job_hints);
413
    }
414
    free(job_hints);
415
416
417
418

    return;
}

419
420
421
422
423
424
425
426
427
428
429
430
431
432
/* qsort() comparator for struct mnt_data entries: orders entries by the
 * length of their mount path, longest first; equal lengths compare equal
 */
static int mnt_data_cmp(const void* a, const void* b)
{
    const struct mnt_data *lhs = a;
    const struct mnt_data *rhs = b;
    const size_t lhs_len = strlen(lhs->path);
    const size_t rhs_len = strlen(rhs->path);

    if(lhs_len > rhs_len)
        return(-1);
    if(lhs_len < rhs_len)
        return(1);
    return(0);
}

/* adds an entry to table of mounted file systems */
433
static void add_entry(char* buf, int* space_left, struct mntent *entry)
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
{
    int ret;
    char tmp_mnt[256];
    struct statfs statfsbuf;

    strncpy(mnt_data_array[mnt_data_count].path, entry->mnt_dir,
        DARSHAN_MAX_MNT_PATH-1);
    strncpy(mnt_data_array[mnt_data_count].type, entry->mnt_type,
        DARSHAN_MAX_MNT_TYPE-1);
    /* NOTE: we now try to detect the preferred block size for each file 
     * system using fstatfs().  On Lustre we assume a size of 1 MiB 
     * because fstatfs() reports 4 KiB. 
     */
#ifndef LL_SUPER_MAGIC
#define LL_SUPER_MAGIC 0x0BD00BD0
#endif
    ret = statfs(entry->mnt_dir, &statfsbuf);
    if(ret == 0 && statfsbuf.f_type != LL_SUPER_MAGIC)
        mnt_data_array[mnt_data_count].block_size = statfsbuf.f_bsize;
    else if(ret == 0 && statfsbuf.f_type == LL_SUPER_MAGIC)
        mnt_data_array[mnt_data_count].block_size = 1024*1024;
    else
        mnt_data_array[mnt_data_count].block_size = 4096;

458
    /* store mount information with the job-level metadata in darshan log */
459
    ret = snprintf(tmp_mnt, 256, "\n%s\t%s",
460
461
462
        entry->mnt_type, entry->mnt_dir);
    if(ret < 256 && strlen(tmp_mnt) <= (*space_left))
    {
463
        strcat(buf, tmp_mnt);
464
465
466
467
468
469
470
        (*space_left) -= strlen(tmp_mnt);
    }

    mnt_data_count++;
    return;
}

471
/* darshan_get_exe_and_mounts()
472
473
 *
 * collects command line and list of mounted file systems into a string that
474
 * will be stored with the job-level metadata
475
 */
476
static void darshan_get_exe_and_mounts(struct darshan_core_runtime *core,
477
    int argc, char **argv)
478
479
480
481
{
    FILE* tab;
    struct mntent *entry;
    char* exclude;
482
483
484
485
    char* truncate_string = "<TRUNCATED>";
    int truncate_offset;
    int space_left = DARSHAN_EXE_LEN;
    int i;
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
    int tmp_index = 0;
    int skip = 0;

    /* skip these fs types */
    static char* fs_exclusions[] = {
        "tmpfs",
        "proc",
        "sysfs",
        "devpts",
        "binfmt_misc",
        "fusectl",
        "debugfs",
        "securityfs",
        "nfsd",
        "none",
        "rpc_pipefs",
        "hugetlbfs",
        "cgroup",
        NULL
    };

507
508
509
    /* record exe and arguments */
    for(i=0; i<argc; i++)
    {
510
511
        strncat(core->log_exemnt_p, argv[i], space_left);
        space_left = DARSHAN_EXE_LEN-strlen(core->log_exemnt_p);
512
513
        if(i < (argc-1))
        {
514
515
            strncat(core->log_exemnt_p, " ", space_left);
            space_left = DARSHAN_EXE_LEN-strlen(core->log_exemnt_p);
516
517
518
519
520
521
522
523
        }
    }

    /* if we don't see any arguments, then use glibc symbol to get
     * program name at least (this happens in fortran)
     */
    if(argc == 0)
    {
524
525
526
527
        strncat(core->log_exemnt_p, __progname_full, space_left);
        space_left = DARSHAN_EXE_LEN-strlen(core->log_exemnt_p);
        strncat(core->log_exemnt_p, " <unknown args>", space_left);
        space_left = DARSHAN_EXE_LEN-strlen(core->log_exemnt_p);
528
529
530
531
532
533
    }

    if(space_left == 0)
    {
        /* we ran out of room; mark that string was truncated */
        truncate_offset = DARSHAN_EXE_LEN - strlen(truncate_string);
534
        sprintf(&(core->log_exemnt_p[truncate_offset]), "%s",
535
536
537
            truncate_string);
    }

538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
    /* we make two passes through mounted file systems; in the first pass we
     * grab any non-nfs mount points, then on the second pass we grab nfs
     * mount points
     */

    tab = setmntent("/etc/mtab", "r");
    if(!tab)
        return;
    /* loop through list of mounted file systems */
    while(mnt_data_count<DARSHAN_MAX_MNTS && (entry = getmntent(tab)) != NULL)
    {
        /* filter out excluded fs types */
        tmp_index = 0;
        skip = 0;
        while((exclude = fs_exclusions[tmp_index]))
        {
            if(!(strcmp(exclude, entry->mnt_type)))
            {
                skip =1;
                break;
            }
            tmp_index++;
        }

        if(skip || (strcmp(entry->mnt_type, "nfs") == 0))
            continue;

565
        add_entry(core->log_exemnt_p, &space_left, entry);
566
567
568
569
570
571
572
573
574
575
576
577
    }
    endmntent(tab);

    tab = setmntent("/etc/mtab", "r");
    if(!tab)
        return;
    /* loop through list of mounted file systems */
    while(mnt_data_count<DARSHAN_MAX_MNTS && (entry = getmntent(tab)) != NULL)
    {
        if(strcmp(entry->mnt_type, "nfs") != 0)
            continue;

578
        add_entry(core->log_exemnt_p, &space_left, entry);
579
580
581
    }
    endmntent(tab);

582
    /* sort mount points in order of longest path to shortest path.  This is
583
584
585
586
587
588
589
     * necessary so that if we try to match file paths to mount points later
     * we don't match on "/" every time.
     */
    qsort(mnt_data_array, mnt_data_count, sizeof(mnt_data_array[0]), mnt_data_cmp);
    return;
}

590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
/* creates a record reference for (name, id), stores the id and name in the
 * core's record buffer and links the reference into the core's record hash
 *
 * core: runtime owning the record buffer and hash table
 * name: NUL-terminated record name (copied into the record buffer)
 * id:   darshan record id for the name
 * ref:  out; receives the malloc'ed reference, or NULL if the allocation
 *       failed.  It is left untouched when the record does not fit in the
 *       record buffer.
 */
static void darshan_add_record_hashref(struct darshan_core_runtime *core,
    char *name, darshan_record_id id, struct darshan_core_record_ref **ref)
{
    /* bytes this record takes in the buffer: id + name + terminator */
    int rec_bytes = sizeof(darshan_record_id) + strlen(name) + 1;
    struct darshan_core_record_ref *new_ref;
    char *rec_slot;

    /* no room left in the record buffer */
    if((rec_bytes + core->rec_hash_sz) > DARSHAN_RECORD_BUF_SIZE)
        return;

    new_ref = malloc(sizeof(*new_ref));
    *ref = new_ref;
    if(!new_ref)
        return;
    memset(new_ref, 0, sizeof(*new_ref));

    /* store the rec id at the current end of the record hash buffer ... */
    rec_slot = (char *)core->log_rec_p + core->rec_hash_sz;
    *(darshan_record_id *)rec_slot = id;

    /* ... followed by the full name; the reference's name pointer refers
     * directly to that location in the buffer
     */
    new_ref->rec.name = rec_slot + sizeof(darshan_record_id);
    new_ref->rec.id = id;
    strcpy(new_ref->rec.name, name);

    /* TODO: look at HASH_ADD_KEYPTR, use same strategy (big contig pool) for non-mmap darshan */
    HASH_ADD(hlink, core->rec_hash, rec.id, sizeof(darshan_record_id), new_ref);

    /* account for the new record */
    core->rec_hash_cnt++;
    core->rec_hash_sz += rec_bytes;
    core->log_hdr_p->rec_map.len += rec_bytes;

    return;
}

637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
static void darshan_block_size_from_path(const char *path, int *block_size)
{
    int i;
    *block_size = -1;

    for(i=0; i<mnt_data_count; i++)
    {
        if(!(strncmp(mnt_data_array[i].path, path, strlen(mnt_data_array[i].path))))
        {
            *block_size = mnt_data_array[i].block_size;
            return;
        }
    }

    return;
}

654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
/* construct the darshan log file name
 *
 * logfile_name: out buffer of at least PATH_MAX bytes; set to the empty
 *               string when no usable name can be produced
 * jobid:        job identifier embedded in generated names
 * start_tm:     job start time used for the date portions of the name
 *
 * If DARSHAN_LOGFILE is set, its value is used verbatim (when it fits).
 * Otherwise a name is generated under, in order of preference, a directory
 * taken from one of the __DARSHAN_LOG_ENV variables (if configured) or the
 * DARSHAN_LOG_PATH_OVERRIDE / __DARSHAN_LOG_PATH directory.
 */
static void darshan_get_logfile_name(char* logfile_name, int jobid, struct tm* start_tm)
{
    char* user_logfile_name;
    char* logpath;
    char* logname_string;
    char* logpath_override = NULL;
#ifdef __DARSHAN_LOG_ENV
    char env_check[256];
    char* env_tok;
    char* env_saveptr = NULL;
#endif
    uint64_t hlevel;
    char hname[HOST_NAME_MAX] = {0};
    uint64_t logmod;
    char cuser[L_cuserid] = {0};
    int ret;

    /* first, check if user specifies a complete logpath to use */
    user_logfile_name = getenv("DARSHAN_LOGFILE");
    if(user_logfile_name)
    {
        if(strlen(user_logfile_name) >= (PATH_MAX-1))
        {
            fprintf(stderr, "darshan library warning: user log file name too long.\n");
            logfile_name[0] = '\0';
        }
        else
        {
            strcpy(logfile_name, user_logfile_name);
        }
    }
    else
    {
        /* otherwise, generate the log path automatically */

        /* Use DARSHAN_LOG_PATH_OVERRIDE for the value or __DARSHAN_LOG_PATH */
        logpath = getenv(DARSHAN_LOG_PATH_OVERRIDE);
        if(!logpath)
        {
#ifdef __DARSHAN_LOG_PATH
            logpath = __DARSHAN_LOG_PATH;
#endif
        }

        /* get the username for this job.  In order we will try each of the
         * following until one of them succeeds:
         *
         * - cuserid()
         * - getenv("LOGNAME")
         * - snprintf(..., geteuid());
         *
         * Note that we do not use getpwuid() because it generally will not
         * work in statically compiled binaries.
         */

#ifndef DARSHAN_DISABLE_CUSERID
        cuserid(cuser);
#endif

        /* if cuserid() didn't work, then check the environment */
        if(strcmp(cuser, "") == 0)
        {
            logname_string = getenv("LOGNAME");
            if(logname_string)
            {
                /* cuser is zero-initialized, so copying at most
                 * L_cuserid-1 bytes leaves it NUL-terminated
                 */
                strncpy(cuser, logname_string, (L_cuserid-1));
            }
        }

        /* if cuserid() and environment both fail, then fall back to uid */
        if(strcmp(cuser, "") == 0)
        {
            uid_t uid = geteuid();
            snprintf(cuser, sizeof(cuser), "%u", uid);
        }

        /* generate a random number to help differentiate the log */
        hlevel=DARSHAN_MPI_CALL(PMPI_Wtime)() * 1000000;
        (void)gethostname(hname, sizeof(hname));
        /* gethostname() may leave a truncated name unterminated */
        hname[sizeof(hname)-1] = '\0';
        logmod = darshan_hash((void*)hname,strlen(hname),hlevel);

        /* see if darshan was configured using the --with-logpath-by-env
         * argument, which allows the user to specify an absolute path to
         * place logs via an env variable.
         */
#ifdef __DARSHAN_LOG_ENV
        /* just silently skip if the environment variable list is too big */
        if(strlen(__DARSHAN_LOG_ENV) < 256)
        {
            /* copy env variable list to a temporary buffer */
            strcpy(env_check, __DARSHAN_LOG_ENV);
            /* tokenize the comma-separated list, checking each env variable
             * in order and stopping as soon as we find a match
             */
            for(env_tok = strtok_r(env_check, ",", &env_saveptr);
                env_tok != NULL;
                env_tok = strtok_r(NULL, ",", &env_saveptr))
            {
                logpath_override = getenv(env_tok);
                if(logpath_override)
                    break;
            }
        }
#endif

        if(logpath_override)
        {
            ret = snprintf(logfile_name, PATH_MAX,
                "%s/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
                logpath_override,
                cuser, __progname, jobid,
                (start_tm->tm_mon+1),
                start_tm->tm_mday,
                (start_tm->tm_hour*60*60 + start_tm->tm_min*60 + start_tm->tm_sec),
                logmod);
            /* snprintf() returns the length the full string would have had,
             * so any value >= PATH_MAX means the name was truncated
             */
            if(ret < 0 || ret >= PATH_MAX)
            {
                /* file name was too big; squish it down */
                snprintf(logfile_name, PATH_MAX,
                    "%s/id%d.darshan_partial",
                    logpath_override, jobid);
            }
        }
        else if(logpath)
        {
            ret = snprintf(logfile_name, PATH_MAX,
                "%s/%d/%d/%d/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
                logpath, (start_tm->tm_year+1900),
                (start_tm->tm_mon+1), start_tm->tm_mday,
                cuser, __progname, jobid,
                (start_tm->tm_mon+1),
                start_tm->tm_mday,
                (start_tm->tm_hour*60*60 + start_tm->tm_min*60 + start_tm->tm_sec),
                logmod);
            if(ret < 0 || ret >= PATH_MAX)
            {
                /* file name was too big; squish it down */
                snprintf(logfile_name, PATH_MAX,
                    "%s/id%d.darshan_partial",
                    logpath, jobid);
            }
        }
        else
        {
            /* no log directory available */
            logfile_name[0] = '\0';
        }
    }

    return;
}

809
static void darshan_get_shared_records(struct darshan_core_runtime *core,
810
    darshan_record_id **shared_recs, int *shared_rec_cnt)
811
{
812
    int i, j;
813
    int tmp_cnt = core->rec_hash_cnt;
814
    struct darshan_core_record_ref *tmp, *ref;
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
    darshan_record_id *id_array;
    uint64_t *mod_flags;
    uint64_t *global_mod_flags;

    /* broadcast root's number of records to all other processes */
    DARSHAN_MPI_CALL(PMPI_Bcast)(&tmp_cnt, 1, MPI_INT, 0, MPI_COMM_WORLD);

    /* use root record count to allocate data structures */
    id_array = malloc(tmp_cnt * sizeof(darshan_record_id));
    mod_flags = malloc(tmp_cnt * sizeof(uint64_t));
    global_mod_flags = malloc(tmp_cnt * sizeof(uint64_t));
    *shared_recs = malloc(tmp_cnt * sizeof(darshan_record_id));
    assert(id_array && mod_flags && global_mod_flags && *shared_recs);

    memset(mod_flags, 0, tmp_cnt * sizeof(uint64_t));
    memset(global_mod_flags, 0, tmp_cnt * sizeof(uint64_t));
    memset(*shared_recs, 0, tmp_cnt * sizeof(darshan_record_id));
832
833
834
835

    /* first, determine list of records root process has opened */
    if(my_rank == 0)
    {
836
        i = 0;
837
        HASH_ITER(hlink, core->rec_hash, ref, tmp)
838
        {
839
            id_array[i++] = ref->rec.id;           
840
841
842
843
        }
    }

    /* broadcast root's list of records to all other processes */
844
    DARSHAN_MPI_CALL(PMPI_Bcast)(id_array, (tmp_cnt * sizeof(darshan_record_id)),
845
846
847
        MPI_BYTE, 0, MPI_COMM_WORLD);

    /* everyone looks to see if they opened the same records as root */
848
    for(i=0; i<tmp_cnt; i++)
849
    {
850
851
        HASH_FIND(hlink, core->rec_hash, &id_array[i], sizeof(darshan_record_id), ref);
        if(ref)
852
        {
853
854
            /* we opened that record too, save the mod_flags */
            mod_flags[i] = ref->mod_flags;
855
856
857
        }
    }

858
859
860
    /* now allreduce so everyone agrees which files are shared and
     * which modules accessed them collectively
     */
861
862
    DARSHAN_MPI_CALL(PMPI_Allreduce)(mod_flags, global_mod_flags, tmp_cnt,
        MPI_UINT64_T, MPI_BAND, MPI_COMM_WORLD);
863

864
865
    j = 0;
    for(i=0; i<tmp_cnt; i++)
866
    {
867
        if(global_mod_flags[i] != 0)
868
        {
869
            (*shared_recs)[j++] = id_array[i];
870
871
872
873
874
875
876
877

            /* set global_mod_flags so we know which modules collectively
             * accessed this module. we need this info to support shared
             * file reductions
             */
            HASH_FIND(hlink, core->rec_hash, &id_array[i], sizeof(darshan_record_id), ref);
            assert(ref);
            ref->global_mod_flags = global_mod_flags[i];
878
879
        }
    }
880
    *shared_rec_cnt = j;
881

882
883
884
    return;
}

885
/* Collectively create and open the darshan log file for writing.
 *
 * MPI-IO hints are read from the environment variable named by
 * DARSHAN_LOG_HINTS_OVERRIDE if it is set, otherwise from the compiled-in
 * __DARSHAN_LOG_HINTS default.  The hint string is a ';'-separated list of
 * "key=value" pairs; entries with no '=' or with an empty key are ignored.
 *
 * logfile_name: path of the log file to create (MPI_MODE_EXCL is used, so
 *               the open fails if the file already exists)
 * log_fh:       output; MPI file handle for the opened log
 *
 * returns 0 on success, -1 if the log file could not be opened
 */
static int darshan_log_open_all(char *logfile_name, MPI_File *log_fh)
{
    char *hints;
    char *tok_str;
    char *key;
    char *value;
    char *saveptr = NULL;
    int ret;
    MPI_Info info;

    MPI_Info_create(&info);

    /* check environment variable to see if the default MPI file hints have
     * been overridden
     */
    hints = getenv(DARSHAN_LOG_HINTS_OVERRIDE);
    if(!hints)
    {
        hints = __DARSHAN_LOG_HINTS;
    }

    if(hints && strlen(hints) > 0)
    {
        /* strtok_r modifies its input, so tokenize a private copy */
        tok_str = strdup(hints);
        if(tok_str)
        {
            /* split string on semicolon */
            for(key = strtok_r(tok_str, ";", &saveptr);
                key != NULL;
                key = strtok_r(NULL, ";", &saveptr))
            {
                /* look for = sign splitting key/value pairs */
                value = strchr(key, '=');
                if(!value)
                    continue;

                /* break key and value into separate null terminated strings */
                value[0] = '\0';
                value++;
                if(strlen(key) > 0)
                    MPI_Info_set(info, key, value);
            }
            free(tok_str);
        }
    }

    /* open the darshan log file for writing */
    ret = DARSHAN_MPI_CALL(PMPI_File_open)(MPI_COMM_WORLD, logfile_name,
        MPI_MODE_CREATE | MPI_MODE_WRONLY | MPI_MODE_EXCL, info, log_fh);

    /* the info object is no longer needed whether or not the open
     * succeeded; release it before returning so it is not leaked on the
     * error path
     */
    MPI_Info_free(&info);

    if(ret != MPI_SUCCESS)
        return(-1);

    return(0);
}

946
static int darshan_deflate_buffer(void **pointers, int *lengths, int count,
Shane Snyder's avatar
Shane Snyder committed
947
    char *comp_buf, int *comp_buf_length)
948
949
950
951
952
953
{
    int ret = 0;
    int i;
    int total_target = 0;
    z_stream tmp_stream;

954
955
956
957
958
959
960
961
962
963
964
    /* just return if there is no data */
    for(i = 0; i < count; i++)
    {
        total_target += lengths[i];
    }
    if(total_target)
    {
        total_target = 0;
    }
    else
    {
Shane Snyder's avatar
Shane Snyder committed
965
        *comp_buf_length = 0;
966
967
968
        return(0);
    }

969
970
971
972
973
974
975
    memset(&tmp_stream, 0, sizeof(tmp_stream));
    tmp_stream.zalloc = Z_NULL;
    tmp_stream.zfree = Z_NULL;
    tmp_stream.opaque = Z_NULL;

    /* initialize the zlib compression parameters */
    /* TODO: check these parameters? */
Shane Snyder's avatar
Shane Snyder committed
976
977
978
//    ret = deflateInit2(&tmp_stream, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
//        15 + 16, 8, Z_DEFAULT_STRATEGY);
    ret = deflateInit(&tmp_stream, Z_DEFAULT_COMPRESSION);
979
980
981
982
983
    if(ret != Z_OK)
    {
        return(-1);
    }

Shane Snyder's avatar
Shane Snyder committed
984
    tmp_stream.next_out = (unsigned char *)comp_buf;
985
    tmp_stream.avail_out = DARSHAN_COMP_BUF_SIZE;
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027

    /* loop over the input pointers */
    for(i = 0; i < count; i++)
    {
        total_target += lengths[i];
        tmp_stream.next_in = pointers[i];
        tmp_stream.avail_in = lengths[i];
        /* while we have not finished consuming all of the data available to
         * this point in the loop
         */
        while(tmp_stream.total_in < total_target)
        {
            if(tmp_stream.avail_out == 0)
            {
                /* We ran out of buffer space for compression.  In theory,
                 * we could start using some of the file_array buffer space
                 * without having to malloc again.  In practice, this case 
                 * is going to be practically impossible to hit.
                 */
                deflateEnd(&tmp_stream);
                return(-1);
            }

            /* compress data */
            ret = deflate(&tmp_stream, Z_NO_FLUSH);
            if(ret != Z_OK)
            {
                deflateEnd(&tmp_stream);
                return(-1);
            }
        }
    }

    /* flush compression and end */
    ret = deflate(&tmp_stream, Z_FINISH);
    if(ret != Z_STREAM_END)
    {
        deflateEnd(&tmp_stream);
        return(-1);
    }
    deflateEnd(&tmp_stream);

Shane Snyder's avatar
Shane Snyder committed
1028
    *comp_buf_length = tmp_stream.total_out;
1029
1030
1031
    return(0);
}

1032
1033
1034
/* NOTE: the map written to file may contain duplicate id->name entries if a
 *       record is opened by multiple ranks, but not all ranks
 */
1035
static int darshan_log_write_record_hash(MPI_File log_fh, struct darshan_core_runtime *core,
1036
    uint64_t *inout_off)
1037
{
1038
1039
1040
1041
    int ret;
    struct darshan_core_record_ref *ref, *tmp;
    uint32_t name_len;
    size_t record_sz;
1042
    size_t hash_buf_sz = 0;
1043
1044
    char *hash_buf;
    char *hash_buf_off;
1045

1046
    /* allocate a buffer to store at most 64 bytes for each registered record */
1047
    /* NOTE: this buffer may be reallocated if estimate is too small */
1048
    hash_buf_sz = core->rec_hash_cnt * 64;
1049
1050
    hash_buf = malloc(hash_buf_sz);
    if(!hash_buf)
1051
    {
Shane Snyder's avatar
Shane Snyder committed
1052
        return(-1);
1053
1054
    }

1055
    /* serialize the record hash into a buffer for writing */
1056
    hash_buf_off = hash_buf;
1057
    HASH_ITER(hlink, core->rec_hash, ref, tmp)
1058
    {
1059
1060
1061
1062
        /* to avoid duplicate records, only rank 0 will write shared records */
        if(my_rank > 0 && ref->global_mod_flags)
            continue;

1063
        name_len = strlen(ref->rec.name);
1064
        record_sz = sizeof(darshan_record_id) + sizeof(uint32_t) + name_len;
1065
        /* make sure there is room in the buffer for this record */
1066
        if((hash_buf_off + record_sz) > (hash_buf + hash_buf_sz))
1067
        {
1068
            char *tmp_buf;
1069
1070
            size_t old_buf_sz;

1071
            /* if no room, reallocate the hash buffer at twice the current size */
1072
1073
1074
            old_buf_sz = hash_buf_off - hash_buf;
            hash_buf_sz *= 2;
            tmp_buf = malloc(hash_buf_sz);
1075
1076
            if(!tmp_buf)
            {
1077
                free(hash_buf);
Shane Snyder's avatar
Shane Snyder committed
1078
                return(-1);
1079
1080
            }

1081
1082
1083
1084
            memcpy(tmp_buf, hash_buf, old_buf_sz);
            free(hash_buf);
            hash_buf = tmp_buf;
            hash_buf_off = hash_buf + old_buf_sz;
1085
1086
        }

1087
1088
        /* now serialize the record into the hash buffer.
         * NOTE: darshan record hash serialization method: 
1089
1090
         *          ... darshan_record_id | (uint32_t) path_len | path ...
         */
1091
1092
1093
1094
1095
1096
        *((darshan_record_id *)hash_buf_off) = ref->rec.id;
        hash_buf_off += sizeof(darshan_record_id);
        *((uint32_t *)hash_buf_off) = name_len;
        hash_buf_off += sizeof(uint32_t);
        memcpy(hash_buf_off, ref->rec.name, name_len);
        hash_buf_off += name_len;
1097
    }
Shane Snyder's avatar
Shane Snyder committed
1098
    hash_buf_sz = hash_buf_off - hash_buf;
1099

1100
    /* collectively write out the record hash to the darshan log */
Shane Snyder's avatar
Shane Snyder committed
1101
    ret = darshan_log_append_all(log_fh, core, hash_buf, hash_buf_sz, inout_off);
1102

1103
    free(hash_buf);
1104

1105
    return(ret);
Shane Snyder's avatar
Shane Snyder committed
1106
1107
}

1108
1109
/* NOTE: inout_off contains the starting offset of this append at the beginning
 *       of the call, and contains the ending offset at the end of the call.
Shane Snyder's avatar
Shane Snyder committed
1110
 *       This variable is only valid on the root rank (rank 0).
Shane Snyder's avatar
Shane Snyder committed
1111
 */
1112
static int darshan_log_append_all(MPI_File log_fh, struct darshan_core_runtime *core,
Shane Snyder's avatar
Shane Snyder committed
1113
    void *buf, int count, uint64_t *inout_off)
Shane Snyder's avatar
Shane Snyder committed
1114
1115
1116
{
    MPI_Offset send_off, my_off;
    MPI_Status status;
1117
    int comp_buf_sz = 0;
Shane Snyder's avatar
Shane Snyder committed
1118
1119
    int ret;

1120
    /* compress the input buffer */
Shane Snyder's avatar
Shane Snyder committed
1121
    ret = darshan_deflate_buffer((void **)&buf, &count, 1,
1122
1123
1124
1125
        core->comp_buf, &comp_buf_sz);
    if(ret < 0)
        comp_buf_sz = 0;

1126
    /* figure out where everyone is writing using scan */
1127
    send_off = comp_buf_sz;
Shane Snyder's avatar
Shane Snyder committed
1128
    if(my_rank == 0)
1129
    {
1130
        send_off += *inout_off; /* rank 0 knows the beginning offset */
1131
    }
Shane Snyder's avatar
Shane Snyder committed
1132
1133
1134
1135

    DARSHAN_MPI_CALL(PMPI_Scan)(&send_off, &my_off, 1, MPI_OFFSET,
        MPI_SUM, MPI_COMM_WORLD);
    /* scan in inclusive; subtract local size back out */
1136
    my_off -= comp_buf_sz;
Shane Snyder's avatar
Shane Snyder committed
1137

1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
    if(ret == 0)
    {
        /* no compression errors, proceed with the collective write */
        ret = DARSHAN_MPI_CALL(PMPI_File_write_at_all)(log_fh, my_off,
            core->comp_buf, comp_buf_sz, MPI_BYTE, &status);
    }
    else
    {
        /* error during compression. preserve and return error to caller,
         * but participate in collective write to avoid deadlock.
         */
        (void)DARSHAN_MPI_CALL(PMPI_File_write_at_all)(log_fh, my_off,
            core->comp_buf, comp_buf_sz, MPI_BYTE, &status);
    }
Shane Snyder's avatar
Shane Snyder committed
1152

1153
1154
    if(nprocs > 1)
    {
Shane Snyder's avatar
Shane Snyder committed
1155
        /* send the ending offset from rank (n-1) to rank 0 */
1156
1157
        if(my_rank == (nprocs-1))
        {
1158
            my_off += comp_buf_sz;
1159
1160
1161
            DARSHAN_MPI_CALL(PMPI_Send)(&my_off, 1, MPI_OFFSET, 0, 0,
                MPI_COMM_WORLD);
        }
Shane Snyder's avatar
Shane Snyder committed
1162
        if(my_rank == 0)
1163
1164
1165
1166
        {
            DARSHAN_MPI_CALL(PMPI_Recv)(&my_off, 1, MPI_OFFSET, (nprocs-1), 0,
                MPI_COMM_WORLD, &status);

1167
            *inout_off = my_off;
1168
1169
1170
1171
        }
    }
    else
    {
1172
        *inout_off = my_off + comp_buf_sz;
1173
    }
Shane Snyder's avatar
Shane Snyder committed
1174

1175
1176
    if(ret != 0)
        return(-1);
Shane Snyder's avatar
Shane Snyder committed
1177
    return(0);
1178
1179
}

Shane Snyder's avatar
Shane Snyder committed
1180
1181
1182
1183
1184
1185
1186
1187
1188
/* Release all memory owned by a darshan core runtime at shutdown: every
 * entry in the record hash, every registered module slot, and finally the
 * core structure itself.  'core' must not be used after this call.
 */
static void darshan_core_cleanup(struct darshan_core_runtime* core)
{
    struct darshan_core_record_ref *next_ref, *cur_ref;
    int mod_idx;

    /* unlink and free each record reference; HASH_ITER is deletion-safe */
    HASH_ITER(hlink, core->rec_hash, cur_ref, next_ref)
    {
        HASH_DELETE(hlink, core->rec_hash, cur_ref);
        /* XXX MMAP:  free(ref->rec.name); */
        free(cur_ref);
    }

    /* free(NULL) is a no-op, so unregistered module slots need no guard */
    for(mod_idx = 0; mod_idx < DARSHAN_MAX_MODS; mod_idx++)
    {
        free(core->mod_array[mod_idx]);
        core->mod_array[mod_idx] = NULL;
    }

    free(core);
}

1207
/* ********************************************************* */
1208
1209

void darshan_core_register_module(
1210
    darshan_module_id mod_id,
1211
    struct darshan_module_funcs *funcs,
1212
1213
    void **mod_buf,
    int *mod_buf_size,
1214
    int *my_rank,
1215
    int *sys_mem_alignment)
1216
{
1217
1218
    int ret;
    int tmpval;
1219
    struct darshan_core_module* mod;
1220
    char *mod_mem_str = NULL;
1221
1222
1223

    *mod_buf_size = 0;
    *mod_buf = NULL;
1224

1225
    if(!darshan_core || (mod_id >= DARSHAN_MAX_MODS))
1226
        return;
1227

1228
1229
1230
    if(sys_mem_alignment)
        *sys_mem_alignment = darshan_mem_alignment;

1231
1232
1233
    /* get the calling process's rank */
    DARSHAN_MPI_CALL(PMPI_Comm_rank)(MPI_COMM_WORLD, my_rank);

1234
    /* see if this module is already registered */
1235
    DARSHAN_CORE_LOCK();
1236
    if(darshan_core->mod_array[mod_id])
1237
    {
1238
        /* if module is already registered just return */
1239
        DARSHAN_CORE_UNLOCK();
1240
        return;
1241
1242
    }

1243
    /* XXX MMAP: how do we assign size and address */
1244
    *mod_buf = darshan_core->