darshan-core.c 73 KB
Newer Older
1
/*
Shane Snyder's avatar
Shane Snyder committed
2
3
4
 * Copyright (C) 2015 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
5
6
 */

7
#define _XOPEN_SOURCE 500
8
#define _GNU_SOURCE
9

10
11
12
13
14
15
#include "darshan-runtime-config.h"

#include <stdio.h>
#ifdef HAVE_MNTENT_H
#include <mntent.h>
#endif
16
17
18
19
20
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include <pthread.h>
21
#include <fcntl.h>
Shane Snyder's avatar
Shane Snyder committed
22
#include <stdarg.h>
23
24
#include <dirent.h>
#include <sys/ioctl.h>
25
26
#include <sys/types.h>
#include <sys/stat.h>
27
#include <sys/mman.h>
28
#include <sys/time.h>
29
#include <sys/vfs.h>
30
#include <zlib.h>
31
#include <assert.h>
32

33
34
35
36
#ifdef HAVE_MPI
#include <mpi.h>
#endif

37
#include "uthash.h"
Shane Snyder's avatar
Shane Snyder committed
38
#include "darshan.h"
39
#include "darshan-core.h"
Shane Snyder's avatar
Shane Snyder committed
40
#include "darshan-dynamic.h"
Shane Snyder's avatar
Shane Snyder committed
41
#include "darshan-dxt.h"
42

43
#ifdef DARSHAN_LUSTRE
44
#include <lustre/lustre_user.h>
45
#endif
46

47
extern char* __progname;
48
extern char* __progname_full;
49

50
/* internal variable delcarations */
51
static struct darshan_core_runtime *darshan_core = NULL;
52
static pthread_mutex_t darshan_core_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
53
54
55
static int using_mpi = 0;
static int my_rank = 0;
static int nprocs = 1;
56
static int darshan_mem_alignment = 1;
Philip Carns's avatar
Philip Carns committed
57
static size_t darshan_mod_mem_quota = DARSHAN_MOD_MEM_MAX;
58

59
60
61
static struct darshan_core_mnt_data mnt_data_array[DARSHAN_MAX_MNTS];
static int mnt_data_count = 0;

62
/* paths prefixed with the following directories are not tracked by darshan;
 * the list is NULL-terminated
 */
char* darshan_path_exclusions[] = {
    "/etc/",
    "/dev/",
    "/usr/",
    "/bin/",
    "/boot/",
    "/lib/",
    "/opt/",
    "/sbin/",
    "/sys/",
    "/proc/",
    "/var/",
    NULL
};

/* paths prefixed with the following directories are tracked by darshan even if
 * they share a root with a path listed in darshan_path_exclusions; the list
 * is NULL-terminated
 */
char* darshan_path_inclusions[] = {
    "/var/opt/cray/dws/mounts/",
    NULL
};

/* allow users to override the path exclusions (NULL when no override is set) */
char** user_darshan_path_exclusions = NULL;

88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#ifdef DARSHAN_BGQ
/* BG/Q module initializer, defined by the BG/Q module */
extern void bgq_runtime_initialize();
#endif

/* NULL-terminated array of init functions for modules which need to be
 * statically initialized by darshan at startup time; darshan_core_initialize()
 * walks this table once the core runtime has been published
 */
void (*mod_static_init_fns[])(void) =
{
#ifdef DARSHAN_BGQ
    &bgq_runtime_initialize,
#endif
    NULL
};

103
104
105
106
107
108
109
#ifdef DARSHAN_LUSTRE
/* XXX need to use extern to get Lustre module's instrumentation function
 * since modules have no way of providing this to darshan-core
 */
extern void darshan_instrument_lustre_file(const char *filepath, int fd);
#endif

110
/* prototypes for internal helper functions */
111
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
112
113
static void *darshan_init_mmap_log(
    struct darshan_core_runtime* core, int jobid);
114
#endif
115
static void darshan_log_record_hints_and_ver(
116
    struct darshan_core_runtime* core);
117
118
static void darshan_get_exe_and_mounts(
    struct darshan_core_runtime *core, int argc, char **argv);
119
120
static void darshan_fs_info_from_path(
    const char *path, struct darshan_fs_info *fs_info);
121
static int darshan_add_name_record_ref(
122
    struct darshan_core_runtime *core, darshan_record_id rec_id,
123
    const char *name, darshan_module_id mod_id);
124
125
static void darshan_get_user_name(
    char *user);
126
#ifdef HAVE_MPI
127
static void darshan_get_shared_records(
128
129
    struct darshan_core_runtime *core, darshan_record_id **shared_recs,
    int *shared_rec_cnt);
130
#endif
131
static void darshan_get_logfile_name(
132
    char* logfile_name, struct darshan_core_runtime* core);
133
static int darshan_log_open(
134
135
    char *logfile_name, struct darshan_core_runtime *core,
    darshan_core_log_fh *log_fh);
136
static int darshan_log_write_job_record(
Philip Carns's avatar
Philip Carns committed
137
    darshan_core_log_fh log_fh, struct darshan_core_runtime *core,
138
139
140
141
142
143
144
145
146
147
148
149
150
    uint64_t *inout_off);
static int darshan_log_write_name_record_hash(
    darshan_core_log_fh log_fh, struct darshan_core_runtime *core,
    uint64_t *inout_off);
static int darshan_log_write_header(
    darshan_core_log_fh log_fh, struct darshan_core_runtime *core);
static int darshan_log_append(
    darshan_core_log_fh log_fh, struct darshan_core_runtime *core,
    void *buf, int count, uint64_t *inout_off);
void darshan_log_close(
    darshan_core_log_fh log_fh);
void darshan_log_finalize(
    char *logfile_name, double start_log_time);
151
static int darshan_deflate_buffer(
Shane Snyder's avatar
Shane Snyder committed
152
153
    void **pointers, int *lengths, int count, char *comp_buf,
    int *comp_buf_length);
Shane Snyder's avatar
Shane Snyder committed
154
155
static void darshan_core_cleanup(
    struct darshan_core_runtime* core);
156
static double darshan_core_wtime_absolute(void);
157

158
159
160
161
162
163
164
165
166
167
168
169
170
/* acquire/release the (recursive) mutex protecting the darshan_core pointer */
#define DARSHAN_CORE_LOCK() pthread_mutex_lock(&darshan_core_mutex)
#define DARSHAN_CORE_UNLOCK() pthread_mutex_unlock(&darshan_core_mutex)

/* print a printf-style warning on stderr, prefixed with
 * "darshan_library_warning: " and terminated with ".\n"
 */
#define DARSHAN_WARN(__err_str, ...) do { \
    darshan_core_fprintf(stderr, "darshan_library_warning: " \
        __err_str ".\n", ## __VA_ARGS__); \
} while(0)

/* DARSHAN_CHECK_ERR(ret, fmt, ...): bail out of the log writing sequence when
 * 'ret' is nonzero.
 *
 * The expansion refers to names that must exist at the point of use:
 * 'log_created' and 'logfile_name' (a log that was already created is
 * unlinked), an 'exit' label (jump target), and, in the MPI variant,
 * 'final_core' (its communicator is used to agree on the error).  'ret' must
 * be an int lvalue in the MPI variant because it is reduced in place.
 */
#ifdef HAVE_MPI

/* MPI variant of darshan logging helpers */
#define DARSHAN_CHECK_ERR(__ret, __err_str, ...) do { \
    if(using_mpi) { \
        /* logical OR across ranks: any failing rank fails everyone */ \
        PMPI_Allreduce(MPI_IN_PLACE, &__ret, 1, MPI_INT, MPI_LOR, final_core->mpi_comm); \
    } \
    if(__ret != 0) { \
        if(my_rank == 0) { \
            DARSHAN_WARN(__err_str, ## __VA_ARGS__); \
            if(log_created) { \
                unlink(logfile_name); \
            } \
        } \
        goto exit; \
    } \
} while(0)

#else

/* Non-MPI variant of darshan logging helpers */
#define DARSHAN_CHECK_ERR(__ret, __err_str, ...) do { \
    if(__ret != 0) { \
        DARSHAN_WARN(__err_str, ## __VA_ARGS__); \
        if(log_created) { \
            unlink(logfile_name); \
        } \
        goto exit; \
    } \
} while(0)

#endif

#endif
195

196
197
/* *********************************** */

Shane Snyder's avatar
Shane Snyder committed
198
void darshan_core_initialize(int argc, char **argv)
199
{
200
    struct darshan_core_runtime *init_core = NULL;
201
    int internal_timing_flag = 0;
202
    double init_start, init_time;
203
    char *envstr;
204
205
    char *jobid_str;
    int jobid;
206
    int ret;
207
    int i;
208
209
    int tmpval;
    double tmpfloat;
210

211
212
    /* setup darshan runtime if darshan is enabled and hasn't been initialized already */
    if (darshan_core != NULL || getenv("DARSHAN_DISABLE"))
213
        return;
214
215

    if(getenv("DARSHAN_INTERNAL_TIMING"))
216
    {
217
        internal_timing_flag = 1;
218
219
        init_start = darshan_core_wtime();
    }
220

221
222
223
224
225
    #if (__DARSHAN_MEM_ALIGNMENT < 1)
        #error Darshan must be configured with a positive value for --with-mem-align
    #endif
    envstr = getenv(DARSHAN_MEM_ALIGNMENT_OVERRIDE);
    if(envstr)
226
    {
227
228
229
        ret = sscanf(envstr, "%d", &tmpval);
        /* silently ignore if the env variable is set poorly */
        if(ret == 1 && tmpval > 0)
230
        {
231
            darshan_mem_alignment = tmpval;
232
        }
233
234
235
236
237
    }
    else
    {
        darshan_mem_alignment = __DARSHAN_MEM_ALIGNMENT;
    }
238

239
240
241
242
243
    /* avoid floating point errors on faulty input */
    if(darshan_mem_alignment < 1)
    {
        darshan_mem_alignment = 1;
    }
244

245
246
247
248
249
250
    /* Use DARSHAN_JOBID_OVERRIDE for the env var for __DARSHAN_JOBID */
    envstr = getenv(DARSHAN_JOBID_OVERRIDE);
    if(!envstr)
    {
        envstr = __DARSHAN_JOBID;
    }
251

252
253
254
255
256
257
258
259
260
261
262
263
    /* find a job id */
    jobid_str = getenv(envstr);
    if(jobid_str)
    {
        /* in cobalt we can find it in env var */
        ret = sscanf(jobid_str, "%d", &jobid);
    }
    if(!jobid_str || ret != 1)
    {
        /* use pid as fall back */
        jobid = getpid();
    }
264

265
266
267
268
269
270
271
    /* set the memory quota for darshan modules' records */
    envstr = getenv(DARSHAN_MOD_MEM_OVERRIDE);
    if(envstr)
    {
        ret = sscanf(envstr, "%lf", &tmpfloat);
        /* silently ignore if the env variable is set poorly */
        if(ret == 1 && tmpfloat > 0)
272
        {
273
            darshan_mod_mem_quota = tmpfloat * 1024 * 1024; /* convert from MiB */
274
        }
275
    }
276

277
278
279
280
281
    /* allocate structure to track darshan core runtime information */
    init_core = malloc(sizeof(*init_core));
    if(init_core)
    {
        memset(init_core, 0, sizeof(*init_core));
282

283
284
285
#ifdef HAVE_MPI
        PMPI_Initialized(&using_mpi);
        if(using_mpi)
286
        {
287
288
289
290
291
            PMPI_Comm_dup(MPI_COMM_WORLD, &init_core->mpi_comm);
            PMPI_Comm_size(init_core->mpi_comm, &nprocs);
            PMPI_Comm_rank(init_core->mpi_comm, &my_rank);
        }
#endif
292

Shane Snyder's avatar
Shane Snyder committed
293
294
295
296
297
        /* record absolute start time at startup so that we can later
         * generate relative times with this as a reference point.
         */
        init_core->wtime_offset = darshan_core_wtime_absolute();

298
    /* TODO: do we alloc new memory as we go or just do everything up front? */
299

300
#ifndef __DARSHAN_ENABLE_MMAP_LOGS
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
        /* just allocate memory for each log file region */
        init_core->log_hdr_p = malloc(sizeof(struct darshan_header));
        init_core->log_job_p = malloc(sizeof(struct darshan_job));
        init_core->log_exemnt_p = malloc(DARSHAN_EXE_LEN+1);
        init_core->log_name_p = malloc(DARSHAN_NAME_RECORD_BUF_SIZE);
        init_core->log_mod_p = malloc(darshan_mod_mem_quota);

        if(!(init_core->log_hdr_p) || !(init_core->log_job_p) ||
           !(init_core->log_exemnt_p) || !(init_core->log_name_p) ||
           !(init_core->log_mod_p))
        {
            free(init_core);
            return;
        }
        /* if allocation succeeds, zero fill memory regions */
        memset(init_core->log_hdr_p, 0, sizeof(struct darshan_header));
        memset(init_core->log_job_p, 0, sizeof(struct darshan_job));
        memset(init_core->log_exemnt_p, 0, DARSHAN_EXE_LEN+1);
        memset(init_core->log_name_p, 0, DARSHAN_NAME_RECORD_BUF_SIZE);
        memset(init_core->log_mod_p, 0, darshan_mod_mem_quota);
321
#else
322
323
324
325
326
327
328
329
330
        /* if mmap logs are enabled, we need to initialize the mmap region
         * before setting the corresponding log file region pointers
         */
        void *mmap_p = darshan_init_mmap_log(init_core, jobid);
        if(!mmap_p)
        {
            free(init_core);
            return;
        }
331

332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
        /* set the memory pointers for each log file region */
        init_core->log_hdr_p = (struct darshan_header *)mmap_p;
        init_core->log_job_p = (struct darshan_job *)
            ((char *)init_core->log_hdr_p + sizeof(struct darshan_header));
        init_core->log_exemnt_p = (char *)
            ((char *)init_core->log_job_p + sizeof(struct darshan_job));
        init_core->log_name_p = (void *)
            ((char *)init_core->log_exemnt_p + DARSHAN_EXE_LEN + 1);
        init_core->log_mod_p = (void *)
            ((char *)init_core->log_name_p + DARSHAN_NAME_RECORD_BUF_SIZE);

        /* set header fields needed for the mmap log mechanism */
        init_core->log_hdr_p->comp_type = DARSHAN_NO_COMP;
        init_core->log_hdr_p->name_map.off =
            ((char *)init_core->log_name_p - (char *)init_core->log_hdr_p);
347
348
#endif

349
350
351
        /* set known header fields for the log file */
        strcpy(init_core->log_hdr_p->version_string, DARSHAN_LOG_VERSION);
        init_core->log_hdr_p->magic_nr = DARSHAN_MAGIC_NR;
352

353
354
355
356
357
        /* set known job-level metadata fields for the log file */
        init_core->log_job_p->uid = getuid();
        init_core->log_job_p->start_time = time(NULL);
        init_core->log_job_p->nprocs = nprocs;
        init_core->log_job_p->jobid = (int64_t)jobid;
358

359
360
361
362
        /* if we are using any hints to write the log file, then record those
         * hints with the darshan job information
         */
        darshan_log_record_hints_and_ver(init_core);
363

364
365
        /* collect information about command line and mounted file systems */
        darshan_get_exe_and_mounts(init_core, argc, argv);
366

367
368
369
370
371
372
        /* determine if/when DXT should be enabled by looking for triggers */
        char *trigger_conf = getenv("DXT_TRIGGER_CONF_PATH");
        if(trigger_conf)
        {
            dxt_load_trigger_conf(trigger_conf);
        }
Shane Snyder's avatar
Shane Snyder committed
373

374
375
376
377
378
379
        /* if darshan was successfully initialized, set the global pointer
         * and bootstrap any modules with static initialization routines
         */
        DARSHAN_CORE_LOCK();
        darshan_core = init_core;
        DARSHAN_CORE_UNLOCK();
380

381
382
383
384
385
        i = 0;
        while(mod_static_init_fns[i])
        {
            (*mod_static_init_fns[i])();
            i++;
386
        }
387
388
    }

389
390
    if(internal_timing_flag)
    {
391
392
        init_time = darshan_core_wtime() - init_start;
#ifdef HAVE_MPI
393
        if(using_mpi)
394
        {
395
396
397
398
399
400
401
402
403
404
405
            if(my_rank == 0)
            {
                PMPI_Reduce(MPI_IN_PLACE, &init_time, 1,
                    MPI_DOUBLE, MPI_MAX, 0, darshan_core->mpi_comm);
            }
            else
            {
                PMPI_Reduce(&init_time, &init_time, 1,
                    MPI_DOUBLE, MPI_MAX, 0, darshan_core->mpi_comm);
                return; /* return early so every rank doesn't print */
            }
406
        }
407
408
409
410
#endif

        darshan_core_fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
        darshan_core_fprintf(stderr, "darshan:init\t%d\t%f\n", nprocs, init_time);
411
412
413
414
415
    }

    return;
}

Shane Snyder's avatar
Shane Snyder committed
416
void darshan_core_shutdown()
417
{
418
    struct darshan_core_runtime *final_core;
419
    double start_log_time;
420
    int internal_timing_flag = 0;
421
422
423
    double open1 = 0, open2 = 0;
    double job1 = 0, job2 = 0;
    double rec1 = 0, rec2 = 0;
424
425
    double mod1[DARSHAN_MAX_MODS] = {0};
    double mod2[DARSHAN_MAX_MODS] = {0};
426
    double header1 = 0, header2 = 0;
427
428
    double tm_end;
    int active_mods[DARSHAN_MAX_MODS] = {0};
429
    uint64_t gz_fp = 0;
430
431
432
433
434
    char *logfile_name = NULL;
    darshan_core_log_fh log_fh;
    int log_created = 0;
    int i;
    int ret;
435

Shane Snyder's avatar
Shane Snyder committed
436
    /* disable darhan-core while we shutdown */
437
    DARSHAN_CORE_LOCK();
438
    if(!darshan_core)
439
    {
440
        DARSHAN_CORE_UNLOCK();
441
442
        return;
    }
443
444
    final_core = darshan_core;
    darshan_core = NULL;
445
446
    DARSHAN_CORE_UNLOCK();

447
448
449
450
451
452
453
    /* NOTE: from this point on, this function must use
     * darshan_core_wtime_absolute() rather than darshan_core_wtime() to
     * collect timestamps for internal timing calculations.  The former no
     * longer works because it relies on runtime state to calculate
     * timestamps relative to job start.
     */

454
455
456
457
    /* grab some initial timing information */
#ifdef HAVE_MPI
    /* if using mpi, sync across procs first */
    if(using_mpi)
458
        PMPI_Barrier(final_core->mpi_comm);
459
#endif
460
    start_log_time = darshan_core_wtime_absolute();
461
462
463
464
465
    final_core->log_job_p->end_time = time(NULL);

    if(getenv("DARSHAN_INTERNAL_TIMING"))
        internal_timing_flag = 1;

466
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
467
468
469
470
471
    /* remove the temporary mmap log files */
    /* NOTE: this unlink is not immediate as it must wait for the mapping
     * to no longer be referenced, which in our case happens when the
     * executable exits. If the application terminates mid-shutdown, then
     * there will be no mmap files and no final log file.
472
     */
473
    unlink(final_core->mmap_log_name);
474
#endif
Shane Snyder's avatar
Shane Snyder committed
475

476
    final_core->comp_buf = malloc(darshan_mod_mem_quota);
477
    logfile_name = malloc(PATH_MAX);
478
479
    if(!final_core->comp_buf || !logfile_name)
        goto exit;
480

481
    /* set which modules were used locally */
482
    for(i = 0; i < DARSHAN_MAX_MODS; i++)
483
    {
484
        if(final_core->mod_array[i])
485
            active_mods[i] = 1;
486
487
    }

488
489
490
491
#ifdef HAVE_MPI
    darshan_record_id *shared_recs = NULL;
    darshan_record_id *mod_shared_recs = NULL;
    int shared_rec_cnt = 0;
492

493
    if(using_mpi)
494
    {
495
496
        /* allreduce locally active mods to determine globally active mods */
        PMPI_Allreduce(MPI_IN_PLACE, active_mods, DARSHAN_MAX_MODS, MPI_INT,
497
            MPI_SUM, final_core->mpi_comm);
498

499
        /* reduce to report first start and last end time across all ranks at rank 0 */
500
        if(my_rank == 0)
501
        {
502
503
504
505
            PMPI_Reduce(MPI_IN_PLACE, &final_core->log_job_p->start_time,
                1, MPI_INT64_T, MPI_MIN, 0, final_core->mpi_comm);
            PMPI_Reduce(MPI_IN_PLACE, &final_core->log_job_p->end_time,
                1, MPI_INT64_T, MPI_MAX, 0, final_core->mpi_comm);
506
507
508
        }
        else
        {
509
510
511
512
513
514
            PMPI_Reduce(&final_core->log_job_p->start_time,
                &final_core->log_job_p->start_time,
                1, MPI_INT64_T, MPI_MIN, 0, final_core->mpi_comm);
            PMPI_Reduce(&final_core->log_job_p->end_time,
                &final_core->log_job_p->end_time,
                1, MPI_INT64_T, MPI_MAX, 0, final_core->mpi_comm);
515
516
        }

517
518
        /* get a list of records which are shared across all processes */
        darshan_get_shared_records(final_core, &shared_recs, &shared_rec_cnt);
519

520
521
        mod_shared_recs = malloc(shared_rec_cnt * sizeof(darshan_record_id));
        assert(mod_shared_recs);
522
    }
523
#endif
524

525
    /* get the log file name */
526
    darshan_get_logfile_name(logfile_name, final_core);
527
528
529
    if(strlen(logfile_name) == 0)
    {
        /* failed to generate log file name */
530
        goto exit;
531
532
533
    }

    if(internal_timing_flag)
534
        open1 = darshan_core_wtime_absolute();
535
    /* open the darshan log file */
536
    ret = darshan_log_open(logfile_name, final_core, &log_fh);
537
    if(internal_timing_flag)
538
        open2 = darshan_core_wtime_absolute();
539
    /* error out if unable to open log file */
540
541
    DARSHAN_CHECK_ERR(ret, "unable to create log file %s", logfile_name);
    log_created = 1;
542
543

    if(internal_timing_flag)
544
        job1 = darshan_core_wtime_absolute();
545
546
    /* write the the compressed darshan job information */
    ret = darshan_log_write_job_record(log_fh, final_core, &gz_fp);
547
    if(internal_timing_flag)
548
        job2 = darshan_core_wtime_absolute();
549
550
    /* error out if unable to write job information */
    DARSHAN_CHECK_ERR(ret, "unable to write job record to file %s", logfile_name);
551
552

    if(internal_timing_flag)
553
        rec1 = darshan_core_wtime_absolute();
554
    /* write the record name->id hash to the log file */
555
    final_core->log_hdr_p->name_map.off = gz_fp;
556
    ret = darshan_log_write_name_record_hash(log_fh, final_core, &gz_fp);
557
    if(internal_timing_flag)
558
        rec2 = darshan_core_wtime_absolute();
559
560
561
    final_core->log_hdr_p->name_map.len = gz_fp - final_core->log_hdr_p->name_map.off;
    /* error out if unable to write name records */
    DARSHAN_CHECK_ERR(ret, "unable to write name records to log file %s", logfile_name);
562
563
564
565
566

    /* loop over globally used darshan modules and:
     *      - get final output buffer
     *      - compress (zlib) provided output buffer
     *      - append compressed buffer to log file
567
     *      - add module map info (file offset/length) to log header
568
569
570
571
572
573
574
575
     *      - shutdown the module
     */
    for(i = 0; i < DARSHAN_MAX_MODS; i++)
    {
        struct darshan_core_module* this_mod = final_core->mod_array[i];
        void* mod_buf = NULL;
        int mod_buf_sz = 0;

576
        if(!active_mods[i])
577
        {
578
579
            final_core->log_hdr_p->mod_map[i].off = 0;
            final_core->log_hdr_p->mod_map[i].len = 0;
580
581
582
583
            continue;
        }

        if(internal_timing_flag)
584
            mod1[i] = darshan_core_wtime_absolute();
585

586
        /* if module is registered locally, perform module shutdown operations */
587
588
        if(this_mod)
        {
589
590
            mod_buf = final_core->mod_array[i]->rec_buf_start;
            mod_buf_sz = final_core->mod_array[i]->rec_buf_p - mod_buf;
591
592
593

#ifdef HAVE_MPI
            if(using_mpi)
594
            {
595
596
597
598
599
600
                struct darshan_core_name_record_ref *ref = NULL;
                int mod_shared_rec_cnt = 0;
                int j;

                /* set the shared record list for this module */
                for(j = 0; j < shared_rec_cnt; j++)
601
                {
602
603
604
605
606
607
608
                    HASH_FIND(hlink, final_core->name_hash, &shared_recs[j],
                        sizeof(darshan_record_id), ref);
                    assert(ref);
                    if(DARSHAN_MOD_FLAG_ISSET(ref->global_mod_flags, i))
                    {
                        mod_shared_recs[mod_shared_rec_cnt++] = shared_recs[j];
                    }
609
610
                }

611
                /* allow the module an opportunity to reduce shared files */
612
                if(this_mod->mod_funcs.mod_redux_func && (mod_shared_rec_cnt > 0) &&
613
614
615
616
                   (!getenv("DARSHAN_DISABLE_SHARED_REDUCTION")))
                    this_mod->mod_funcs.mod_redux_func(mod_buf, final_core->mpi_comm,
                        mod_shared_recs, mod_shared_rec_cnt);
            }
617
#endif
618

619
            /* get the final output buffer */
620
            this_mod->mod_funcs.mod_shutdown_func(&mod_buf, &mod_buf_sz);
621
622
623
        }

        /* append this module's data to the darshan log */
624
        final_core->log_hdr_p->mod_map[i].off = gz_fp;
625
        ret = darshan_log_append(log_fh, final_core, mod_buf, mod_buf_sz, &gz_fp);
626
627
        final_core->log_hdr_p->mod_map[i].len =
            gz_fp - final_core->log_hdr_p->mod_map[i].off;
628

629
630
631
632
633
634
        /* XXX: DXT manages its own module memory buffers, so we need to
         * explicitly free them
         */
        if(i == DXT_POSIX_MOD || i == DXT_MPIIO_MOD)
            free(mod_buf);

635
        if(internal_timing_flag)
636
            mod2[i] = darshan_core_wtime_absolute();
637

638
639
640
        /* error out if unable to write module data */
        DARSHAN_CHECK_ERR(ret, "unable to write %s module data to log file %s",
            darshan_module_names[i], logfile_name);
641
    }
642
643

    if(internal_timing_flag)
644
        header1 = darshan_core_wtime_absolute();
645
646
    ret = darshan_log_write_header(log_fh, final_core);
    if(internal_timing_flag)
647
        header2 = darshan_core_wtime_absolute();
648
    DARSHAN_CHECK_ERR(ret, "unable to write header to file %s", logfile_name);
649

650
651
    /* done writing data, close the log file */
    darshan_log_close(log_fh);
652

653
654
    /* finalize log file name and permissions */
    darshan_log_finalize(logfile_name, start_log_time);
655

656
    if(internal_timing_flag)
657
    {
658
659
660
661
662
663
        double open_tm;
        double header_tm;
        double job_tm;
        double rec_tm;
        double mod_tm[DARSHAN_MAX_MODS];
        double all_tm;
664

665
        tm_end = darshan_core_wtime_absolute();
666

667
668
669
670
        open_tm = open2 - open1;
        header_tm = header2 - header1;
        job_tm = job2 - job1;
        rec_tm = rec2 - rec1;
671
        all_tm = tm_end - start_log_time;
672
        for(i = 0; i < DARSHAN_MAX_MODS; i++)
673
674
675
        {
            mod_tm[i] = mod2[i] - mod1[i];
        }
676

677
678
#ifdef HAVE_MPI
        if(using_mpi)
679
        {
680
            if(my_rank == 0)
681
            {
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
                PMPI_Reduce(MPI_IN_PLACE, &open_tm, 1,
                    MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
                PMPI_Reduce(MPI_IN_PLACE, &header_tm, 1,
                    MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
                PMPI_Reduce(MPI_IN_PLACE, &job_tm, 1,
                    MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
                PMPI_Reduce(MPI_IN_PLACE, &rec_tm, 1,
                    MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
                PMPI_Reduce(MPI_IN_PLACE, &all_tm, 1,
                    MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
                PMPI_Reduce(MPI_IN_PLACE, mod_tm, DARSHAN_MAX_MODS,
                    MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
            }
            else
            {
                PMPI_Reduce(&open_tm, &open_tm, 1,
                    MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
                PMPI_Reduce(&header_tm, &header_tm, 1,
                    MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
                PMPI_Reduce(&job_tm, &job_tm, 1,
                    MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
                PMPI_Reduce(&rec_tm, &rec_tm, 1,
                    MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
                PMPI_Reduce(&all_tm, &all_tm, 1,
                    MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);
                PMPI_Reduce(mod_tm, mod_tm, DARSHAN_MAX_MODS,
                    MPI_DOUBLE, MPI_MAX, 0, final_core->mpi_comm);

                /* let rank 0 report the timing info */
711
                goto exit;
712
            }
713
        }
714
#endif
715

716
717
718
719
720
721
        darshan_core_fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
        darshan_core_fprintf(stderr, "darshan:log_open\t%d\t%f\n", nprocs, open_tm);
        darshan_core_fprintf(stderr, "darshan:job_write\t%d\t%f\n", nprocs, job_tm);
        darshan_core_fprintf(stderr, "darshan:hash_write\t%d\t%f\n", nprocs, rec_tm);
        darshan_core_fprintf(stderr, "darshan:header_write\t%d\t%f\n", nprocs, header_tm);
        for(i = 0; i < DARSHAN_MAX_MODS; i++)
722
        {
723
724
725
            if(active_mods[i])
                darshan_core_fprintf(stderr, "darshan:%s_shutdown\t%d\t%f\n",
                    darshan_module_names[i], nprocs, mod_tm[i]);
726
        }
727
        darshan_core_fprintf(stderr, "darshan:core_shutdown\t%d\t%f\n", nprocs, all_tm);
728
    }
729

730
731
exit:
#ifdef HAVE_MPI
732
733
734
735
736
    if(using_mpi)
    {
        free(shared_recs);
        free(mod_shared_recs);
    }
737
738
739
740
#endif
    free(logfile_name);
    darshan_core_cleanup(final_core);

741
742
    return;
}
743

Shane Snyder's avatar
Shane Snyder committed
744
/* *********************************** */
745

746
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
747
748
749
750
static void *darshan_init_mmap_log(struct darshan_core_runtime* core, int jobid)
{
    int ret;
    int mmap_fd;
Philip Carns's avatar
Philip Carns committed
751
    size_t mmap_size;
752
753
    int sys_page_size;
    char cuser[L_cuserid] = {0};
754
755
756
    uint64_t hlevel;
    char hname[HOST_NAME_MAX];
    uint64_t logmod;
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
    char *envstr;
    char *mmap_log_path;
    void *mmap_p;

    sys_page_size = sysconf(_SC_PAGESIZE);
    assert(sys_page_size > 0);

    mmap_size = sizeof(struct darshan_header) + DARSHAN_JOB_RECORD_SIZE +
        + DARSHAN_NAME_RECORD_BUF_SIZE + darshan_mod_mem_quota;
    if(mmap_size % sys_page_size)
        mmap_size = ((mmap_size / sys_page_size) + 1) * sys_page_size;

    envstr = getenv(DARSHAN_MMAP_LOG_PATH_OVERRIDE);
    if(envstr)
        mmap_log_path = envstr;
    else
        mmap_log_path = DARSHAN_DEF_MMAP_LOG_PATH;

    darshan_get_user_name(cuser);

777
778
779
780
781
782
783
    /* generate a random number to help differentiate the temporary log */
    /* NOTE: job id is not sufficient for constructing a unique log file name,
     * since a job could be composed of multiple application runs, so we also
     * add a random number component to the log name
     */
    if(my_rank == 0)
    {
784
        hlevel = darshan_core_wtime_absolute() * 1000000;
785
786
787
        (void)gethostname(hname, sizeof(hname));
        logmod = darshan_hash((void*)hname,strlen(hname),hlevel);
    }
788
789
790
791
#ifdef HAVE_MPI
    if(using_mpi)
        PMPI_Bcast(&logmod, 1, MPI_UINT64_T, 0, core->mpi_comm);
#endif
792

793
794
795
796
    /* construct a unique temporary log file name for this process
     * to write mmap log data to
     */
    snprintf(core->mmap_log_name, PATH_MAX,
797
798
        "/%s/%s_%s_id%d_mmap-log-%" PRIu64 "-%d.darshan",
        mmap_log_path, cuser, __progname, jobid, logmod, my_rank);
799
800
801
802
803

    /* create the temporary mmapped darshan log */
    mmap_fd = open(core->mmap_log_name, O_CREAT|O_RDWR|O_EXCL , 0644);
    if(mmap_fd < 0)
    {
804
        darshan_core_fprintf(stderr, "darshan library warning: "
805
806
807
808
809
810
811
812
813
            "unable to create darshan log file %s\n", core->mmap_log_name);
        return(NULL);
    }

    /* TODO: ftruncate or just zero fill? */
    /* allocate the necessary space in the log file */
    ret = ftruncate(mmap_fd, mmap_size);
    if(ret < 0)
    {
814
        darshan_core_fprintf(stderr, "darshan library warning: "
815
816
817
818
819
820
821
822
823
824
825
826
            "unable to allocate darshan log file %s\n", core->mmap_log_name);
        close(mmap_fd);
        unlink(core->mmap_log_name);
        return(NULL);
    }

    /* create the memory map for darshan's data structures so they are
     * persisted to file as the application executes
     */
    mmap_p = mmap(NULL, mmap_size, PROT_WRITE, MAP_SHARED, mmap_fd, 0);
    if(mmap_p == MAP_FAILED)
    {
827
        darshan_core_fprintf(stderr, "darshan library warning: "
828
829
830
831
832
833
834
835
836
837
838
            "unable to mmap darshan log file %s\n", core->mmap_log_name);
        close(mmap_fd);
        unlink(core->mmap_log_name);
        return(NULL);
    }

    /* close darshan log file (this does *not* unmap the log file) */
    close(mmap_fd);

    return(mmap_p);
}
839
#endif
840

841
/* record any hints used to write the darshan log in the job data */
842
static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* core)
843
844
{
    char* hints;
845
    char* job_hints;
846
847
848
849
850
851
    int meta_remain = 0;
    char* m;

    /* check environment variable to see if the default MPI file hints have
     * been overridden
     */
852
    hints = getenv(DARSHAN_LOG_HINTS_OVERRIDE);
853
854
    if(!hints)
    {
855
        hints = __DARSHAN_LOG_HINTS;
856
857
858
859
860
    }

    if(!hints || strlen(hints) < 1)
        return;

861
862
    job_hints = strdup(hints);
    if(!job_hints)
863
864
865
        return;

    meta_remain = DARSHAN_JOB_METADATA_LEN -
866
        strlen(core->log_job_p->metadata) - 1;
867
868
    if(meta_remain >= (strlen(PACKAGE_VERSION) + 9))
    {
869
        sprintf(core->log_job_p->metadata, "lib_ver=%s\n", PACKAGE_VERSION);
870
871
        meta_remain -= (strlen(PACKAGE_VERSION) + 9);
    }
872
    if(meta_remain >= (3 + strlen(job_hints)))
873
    {
874
        m = core->log_job_p->metadata + strlen(core->log_job_p->metadata);
875
        /* We have room to store the hints in the metadata portion of
876
         * the job structure.  We just prepend an h= to the hints list.  The
877
878
879
         * metadata parser will ignore = characters that appear in the value
         * portion of the metadata key/value pair.
         */
880
        sprintf(m, "h=%s\n", job_hints);
881
    }
882
    free(job_hints);
883
884
885
886

    return;
}

887
888
/* Comparison callback for struct darshan_core_mnt_data entries.
 *
 * Orders entries by the length of their mount path, longest first:
 * returns -1 when a's path is longer than b's, 1 when it is shorter,
 * and 0 when both paths have the same length.
 */
static int mnt_data_cmp(const void* a, const void* b)
{
    const struct darshan_core_mnt_data *lhs = a;
    const struct darshan_core_mnt_data *rhs = b;
    size_t lhs_len = strlen(lhs->path);
    size_t rhs_len = strlen(rhs->path);

    if(lhs_len == rhs_len)
        return(0);

    /* longer paths sort ahead of shorter ones */
    return((lhs_len > rhs_len) ? -1 : 1);
}

/* adds an entry to table of mounted file systems */
901
static void add_entry(char* buf, int* space_left, struct mntent* entry)
902
{
903
    int i;
904
905
906
907
    int ret;
    char tmp_mnt[256];
    struct statfs statfsbuf;

908
909
910
911
912
913
914
915
916
917
    /* avoid adding the same mount points multiple times -- to limit
     * storage space and potential statfs, ioctl, etc calls
     */
    for(i = 0; i < mnt_data_count; i++)
    {
        if((strncmp(mnt_data_array[i].path, entry->mnt_dir, DARSHAN_MAX_MNT_PATH) == 0) &&
           (strncmp(mnt_data_array[i].type, entry->mnt_type, DARSHAN_MAX_MNT_PATH) == 0))
            return;
    }

918
919
920
921
    strncpy(mnt_data_array[mnt_data_count].path, entry->mnt_dir,
        DARSHAN_MAX_MNT_PATH-1);
    strncpy(mnt_data_array[mnt_data_count].type, entry->mnt_type,
        DARSHAN_MAX_MNT_TYPE-1);
Philip Carns's avatar
Philip Carns committed
922
923
924
    /* NOTE: we now try to detect the preferred block size for each file
     * system using fstatfs().  On Lustre we assume a size of 1 MiB
     * because fstatfs() reports 4 KiB.
925
926
927
928
929
     */
#ifndef LL_SUPER_MAGIC
#define LL_SUPER_MAGIC 0x0BD00BD0
#endif
    ret = statfs(entry->mnt_dir, &statfsbuf);
930
    mnt_data_array[mnt_data_count].fs_info.fs_type = statfsbuf.f_type;
931
    if(ret == 0 && statfsbuf.f_type != LL_SUPER_MAGIC)
932
        mnt_data_array[mnt_data_count].fs_info.block_size = statfsbuf.f_bsize;
933
    else if(ret == 0 && statfsbuf.f_type == LL_SUPER_MAGIC)
934
        mnt_data_array[mnt_data_count].fs_info.block_size = 1024*1024;
935
    else
936
        mnt_data_array[mnt_data_count].fs_info.block_size = 4096;
937

938
#ifdef DARSHAN_LUSTRE
939
940
941
942
943
944
945
946
947
948
    /* attempt to retrieve OST and MDS counts from Lustre */
    mnt_data_array[mnt_data_count].fs_info.ost_count = -1;
    mnt_data_array[mnt_data_count].fs_info.mdt_count = -1;
    if ( statfsbuf.f_type == LL_SUPER_MAGIC )
    {
        int n_ost, n_mdt;
        int ret_ost, ret_mdt;
        DIR *mount_dir;

        mount_dir = opendir( entry->mnt_dir );
Philip Carns's avatar
Philip Carns committed
949
        if ( mount_dir  )
950
951
952
953
954
955
956
957
        {
            /* n_ost and n_mdt are used for both input and output to ioctl */
            n_ost = 0;
            n_mdt = 1;

            ret_ost = ioctl( dirfd(mount_dir), LL_IOC_GETOBDCOUNT, &n_ost );
            ret_mdt = ioctl( dirfd(mount_dir), LL_IOC_GETOBDCOUNT, &n_mdt );

958
            if ( !(ret_ost < 0 || ret_mdt < 0) )
959
960
961
962
963
964
965
            {
                mnt_data_array[mnt_data_count].fs_info.ost_count = n_ost;
                mnt_data_array[mnt_data_count].fs_info.mdt_count = n_mdt;
            }
            closedir( mount_dir );
        }
    }
966
#endif
967

968
    /* store mount information with the job-level metadata in darshan log */
969
    ret = snprintf(tmp_mnt, 256, "\n%s\t%s",
970
971
972
        entry->mnt_type, entry->mnt_dir);
    if(ret < 256 && strlen(tmp_mnt) <= (*space_left))
    {
973
        strcat(buf, tmp_mnt);
974
975
976
977
978
979
980
        (*space_left) -= strlen(tmp_mnt);
    }

    mnt_data_count++;
    return;
}

981
/* darshan_get_exe_and_mounts()
 *
 * collects command line and list of mounted file systems into a string that
 * will be stored with the job-level metadata
 */
986
static void darshan_get_exe_and_mounts(struct darshan_core_runtime *core,
987
    int argc, char **argv)
988
989
990
991
{
    FILE* tab;
    struct mntent *entry;
    char* exclude;
992
993
994
    char* truncate_string = "<TRUNCATED>";
    int truncate_offset;
    int space_left = DARSHAN_EXE_LEN;
995
996
997
    FILE *fh;
    int i, ii;
    char cmdl[DARSHAN_EXE_LEN];
998
999
    int tmp_index = 0;
    int skip = 0;
1000
1001
1002
    char* env_exclusions;
    char* string;
    char* token;
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021

    /* skip these fs types */
    static char* fs_exclusions[] = {
        "tmpfs",
        "proc",
        "sysfs",
        "devpts",
        "binfmt_misc",
        "fusectl",
        "debugfs",
        "securityfs",
        "nfsd",
        "none",
        "rpc_pipefs",
        "hugetlbfs",
        "cgroup",
        NULL
    };

1022
1023
1024
1025
1026
1027
1028
1029
    /* Check if user has set the env variable DARSHAN_EXCLUDE_DIRS */
    env_exclusions = getenv("DARSHAN_EXCLUDE_DIRS");
    if(env_exclusions)
    {
        fs_exclusions[0]=NULL;
        /* if DARSHAN_EXCLUDE_DIRS=none, do not exclude any dir */
        if(strncmp(env_exclusions,"none",strlen(env_exclusions))>=0)
        {
Philip Carns's avatar
Philip Carns committed
1030
            if (my_rank == 0)
1031
                darshan_core_fprintf(stderr, "Darshan info: no system dirs will be excluded\n");
1032
1033
1034
1035
            darshan_path_exclusions[0]=NULL;
        }
        else
        {
Philip Carns's avatar
Philip Carns committed
1036
            if (my_rank == 0)
1037
                darshan_core_fprintf(stderr, "Darshan info: the following system dirs will be excluded: %s\n",
Shane Snyder's avatar
Shane Snyder committed
1038
                    env_exclusions);
1039
            string = strdup(env_exclusions);
1040
1041
            i = 0;
            /* get the comma separated number of directories */
1042
1043
            token = strtok(string, ",");
            while (token != NULL)
1044
            {
1045
1046
                i++;
                token = strtok(NULL, ",");
1047
1048
            }
            user_darshan_path_exclusions=(char **)malloc((i+1)*sizeof(char *