/*
 * Copyright (C) 2015 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
 */

#define _XOPEN_SOURCE 500
#define _GNU_SOURCE

#include "darshan-runtime-config.h"

#include <stdio.h>
#ifdef HAVE_MNTENT_H
#include <mntent.h>
#endif
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include <pthread.h>
#include <fcntl.h>
#include <stdarg.h>
#include <dirent.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/vfs.h>
#include <zlib.h>
#include <assert.h>

#ifdef HAVE_MPI
#include <mpi.h>
#endif

#include "uthash.h"
#include "darshan.h"
#include "darshan-core.h"
#include "darshan-dynamic.h"

#ifdef DARSHAN_LUSTRE
#include <lustre/lustre_user.h>
#endif

extern char* __progname;
47
extern char* __progname_full;
48

49
/* internal variable delcarations */
50
static struct darshan_core_runtime *darshan_core = NULL;
51
static pthread_mutex_t darshan_core_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
52 53 54
static int using_mpi = 0;
static int my_rank = 0;
static int nprocs = 1;
55
static int darshan_mem_alignment = 1;
56
static long darshan_mod_mem_quota = DARSHAN_MOD_MEM_MAX;
57

58 59 60
static struct darshan_core_mnt_data mnt_data_array[DARSHAN_MAX_MNTS];
static int mnt_data_count = 0;

61
/* paths prefixed with the following directories are not tracked by darshan;
 * the list is NULL-terminated
 */
char* darshan_path_exclusions[] = {
    "/etc/",
    "/dev/",
    "/usr/",
    "/bin/",
    "/boot/",
    "/lib/",
    "/opt/",
    "/sbin/",
    "/sys/",
    "/proc/",
    "/var/",
    NULL
};
/* paths prefixed with the following directories are tracked by darshan even if
 * they share a root with a path listed in darshan_path_exclusions; the list
 * is NULL-terminated
 */
char* darshan_path_inclusions[] = {
    "/var/opt/cray/dws/mounts/",
    NULL
};

/* allow users to override the path exclusions */
char** user_darshan_path_exclusions = NULL;

#ifdef DARSHAN_BGQ
extern void bgq_runtime_initialize();
#endif

/* array of init functions for modules which need to be statically
 * initialized by darshan at startup time; the array is NULL-terminated and
 * is walked by darshan_core_initialize() once the core runtime is set up
 */
void (*mod_static_init_fns[])(void) =
{
#ifdef DARSHAN_BGQ
    &bgq_runtime_initialize,
#endif
    NULL
};

#ifdef DARSHAN_LUSTRE
/* XXX need to use extern to get Lustre module's instrumentation function
 * since modules have no way of providing this to darshan-core
 */
extern void darshan_instrument_lustre_file(const char *filepath, int fd);
#endif

/* prototypes for internal helper functions */
110
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
111 112
static void *darshan_init_mmap_log(
    struct darshan_core_runtime* core, int jobid);
113
#endif
114
static void darshan_log_record_hints_and_ver(
115
    struct darshan_core_runtime* core);
116 117
static void darshan_get_exe_and_mounts(
    struct darshan_core_runtime *core, int argc, char **argv);
118 119
static void darshan_fs_info_from_path(
    const char *path, struct darshan_fs_info *fs_info);
120
static int darshan_add_name_record_ref(
121
    struct darshan_core_runtime *core, darshan_record_id rec_id,
122
    const char *name, darshan_module_id mod_id);
123 124
static void darshan_get_user_name(
    char *user);
125
#ifdef HAVE_MPI
126
static void darshan_get_shared_records(
127 128
    struct darshan_core_runtime *core, darshan_record_id **shared_recs,
    int *shared_rec_cnt);
129
#endif
130 131 132 133 134
static void darshan_get_logfile_name(
    char* logfile_name, int jobid, time_t start_time);
static int darshan_log_open(
    char *logfile_name, darshan_core_log_fh *log_fh);
static int darshan_log_write_job_record(
Philip Carns's avatar
Philip Carns committed
135
    darshan_core_log_fh log_fh, struct darshan_core_runtime *core,
136 137 138 139 140 141 142 143 144 145 146 147 148
    uint64_t *inout_off);
static int darshan_log_write_name_record_hash(
    darshan_core_log_fh log_fh, struct darshan_core_runtime *core,
    uint64_t *inout_off);
static int darshan_log_write_header(
    darshan_core_log_fh log_fh, struct darshan_core_runtime *core);
static int darshan_log_append(
    darshan_core_log_fh log_fh, struct darshan_core_runtime *core,
    void *buf, int count, uint64_t *inout_off);
void darshan_log_close(
    darshan_core_log_fh log_fh);
void darshan_log_finalize(
    char *logfile_name, double start_log_time);
149
static int darshan_deflate_buffer(
Shane Snyder's avatar
Shane Snyder committed
150 151
    void **pointers, int *lengths, int count, char *comp_buf,
    int *comp_buf_length);
Shane Snyder's avatar
Shane Snyder committed
152 153
static void darshan_core_cleanup(
    struct darshan_core_runtime* core);
154
static double darshan_core_wtime_absolute(void);
155

156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
/* acquire/release the (recursive) mutex protecting the darshan_core pointer */
#define DARSHAN_CORE_LOCK() pthread_mutex_lock(&darshan_core_mutex)
#define DARSHAN_CORE_UNLOCK() pthread_mutex_unlock(&darshan_core_mutex)

/* print a printf-style warning to stderr, prefixed with
 * "darshan_library_warning: " and terminated with ".\n"
 */
#define DARSHAN_WARN(__err_str, ...) do { \
    darshan_core_fprintf(stderr, "darshan_library_warning: " \
        __err_str ".\n", ## __VA_ARGS__); \
} while(0)

/* DARSHAN_CHECK_ERR(ret, fmt, ...)
 *
 * If __ret is nonzero, warn using the printf-style format and arguments,
 * unlink the log file if it was already created, and jump to the caller's
 * 'exit' label. The expansion relies on the caller having 'log_created',
 * 'logfile_name' and an 'exit' label in scope.
 *
 * NOTE: the variadic arguments must be forwarded to DARSHAN_WARN; otherwise
 * conversion specifiers in the format string are consumed with no matching
 * arguments.
 */
#ifdef HAVE_MPI

/* MPI variant of darshan logging helpers: when MPI is in use, the error flag
 * is first OR-reduced across all ranks so that every rank takes the same
 * branch, and only rank 0 warns and unlinks
 */
#define DARSHAN_CHECK_ERR(__ret, __err_str, ...) do { \
    if(using_mpi) \
        PMPI_Allreduce(MPI_IN_PLACE, &__ret, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD); \
    if(__ret != 0) { \
        if(my_rank == 0) { \
            DARSHAN_WARN(__err_str, ## __VA_ARGS__); \
            if(log_created) \
                unlink(logfile_name); \
        } \
        goto exit; \
    } \
} while(0)

#else

/* Non-MPI variant of darshan logging helpers */
#define DARSHAN_CHECK_ERR(__ret, __err_str, ...) do { \
    if(__ret != 0) { \
        DARSHAN_WARN(__err_str, ## __VA_ARGS__); \
        if(log_created) \
            unlink(logfile_name); \
        goto exit; \
    } \
} while(0)

#endif

/* *********************************** */

void darshan_core_initialize(int argc, char **argv)
197
{
198
    struct darshan_core_runtime *init_core = NULL;
199
    int internal_timing_flag = 0;
200
    double init_start, init_time;
201
    char *envstr;
202 203
    char *jobid_str;
    int jobid;
204
    int ret;
205
    int i;
206 207
    int tmpval;
    double tmpfloat;
208

209 210 211 212 213 214
    /* bail out _before_ attempting to [re]set using_mpi */
    if (darshan_core != NULL)
        return;

#ifdef HAVE_MPI
    PMPI_Initialized(&using_mpi);
215 216 217 218 219
    if(using_mpi)
    {
        PMPI_Comm_size(MPI_COMM_WORLD, &nprocs);
        PMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    }
220
#endif
221 222

    if(getenv("DARSHAN_INTERNAL_TIMING"))
223
    {
224
        internal_timing_flag = 1;
225 226
        init_start = darshan_core_wtime();
    }
227 228

    /* setup darshan runtime if darshan is enabled and hasn't been initialized already */
229
    if(!getenv("DARSHAN_DISABLE") && !darshan_core)
230
    {
231
        #if (__DARSHAN_MEM_ALIGNMENT < 1)
232 233
            #error Darshan must be configured with a positive value for --with-mem-align
        #endif
234
        envstr = getenv(DARSHAN_MEM_ALIGNMENT_OVERRIDE);
235 236 237 238 239 240 241 242 243 244 245
        if(envstr)
        {
            ret = sscanf(envstr, "%d", &tmpval);
            /* silently ignore if the env variable is set poorly */
            if(ret == 1 && tmpval > 0)
            {
                darshan_mem_alignment = tmpval;
            }
        }
        else
        {
246
            darshan_mem_alignment = __DARSHAN_MEM_ALIGNMENT;
247 248 249
        }

        /* avoid floating point errors on faulty input */
250
        if(darshan_mem_alignment < 1)
251 252 253
        {
            darshan_mem_alignment = 1;
        }
254

255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274
        /* Use DARSHAN_JOBID_OVERRIDE for the env var for __DARSHAN_JOBID */
        envstr = getenv(DARSHAN_JOBID_OVERRIDE);
        if(!envstr)
        {
            envstr = __DARSHAN_JOBID;
        }

        /* find a job id */
        jobid_str = getenv(envstr);
        if(jobid_str)
        {
            /* in cobalt we can find it in env var */
            ret = sscanf(jobid_str, "%d", &jobid);
        }
        if(!jobid_str || ret != 1)
        {
            /* use pid as fall back */
            jobid = getpid();
        }

275 276 277 278
        /* set the memory quota for darshan modules' records */
        envstr = getenv(DARSHAN_MOD_MEM_OVERRIDE);
        if(envstr)
        {
279
            ret = sscanf(envstr, "%lf", &tmpfloat);
280
            /* silently ignore if the env variable is set poorly */
281
            if(ret == 1 && tmpfloat > 0)
282
            {
283
                darshan_mod_mem_quota = tmpfloat * 1024 * 1024; /* convert from MiB */
284 285 286
            }
        }

287 288 289
        /* allocate structure to track darshan core runtime information */
        init_core = malloc(sizeof(*init_core));
        if(init_core)
290
        {
291
            memset(init_core, 0, sizeof(*init_core));
292 293 294 295
            /* record absolute start time at startup so that we can later
             * generate relative times with this as a reference point.
             */
            init_core->wtime_offset = darshan_core_wtime_absolute();
296

297 298
        /* TODO: do we alloc new memory as we go or just do everything up front? */

299 300 301 302 303
#ifndef __DARSHAN_ENABLE_MMAP_LOGS
            /* just allocate memory for each log file region */
            init_core->log_hdr_p = malloc(sizeof(struct darshan_header));
            init_core->log_job_p = malloc(sizeof(struct darshan_job));
            init_core->log_exemnt_p = malloc(DARSHAN_EXE_LEN+1);
304
            init_core->log_name_p = malloc(DARSHAN_NAME_RECORD_BUF_SIZE);
305
            init_core->log_mod_p = malloc(darshan_mod_mem_quota);
306 307

            if(!(init_core->log_hdr_p) || !(init_core->log_job_p) ||
308
               !(init_core->log_exemnt_p) || !(init_core->log_name_p) ||
309 310 311 312 313 314 315 316 317
               !(init_core->log_mod_p))
            {
                free(init_core);
                return;
            }
            /* if allocation succeeds, zero fill memory regions */
            memset(init_core->log_hdr_p, 0, sizeof(struct darshan_header));
            memset(init_core->log_job_p, 0, sizeof(struct darshan_job));
            memset(init_core->log_exemnt_p, 0, DARSHAN_EXE_LEN+1);
318
            memset(init_core->log_name_p, 0, DARSHAN_NAME_RECORD_BUF_SIZE);
319
            memset(init_core->log_mod_p, 0, darshan_mod_mem_quota);
320 321 322 323
#else
            /* if mmap logs are enabled, we need to initialize the mmap region
             * before setting the corresponding log file region pointers
             */
324 325
            void *mmap_p = darshan_init_mmap_log(init_core, jobid);
            if(!mmap_p)
326
            {
327 328
                free(init_core);
                return;
329 330
            }

331
            /* set the memory pointers for each log file region */
332
            init_core->log_hdr_p = (struct darshan_header *)mmap_p;
333
            init_core->log_job_p = (struct darshan_job *)
334
                ((char *)init_core->log_hdr_p + sizeof(struct darshan_header));
335
            init_core->log_exemnt_p = (char *)
336
                ((char *)init_core->log_job_p + sizeof(struct darshan_job));
337
            init_core->log_name_p = (void *)
338 339
                ((char *)init_core->log_exemnt_p + DARSHAN_EXE_LEN + 1);
            init_core->log_mod_p = (void *)
340
                ((char *)init_core->log_name_p + DARSHAN_NAME_RECORD_BUF_SIZE);
341

342
            /* set header fields needed for the mmap log mechanism */
343
            init_core->log_hdr_p->comp_type = DARSHAN_NO_COMP;
344
            init_core->log_hdr_p->name_map.off =
345
                ((char *)init_core->log_name_p - (char *)init_core->log_hdr_p);
346 347 348 349 350
#endif

            /* set known header fields for the log file */
            strcpy(init_core->log_hdr_p->version_string, DARSHAN_LOG_VERSION);
            init_core->log_hdr_p->magic_nr = DARSHAN_MAGIC_NR;
351

352 353 354 355 356
            /* set known job-level metadata fields for the log file */
            init_core->log_job_p->uid = getuid();
            init_core->log_job_p->start_time = time(NULL);
            init_core->log_job_p->nprocs = nprocs;
            init_core->log_job_p->jobid = (int64_t)jobid;
357 358 359 360 361 362

            /* if we are using any hints to write the log file, then record those
             * hints with the darshan job information
             */
            darshan_log_record_hints_and_ver(init_core);

363
            /* collect information about command line and mounted file systems */
364
            darshan_get_exe_and_mounts(init_core, argc, argv);
365

366 367 368 369
            /* if darshan was successfully initialized, set the global pointer
             * and bootstrap any modules with static initialization routines
             */
            DARSHAN_CORE_LOCK();
370
            darshan_core = init_core;
371 372 373 374 375 376 377 378
            DARSHAN_CORE_UNLOCK();

            i = 0;
            while(mod_static_init_fns[i])
            {
                (*mod_static_init_fns[i])();
                i++;
            }
379
        }
380 381
    }

382 383
    if(internal_timing_flag)
    {
384 385
        init_time = darshan_core_wtime() - init_start;
#ifdef HAVE_MPI
386 387 388 389 390
        if(using_mpi)
        {
            PMPI_Reduce(MPI_IN_PLACE, &init_time, 1,
                MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        }
391 392 393 394 395
        if(my_rank > 0) return;
#endif

        darshan_core_fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
        darshan_core_fprintf(stderr, "darshan:init\t%d\t%f\n", nprocs, init_time);
396 397 398 399 400
    }

    return;
}

Shane Snyder's avatar
Shane Snyder committed
401
void darshan_core_shutdown()
402
{
403
    struct darshan_core_runtime *final_core;
404
    double start_log_time;
405
    int internal_timing_flag = 0;
406 407 408
    double open1 = 0, open2 = 0;
    double job1 = 0, job2 = 0;
    double rec1 = 0, rec2 = 0;
409 410
    double mod1[DARSHAN_MAX_MODS] = {0};
    double mod2[DARSHAN_MAX_MODS] = {0};
411
    double header1 = 0, header2 = 0;
412 413
    double tm_end;
    int active_mods[DARSHAN_MAX_MODS] = {0};
414
    uint64_t gz_fp = 0;
415 416 417 418 419
    char *logfile_name = NULL;
    darshan_core_log_fh log_fh;
    int log_created = 0;
    int i;
    int ret;
420

Shane Snyder's avatar
Shane Snyder committed
421
    /* disable darhan-core while we shutdown */
422
    DARSHAN_CORE_LOCK();
423
    if(!darshan_core)
424
    {
425
        DARSHAN_CORE_UNLOCK();
426 427
        return;
    }
428 429
    final_core = darshan_core;
    darshan_core = NULL;
430 431
    DARSHAN_CORE_UNLOCK();

432
    /* grab some initial timing information */
433
    /* XXX move to MPI_Finalize wrapper */
434 435 436 437 438 439 440 441 442 443 444
#ifdef HAVE_MPI
    /* if using mpi, sync across procs first */
    if(using_mpi)
        PMPI_Barrier(MPI_COMM_WORLD);
#endif
    start_log_time = darshan_core_wtime();
    final_core->log_job_p->end_time = time(NULL);

    if(getenv("DARSHAN_INTERNAL_TIMING"))
        internal_timing_flag = 1;

445
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
446 447 448 449 450
    /* remove the temporary mmap log files */
    /* NOTE: this unlink is not immediate as it must wait for the mapping
     * to no longer be referenced, which in our case happens when the
     * executable exits. If the application terminates mid-shutdown, then
     * there will be no mmap files and no final log file.
451
     */
452
    unlink(final_core->mmap_log_name);
453
#endif
Shane Snyder's avatar
Shane Snyder committed
454

455 456 457 458
    final_core->comp_buf = malloc(darshan_mod_mem_quota);
    logfile_name = malloc(PATH_MAX);
    if(!final_core->comp_buf || !logfile_name)
        goto exit;
459

460 461
    /* set which modules were used locally */
    for(i = 0; i < DARSHAN_MAX_MODS; i++)
Shane Snyder's avatar
Shane Snyder committed
462
    {
463 464
        if(final_core->mod_array[i])
            active_mods[i] = 1;
Shane Snyder's avatar
Shane Snyder committed
465
    }
466

467 468 469 470
#ifdef HAVE_MPI
    darshan_record_id *shared_recs = NULL;
    darshan_record_id *mod_shared_recs = NULL;
    int shared_rec_cnt = 0;
471

472
    if(using_mpi)
473
    {
474 475 476
        /* allreduce locally active mods to determine globally active mods */
        PMPI_Allreduce(MPI_IN_PLACE, active_mods, DARSHAN_MAX_MODS, MPI_INT,
            MPI_SUM, MPI_COMM_WORLD);
477

478 479 480 481 482
        /* reduce to report first start and last end time across all ranks at rank 0 */
        PMPI_Reduce(MPI_IN_PLACE, &final_core->log_job_p->start_time,
            1, MPI_INT64_T, MPI_MIN, 0, MPI_COMM_WORLD);
        PMPI_Reduce(MPI_IN_PLACE, &final_core->log_job_p->end_time,
            1, MPI_INT64_T, MPI_MAX, 0, MPI_COMM_WORLD);
483

484 485
        /* get a list of records which are shared across all processes */
        darshan_get_shared_records(final_core, &shared_recs, &shared_rec_cnt);
486

487 488 489 490
        mod_shared_recs = malloc(shared_rec_cnt * sizeof(darshan_record_id));
        assert(mod_shared_recs);
    }
#endif
491

492 493 494
    /* get the log file name */
    darshan_get_logfile_name(logfile_name, final_core->log_job_p->jobid,
        final_core->log_job_p->start_time);
495 496 497
    if(strlen(logfile_name) == 0)
    {
        /* failed to generate log file name */
498
        goto exit;
499 500 501
    }

    if(internal_timing_flag)
502 503 504
        open1 = darshan_core_wtime();
    /* open the darshan log file */
    ret = darshan_log_open(logfile_name, &log_fh);
505
    if(internal_timing_flag)
506
        open2 = darshan_core_wtime();
507
    /* error out if unable to open log file */
508 509
    DARSHAN_CHECK_ERR(ret, "unable to create log file %s", logfile_name);
    log_created = 1;
510 511

    if(internal_timing_flag)
512 513 514
        job1 = darshan_core_wtime();
    /* write the the compressed darshan job information */
    ret = darshan_log_write_job_record(log_fh, final_core, &gz_fp);
515
    if(internal_timing_flag)
516 517 518
        job2 = darshan_core_wtime();
    /* error out if unable to write job information */
    DARSHAN_CHECK_ERR(ret, "unable to write job record to file %s", logfile_name);
519 520

    if(internal_timing_flag)
521
        rec1 = darshan_core_wtime();
522
    /* write the record name->id hash to the log file */
523
    final_core->log_hdr_p->name_map.off = gz_fp;
524
    ret = darshan_log_write_name_record_hash(log_fh, final_core, &gz_fp);
525
    if(internal_timing_flag)
526 527 528 529
        rec2 = darshan_core_wtime();
    final_core->log_hdr_p->name_map.len = gz_fp - final_core->log_hdr_p->name_map.off;
    /* error out if unable to write name records */
    DARSHAN_CHECK_ERR(ret, "unable to write name records to log file %s", logfile_name);
530 531 532 533 534

    /* loop over globally used darshan modules and:
     *      - get final output buffer
     *      - compress (zlib) provided output buffer
     *      - append compressed buffer to log file
535
     *      - add module map info (file offset/length) to log header
536 537 538 539 540 541 542 543
     *      - shutdown the module
     */
    for(i = 0; i < DARSHAN_MAX_MODS; i++)
    {
        struct darshan_core_module* this_mod = final_core->mod_array[i];
        void* mod_buf = NULL;
        int mod_buf_sz = 0;

544
        if(!active_mods[i])
545
        {
546 547
            final_core->log_hdr_p->mod_map[i].off = 0;
            final_core->log_hdr_p->mod_map[i].len = 0;
548 549 550 551
            continue;
        }

        if(internal_timing_flag)
552 553 554 555 556 557
            mod1[i] = darshan_core_wtime();

#ifdef HAVE_MPI
        struct darshan_core_name_record_ref *ref = NULL;
        int mod_shared_rec_cnt = 0;
        int j;
558

559
        if(using_mpi)
560
        {
561 562
            /* set the shared record list for this module */
            for(j = 0; j < shared_rec_cnt; j++)
563
            {
564 565 566 567 568 569 570
                HASH_FIND(hlink, final_core->name_hash, &shared_recs[j],
                    sizeof(darshan_record_id), ref);
                assert(ref);
                if(DARSHAN_MOD_FLAG_ISSET(ref->global_mod_flags, i))
                {
                    mod_shared_recs[mod_shared_rec_cnt++] = shared_recs[j];
                }
571
            }
572 573

            /* allow the module an opportunity to reduce shared files */
574 575 576
            if(this_mod->mod_funcs.mod_redux_func && (mod_shared_recs > 0) &&
               (!getenv("DARSHAN_DISABLE_SHARED_REDUCTION")))
                this_mod->mod_funcs.mod_redux_func(mod_buf, MPI_COMM_WORLD, mod_shared_recs,
Philip Carns's avatar
Philip Carns committed
577
                    mod_shared_rec_cnt);
578
        }
579
#endif
580 581

        /* if module is registered locally, get the corresponding output buffer
Philip Carns's avatar
Philip Carns committed
582
         *
583
         * NOTE: this function can be used to run collective operations across
584
         * modules, if there are records shared globally.
585 586 587
         */
        if(this_mod)
        {
588 589
            mod_buf = final_core->mod_array[i]->rec_buf_start;
            mod_buf_sz = final_core->mod_array[i]->rec_buf_p - mod_buf;
590
            this_mod->mod_funcs.mod_shutdown_func(&mod_buf, &mod_buf_sz);
591 592 593
        }

        /* append this module's data to the darshan log */
594
        final_core->log_hdr_p->mod_map[i].off = gz_fp;
595
        ret = darshan_log_append(log_fh, final_core, mod_buf, mod_buf_sz, &gz_fp);
596 597
        final_core->log_hdr_p->mod_map[i].len =
            gz_fp - final_core->log_hdr_p->mod_map[i].off;
598

599 600 601 602 603 604
        /* XXX: DXT manages its own module memory buffers, so we need to
         * explicitly free them
         */
        if(i == DXT_POSIX_MOD || i == DXT_MPIIO_MOD)
            free(mod_buf);

605
        if(internal_timing_flag)
606
            mod2[i] = darshan_core_wtime();
607

608 609 610
        /* error out if unable to write module data */
        DARSHAN_CHECK_ERR(ret, "unable to write %s module data to log file %s",
            darshan_module_names[i], logfile_name);
611
    }
612 613

    if(internal_timing_flag)
614 615 616 617 618
        header1 = darshan_core_wtime();
    ret = darshan_log_write_header(log_fh, final_core);
    if(internal_timing_flag)
        header2 = darshan_core_wtime();
    DARSHAN_CHECK_ERR(ret, "unable to write header to file %s", logfile_name);
619

620 621
    /* done writing data, close the log file */
    darshan_log_close(log_fh);
622

623 624
    /* finalize log file name and permissions */
    darshan_log_finalize(logfile_name, start_log_time);
625

626
    if(internal_timing_flag)
627
    {
628 629 630 631 632 633
        double open_tm;
        double header_tm;
        double job_tm;
        double rec_tm;
        double mod_tm[DARSHAN_MAX_MODS];
        double all_tm;
634

635
        tm_end = darshan_core_wtime();
636

637 638 639 640
        open_tm = open2 - open1;
        header_tm = header2 - header1;
        job_tm = job2 - job1;
        rec_tm = rec2 - rec1;
641
        all_tm = tm_end - start_log_time;
642
        for(i = 0; i < DARSHAN_MAX_MODS; i++)
643 644 645
        {
            mod_tm[i] = mod2[i] - mod1[i];
        }
646

647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667
#ifdef HAVE_MPI
        if(using_mpi)
        {
            PMPI_Reduce(MPI_IN_PLACE, &open_tm, 1,
                MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
            PMPI_Reduce(MPI_IN_PLACE, &header_tm, 1,
                MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
            PMPI_Reduce(MPI_IN_PLACE, &job_tm, 1,
                MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
            PMPI_Reduce(MPI_IN_PLACE, &rec_tm, 1,
                MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
            PMPI_Reduce(MPI_IN_PLACE, &all_tm, 1,
                MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
            PMPI_Reduce(MPI_IN_PLACE, mod_tm, DARSHAN_MAX_MODS,
                MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

            /* let rank 0 report the timing info */
            if(my_rank > 0)
                goto exit;
        }
#endif
668

669 670 671 672 673 674
        darshan_core_fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
        darshan_core_fprintf(stderr, "darshan:log_open\t%d\t%f\n", nprocs, open_tm);
        darshan_core_fprintf(stderr, "darshan:job_write\t%d\t%f\n", nprocs, job_tm);
        darshan_core_fprintf(stderr, "darshan:hash_write\t%d\t%f\n", nprocs, rec_tm);
        darshan_core_fprintf(stderr, "darshan:header_write\t%d\t%f\n", nprocs, header_tm);
        for(i = 0; i < DARSHAN_MAX_MODS; i++)
675
        {
676 677 678
            if(active_mods[i])
                darshan_core_fprintf(stderr, "darshan:%s_shutdown\t%d\t%f\n",
                    darshan_module_names[i], nprocs, mod_tm[i]);
679
        }
680
        darshan_core_fprintf(stderr, "darshan:core_shutdown\t%d\t%f\n", nprocs, all_tm);
681
    }
682

683 684
exit:
#ifdef HAVE_MPI
685 686 687 688 689
    if(using_mpi)
    {
        free(shared_recs);
        free(mod_shared_recs);
    }
690 691 692 693
#endif
    free(logfile_name);
    darshan_core_cleanup(final_core);

694 695
    return;
}
696

Shane Snyder's avatar
Shane Snyder committed
697
/* *********************************** */
698

699
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
/* darshan_init_mmap_log()
 *
 * Creates a temporary, per-process log file, sizes it to hold the header,
 * job record, name record buffer and module memory quota (rounded up to a
 * whole number of pages), and maps it shared so that darshan's data is
 * persisted to the file as the application runs. The file name is stored
 * in core->mmap_log_name.
 *
 * Returns the start of the mapping, or NULL (after printing a warning and
 * removing any partially created file) on failure.
 */
static void *darshan_init_mmap_log(struct darshan_core_runtime* core, int jobid)
{
    int ret;
    int mmap_fd;
    size_t mmap_size;
    long sys_page_size;
    char cuser[L_cuserid] = {0};
    uint64_t hlevel;
    char hname[HOST_NAME_MAX + 1] = {0};
    uint64_t logmod = 0;
    char *envstr;
    char *mmap_log_path;
    void *mmap_p;

    sys_page_size = sysconf(_SC_PAGESIZE);
    assert(sys_page_size > 0);

    /* size_t arithmetic: the module quota is a long and can push the total
     * past what an int can hold
     */
    mmap_size = sizeof(struct darshan_header) + DARSHAN_JOB_RECORD_SIZE +
        DARSHAN_NAME_RECORD_BUF_SIZE + (size_t)darshan_mod_mem_quota;
    if(mmap_size % (size_t)sys_page_size)
        mmap_size = ((mmap_size / (size_t)sys_page_size) + 1) * (size_t)sys_page_size;

    envstr = getenv(DARSHAN_MMAP_LOG_PATH_OVERRIDE);
    if(envstr)
        mmap_log_path = envstr;
    else
        mmap_log_path = DARSHAN_DEF_MMAP_LOG_PATH;

    darshan_get_user_name(cuser);

    /* generate a random number to help differentiate the temporary log */
    /* NOTE: job id is not sufficient for constructing a unique log file name,
     * since a job could be composed of multiple application runs, so we also
     * add a random number component to the log name
     */
    if(my_rank == 0)
    {
        hlevel = darshan_core_wtime_absolute() * 1000000;
        /* hname starts zero-filled and the last byte is never handed to
         * gethostname(), so it is a valid string even if the call fails or
         * truncates the name
         */
        (void)gethostname(hname, sizeof(hname) - 1);
        logmod = darshan_hash((void*)hname,strlen(hname),hlevel);
    }
#ifdef HAVE_MPI
    /* share rank 0's value so all ranks use the same log name component;
     * only valid when MPI is actually in use
     */
    if(using_mpi)
        PMPI_Bcast(&logmod, 1, MPI_UINT64_T, 0, MPI_COMM_WORLD);
#endif

    /* construct a unique temporary log file name for this process
     * to write mmap log data to
     */
    snprintf(core->mmap_log_name, PATH_MAX,
        "/%s/%s_%s_id%d_mmap-log-%" PRIu64 "-%d.darshan",
        mmap_log_path, cuser, __progname, jobid, logmod, my_rank);

    /* create the temporary mmapped darshan log */
    mmap_fd = open(core->mmap_log_name, O_CREAT|O_RDWR|O_EXCL , 0644);
    if(mmap_fd < 0)
    {
        darshan_core_fprintf(stderr, "darshan library warning: "
            "unable to create darshan log file %s\n", core->mmap_log_name);
        return(NULL);
    }

    /* TODO: ftruncate or just zero fill? */
    /* allocate the necessary space in the log file */
    ret = ftruncate(mmap_fd, mmap_size);
    if(ret < 0)
    {
        darshan_core_fprintf(stderr, "darshan library warning: "
            "unable to allocate darshan log file %s\n", core->mmap_log_name);
        close(mmap_fd);
        unlink(core->mmap_log_name);
        return(NULL);
    }

    /* create the memory map for darshan's data structures so they are
     * persisted to file as the application executes
     */
    mmap_p = mmap(NULL, mmap_size, PROT_WRITE, MAP_SHARED, mmap_fd, 0);
    if(mmap_p == MAP_FAILED)
    {
        darshan_core_fprintf(stderr, "darshan library warning: "
            "unable to mmap darshan log file %s\n", core->mmap_log_name);
        close(mmap_fd);
        unlink(core->mmap_log_name);
        return(NULL);
    }

    /* close darshan log file (this does *not* unmap the log file) */
    close(mmap_fd);

    return(mmap_p);
}
#endif

/* record any hints used to write the darshan log in the job data */
792
static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* core)
793 794
{
    char* hints;
795
    char* job_hints;
796 797 798 799 800 801
    int meta_remain = 0;
    char* m;

    /* check environment variable to see if the default MPI file hints have
     * been overridden
     */
802
    hints = getenv(DARSHAN_LOG_HINTS_OVERRIDE);
803 804
    if(!hints)
    {
805
        hints = __DARSHAN_LOG_HINTS;
806 807 808 809 810
    }

    if(!hints || strlen(hints) < 1)
        return;

811 812
    job_hints = strdup(hints);
    if(!job_hints)
813 814 815
        return;

    meta_remain = DARSHAN_JOB_METADATA_LEN -
816
        strlen(core->log_job_p->metadata) - 1;
817 818
    if(meta_remain >= (strlen(PACKAGE_VERSION) + 9))
    {
819
        sprintf(core->log_job_p->metadata, "lib_ver=%s\n", PACKAGE_VERSION);
820 821
        meta_remain -= (strlen(PACKAGE_VERSION) + 9);
    }
822
    if(meta_remain >= (3 + strlen(job_hints)))
823
    {
824
        m = core->log_job_p->metadata + strlen(core->log_job_p->metadata);
825
        /* We have room to store the hints in the metadata portion of
826
         * the job structure.  We just prepend an h= to the hints list.  The
827 828 829
         * metadata parser will ignore = characters that appear in the value
         * portion of the metadata key/value pair.
         */
830
        sprintf(m, "h=%s\n", job_hints);
831
    }
832
    free(job_hints);
833 834 835 836

    return;
}

837 838
/* comparison callback (qsort-style signature) for darshan_core_mnt_data
 * entries: orders entries by descending mount path length, so an entry with
 * a longer path sorts before one with a shorter path; equal lengths compare
 * as equal
 */
static int mnt_data_cmp(const void* a, const void* b)
{
    const struct darshan_core_mnt_data *d_a = (const struct darshan_core_mnt_data*)a;
    const struct darshan_core_mnt_data *d_b = (const struct darshan_core_mnt_data*)b;
    size_t len_a = strlen(d_a->path);
    size_t len_b = strlen(d_b->path);

    if(len_a > len_b)
        return(-1);
    if(len_a < len_b)
        return(1);
    return(0);
}

/* adds an entry to table of mounted file systems */
851
static void add_entry(char* buf, int* space_left, struct mntent* entry)
852
{
853
    int i;
854 855 856 857
    int ret;
    char tmp_mnt[256];
    struct statfs statfsbuf;

858 859 860 861 862 863 864 865 866 867
    /* avoid adding the same mount points multiple times -- to limit
     * storage space and potential statfs, ioctl, etc calls
     */
    for(i = 0; i < mnt_data_count; i++)
    {
        if((strncmp(mnt_data_array[i].path, entry->mnt_dir, DARSHAN_MAX_MNT_PATH) == 0) &&
           (strncmp(mnt_data_array[i].type, entry->mnt_type, DARSHAN_MAX_MNT_PATH) == 0))
            return;
    }

868 869 870 871
    strncpy(mnt_data_array[mnt_data_count].path, entry->mnt_dir,
        DARSHAN_MAX_MNT_PATH-1);
    strncpy(mnt_data_array[mnt_data_count].type, entry->mnt_type,
        DARSHAN_MAX_MNT_TYPE-1);
Philip Carns's avatar
Philip Carns committed
872 873 874
    /* NOTE: we now try to detect the preferred block size for each file
     * system using fstatfs().  On Lustre we assume a size of 1 MiB
     * because fstatfs() reports 4 KiB.
875 876 877 878 879
     */
#ifndef LL_SUPER_MAGIC
#define LL_SUPER_MAGIC 0x0BD00BD0
#endif
    ret = statfs(entry->mnt_dir, &statfsbuf);
880
    mnt_data_array[mnt_data_count].fs_info.fs_type = statfsbuf.f_type;
881
    if(ret == 0 && statfsbuf.f_type != LL_SUPER_MAGIC)
882
        mnt_data_array[mnt_data_count].fs_info.block_size = statfsbuf.f_bsize;
883
    else if(ret == 0 && statfsbuf.f_type == LL_SUPER_MAGIC)
884
        mnt_data_array[mnt_data_count].fs_info.block_size = 1024*1024;
885
    else
886
        mnt_data_array[mnt_data_count].fs_info.block_size = 4096;
887

888
#ifdef DARSHAN_LUSTRE
889 890 891 892 893 894 895 896 897 898
    /* attempt to retrieve OST and MDS counts from Lustre */
    mnt_data_array[mnt_data_count].fs_info.ost_count = -1;
    mnt_data_array[mnt_data_count].fs_info.mdt_count = -1;
    if ( statfsbuf.f_type == LL_SUPER_MAGIC )
    {
        int n_ost, n_mdt;
        int ret_ost, ret_mdt;
        DIR *mount_dir;

        mount_dir = opendir( entry->mnt_dir );
Philip Carns's avatar
Philip Carns committed
899
        if ( mount_dir  )
900 901 902 903 904 905 906 907
        {
            /* n_ost and n_mdt are used for both input and output to ioctl */
            n_ost = 0;
            n_mdt = 1;

            ret_ost = ioctl( dirfd(mount_dir), LL_IOC_GETOBDCOUNT, &n_ost );
            ret_mdt = ioctl( dirfd(mount_dir), LL_IOC_GETOBDCOUNT, &n_mdt );

908
            if ( !(ret_ost < 0 || ret_mdt < 0) )
909 910 911 912 913 914 915
            {
                mnt_data_array[mnt_data_count].fs_info.ost_count = n_ost;
                mnt_data_array[mnt_data_count].fs_info.mdt_count = n_mdt;
            }
            closedir( mount_dir );
        }
    }
916
#endif
917

918
    /* store mount information with the job-level metadata in darshan log */
919
    ret = snprintf(tmp_mnt, 256, "\n%s\t%s",
920 921 922
        entry->mnt_type, entry->mnt_dir);
    if(ret < 256 && strlen(tmp_mnt) <= (*space_left))
    {
923
        strcat(buf, tmp_mnt);
924 925 926 927 928 929 930
        (*space_left) -= strlen(tmp_mnt);
    }

    mnt_data_count++;
    return;
}

931
/* darshan_get_exe_and_mounts()
932 933
 *
 * collects command line and list of mounted file systems into a string that
934
 * will be stored with the job-level metadata