/*
 * Copyright (C) 2015 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
 */

#define _XOPEN_SOURCE 500
#define _GNU_SOURCE

#include "darshan-runtime-config.h"

#include <stdio.h>
#ifdef HAVE_MNTENT_H
#include <mntent.h>
#endif
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include <pthread.h>
#include <fcntl.h>
#include <dirent.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/vfs.h>
#include <zlib.h>
#include <mpi.h>
#include <assert.h>

#include "uthash.h"
#include "darshan.h"
#include "darshan-core.h"
#include "darshan-dynamic.h"

#ifdef DARSHAN_LUSTRE
#include <lustre/lustre_user.h>
#endif

extern char* __progname;
42
extern char* __progname_full;
43

44
/* internal variable delcarations */
45
static struct darshan_core_runtime *darshan_core = NULL;
46
static pthread_mutex_t darshan_core_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
47
static int my_rank = -1;
48
static int nprocs = -1;
49
static int darshan_mem_alignment = 1;
50
static long darshan_mod_mem_quota = DARSHAN_MOD_MEM_MAX;
51

52
/* paths prefixed with the following directories are not tracked by darshan;
 * the list is terminated by a NULL entry
 */
char* darshan_path_exclusions[] = {
    "/etc/",
    "/dev/",
    "/usr/",
    "/bin/",
    "/boot/",
    "/lib/",
    "/opt/",
    "/sbin/",
    "/sys/",
    "/proc/",
    "/var/",
    NULL
};
/* paths prefixed with the following directories are tracked by darshan even if
 * they share a root with a path listed in darshan_path_exclusions; the list is
 * terminated by a NULL entry
 */
char* darshan_path_inclusions[] = {
    "/var/opt/cray/dws/mounts/",
    NULL
};

/* allow users to override the path exclusions */
char** user_darshan_path_exclusions = NULL;

#ifdef DARSHAN_BGQ
extern void bgq_runtime_initialize();
#endif

/* array of init functions for modules which need to be statically
 * initialized by darshan at startup time; the array is NULL-terminated and
 * is walked at the end of darshan_core_initialize()
 */
void (*mod_static_init_fns[])(void) =
{
#ifdef DARSHAN_BGQ
    &bgq_runtime_initialize,
#endif
    NULL
};

#ifdef DARSHAN_LUSTRE
/* XXX need to use extern to get Lustre module's instrumentation function
 * since modules have no way of providing this to darshan-core
 */
extern void darshan_instrument_lustre_file(const char *filepath, int fd);
#endif

#define DARSHAN_CORE_LOCK() pthread_mutex_lock(&darshan_core_mutex)
#define DARSHAN_CORE_UNLOCK() pthread_mutex_unlock(&darshan_core_mutex)

103 104 105 106 107 108 109 110
/* FS mount information */
#define DARSHAN_MAX_MNTS 64
#define DARSHAN_MAX_MNT_PATH 256
#define DARSHAN_MAX_MNT_TYPE 32
struct mnt_data
{
    char path[DARSHAN_MAX_MNT_PATH];
    char type[DARSHAN_MAX_MNT_TYPE];
111
    struct darshan_fs_info fs_info;
112 113 114 115
};
static struct mnt_data mnt_data_array[DARSHAN_MAX_MNTS];
static int mnt_data_count = 0;

116
/* prototypes for internal helper functions */
117
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
118 119
static void *darshan_init_mmap_log(
    struct darshan_core_runtime* core, int jobid);
120
#endif
121
static void darshan_log_record_hints_and_ver(
122
    struct darshan_core_runtime* core);
123 124
static void darshan_get_exe_and_mounts(
    struct darshan_core_runtime *core, int argc, char **argv);
125 126
static void darshan_fs_info_from_path(
    const char *path, struct darshan_fs_info *fs_info);
127
static int darshan_add_name_record_ref(
128
    struct darshan_core_runtime *core, darshan_record_id rec_id,
129
    const char *name, darshan_module_id mod_id);
130 131
static void darshan_get_user_name(
    char *user);
132 133
static void darshan_get_logfile_name(
    char* logfile_name, int jobid, struct tm* start_tm);
134
static void darshan_get_shared_records(
135 136
    struct darshan_core_runtime *core, darshan_record_id **shared_recs,
    int *shared_rec_cnt);
137
static int darshan_log_open_all(
138
    char *logfile_name, MPI_File *log_fh);
139
static int darshan_deflate_buffer(
Shane Snyder's avatar
Shane Snyder committed
140 141
    void **pointers, int *lengths, int count, char *comp_buf,
    int *comp_buf_length);
142
static int darshan_log_write_name_record_hash(
143
    MPI_File log_fh, struct darshan_core_runtime *core,
144 145 146
    uint64_t *inout_off);
static int darshan_log_append_all(
    MPI_File log_fh, struct darshan_core_runtime *core, void *buf,
Shane Snyder's avatar
Shane Snyder committed
147
    int count, uint64_t *inout_off);
Shane Snyder's avatar
Shane Snyder committed
148 149
static void darshan_core_cleanup(
    struct darshan_core_runtime* core);
150

151 152
/* *********************************** */

Shane Snyder's avatar
Shane Snyder committed
153
void darshan_core_initialize(int argc, char **argv)
154
{
155
    struct darshan_core_runtime *init_core = NULL;
156 157
    int internal_timing_flag = 0;
    double init_start, init_time, init_max;
158
    char *envstr;
159 160
    char *jobid_str;
    int jobid;
161
    int ret;
162
    int i;
163 164
    int tmpval;
    double tmpfloat;
165

166 167
    PMPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    PMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
168 169 170 171

    if(getenv("DARSHAN_INTERNAL_TIMING"))
        internal_timing_flag = 1;

172
    if(internal_timing_flag)
173
        init_start = PMPI_Wtime();
174 175

    /* setup darshan runtime if darshan is enabled and hasn't been initialized already */
176
    if(!getenv("DARSHAN_DISABLE") && !darshan_core)
177
    {
178
        #if (__DARSHAN_MEM_ALIGNMENT < 1)
179 180
            #error Darshan must be configured with a positive value for --with-mem-align
        #endif
181
        envstr = getenv(DARSHAN_MEM_ALIGNMENT_OVERRIDE);
182 183 184 185 186 187 188 189 190 191 192
        if(envstr)
        {
            ret = sscanf(envstr, "%d", &tmpval);
            /* silently ignore if the env variable is set poorly */
            if(ret == 1 && tmpval > 0)
            {
                darshan_mem_alignment = tmpval;
            }
        }
        else
        {
193
            darshan_mem_alignment = __DARSHAN_MEM_ALIGNMENT;
194 195 196
        }

        /* avoid floating point errors on faulty input */
197
        if(darshan_mem_alignment < 1)
198 199 200
        {
            darshan_mem_alignment = 1;
        }
201

202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221
        /* Use DARSHAN_JOBID_OVERRIDE for the env var for __DARSHAN_JOBID */
        envstr = getenv(DARSHAN_JOBID_OVERRIDE);
        if(!envstr)
        {
            envstr = __DARSHAN_JOBID;
        }

        /* find a job id */
        jobid_str = getenv(envstr);
        if(jobid_str)
        {
            /* in cobalt we can find it in env var */
            ret = sscanf(jobid_str, "%d", &jobid);
        }
        if(!jobid_str || ret != 1)
        {
            /* use pid as fall back */
            jobid = getpid();
        }

222 223 224 225
        /* set the memory quota for darshan modules' records */
        envstr = getenv(DARSHAN_MOD_MEM_OVERRIDE);
        if(envstr)
        {
226
            ret = sscanf(envstr, "%lf", &tmpfloat);
227
            /* silently ignore if the env variable is set poorly */
228
            if(ret == 1 && tmpfloat > 0)
229
            {
230
                darshan_mod_mem_quota = tmpfloat * 1024 * 1024; /* convert from MiB */
231 232 233
            }
        }

234 235 236
        /* allocate structure to track darshan core runtime information */
        init_core = malloc(sizeof(*init_core));
        if(init_core)
237
        {
238
            memset(init_core, 0, sizeof(*init_core));
239
            init_core->wtime_offset = PMPI_Wtime();
240

241 242
        /* TODO: do we alloc new memory as we go or just do everything up front? */

243 244 245 246 247
#ifndef __DARSHAN_ENABLE_MMAP_LOGS
            /* just allocate memory for each log file region */
            init_core->log_hdr_p = malloc(sizeof(struct darshan_header));
            init_core->log_job_p = malloc(sizeof(struct darshan_job));
            init_core->log_exemnt_p = malloc(DARSHAN_EXE_LEN+1);
248
            init_core->log_name_p = malloc(DARSHAN_NAME_RECORD_BUF_SIZE);
249
            init_core->log_mod_p = malloc(darshan_mod_mem_quota);
250 251

            if(!(init_core->log_hdr_p) || !(init_core->log_job_p) ||
252
               !(init_core->log_exemnt_p) || !(init_core->log_name_p) ||
253 254 255 256 257 258 259 260 261
               !(init_core->log_mod_p))
            {
                free(init_core);
                return;
            }
            /* if allocation succeeds, zero fill memory regions */
            memset(init_core->log_hdr_p, 0, sizeof(struct darshan_header));
            memset(init_core->log_job_p, 0, sizeof(struct darshan_job));
            memset(init_core->log_exemnt_p, 0, DARSHAN_EXE_LEN+1);
262
            memset(init_core->log_name_p, 0, DARSHAN_NAME_RECORD_BUF_SIZE);
263
            memset(init_core->log_mod_p, 0, darshan_mod_mem_quota);
264 265 266 267
#else
            /* if mmap logs are enabled, we need to initialize the mmap region
             * before setting the corresponding log file region pointers
             */
268 269
            void *mmap_p = darshan_init_mmap_log(init_core, jobid);
            if(!mmap_p)
270
            {
271 272
                free(init_core);
                return;
273 274
            }

275
            /* set the memory pointers for each log file region */
276
            init_core->log_hdr_p = (struct darshan_header *)mmap_p;
277
            init_core->log_job_p = (struct darshan_job *)
278
                ((char *)init_core->log_hdr_p + sizeof(struct darshan_header));
279
            init_core->log_exemnt_p = (char *)
280
                ((char *)init_core->log_job_p + sizeof(struct darshan_job));
281
            init_core->log_name_p = (void *)
282 283
                ((char *)init_core->log_exemnt_p + DARSHAN_EXE_LEN + 1);
            init_core->log_mod_p = (void *)
284
                ((char *)init_core->log_name_p + DARSHAN_NAME_RECORD_BUF_SIZE);
285

286
            /* set header fields needed for the mmap log mechanism */
287
            init_core->log_hdr_p->comp_type = DARSHAN_NO_COMP;
288
            init_core->log_hdr_p->name_map.off =
289
                ((char *)init_core->log_name_p - (char *)init_core->log_hdr_p);
290 291 292 293 294
#endif

            /* set known header fields for the log file */
            strcpy(init_core->log_hdr_p->version_string, DARSHAN_LOG_VERSION);
            init_core->log_hdr_p->magic_nr = DARSHAN_MAGIC_NR;
295

296 297 298 299 300
            /* set known job-level metadata fields for the log file */
            init_core->log_job_p->uid = getuid();
            init_core->log_job_p->start_time = time(NULL);
            init_core->log_job_p->nprocs = nprocs;
            init_core->log_job_p->jobid = (int64_t)jobid;
301 302 303 304 305 306

            /* if we are using any hints to write the log file, then record those
             * hints with the darshan job information
             */
            darshan_log_record_hints_and_ver(init_core);

307
            /* collect information about command line and mounted file systems */
308
            darshan_get_exe_and_mounts(init_core, argc, argv);
309

310 311 312 313
            /* if darshan was successfully initialized, set the global pointer
             * and bootstrap any modules with static initialization routines
             */
            DARSHAN_CORE_LOCK();
314
            darshan_core = init_core;
315 316 317 318 319 320 321 322
            DARSHAN_CORE_UNLOCK();

            i = 0;
            while(mod_static_init_fns[i])
            {
                (*mod_static_init_fns[i])();
                i++;
            }
323
        }
324 325
    }

326 327
    if(internal_timing_flag)
    {
328 329
        init_time = PMPI_Wtime() - init_start;
        PMPI_Reduce(&init_time, &init_max, 1,
330
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
331
        if(my_rank == 0)
332
        {
333 334
            darshan_core_fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
            darshan_core_fprintf(stderr, "darshan:init\t%d\t%f\n", nprocs, init_max);
335 336 337 338 339 340
        }
    }

    return;
}

Shane Snyder's avatar
Shane Snyder committed
341
void darshan_core_shutdown()
342
{
343
    struct darshan_core_runtime *final_core;
344
    int internal_timing_flag = 0;
345 346
    struct tm *start_tm;
    time_t start_time_tmp;
347
    int64_t first_start_time;
348
    int64_t last_end_time;
349
    double start_log_time;
350
    double tm_end;
351 352 353
    double open1 = 0, open2 = 0;
    double job1 = 0, job2 = 0;
    double rec1 = 0, rec2 = 0;
354 355
    double mod1[DARSHAN_MAX_MODS] = {0};
    double mod2[DARSHAN_MAX_MODS] = {0};
356
    double header1 = 0, header2 = 0;
357 358 359 360 361 362 363 364 365 366 367 368
    char *logfile_name;
    int local_mod_use[DARSHAN_MAX_MODS] = {0};
    int global_mod_use_count[DARSHAN_MAX_MODS] = {0};
    darshan_record_id *shared_recs;
    darshan_record_id *mod_shared_recs;
    int shared_rec_cnt = 0;
    int ret = 0;
    int all_ret = 0;
    int i;
    uint64_t gz_fp = 0;
    MPI_File log_fh;
    MPI_Status status;
369 370 371 372

    if(getenv("DARSHAN_INTERNAL_TIMING"))
        internal_timing_flag = 1;

373
    /* synchronize before getting start time */
374 375
    PMPI_Barrier(MPI_COMM_WORLD);
    start_log_time = PMPI_Wtime();
376

Shane Snyder's avatar
Shane Snyder committed
377
    /* disable darhan-core while we shutdown */
378
    DARSHAN_CORE_LOCK();
379
    if(!darshan_core)
380
    {
381
        DARSHAN_CORE_UNLOCK();
382 383
        return;
    }
384 385
    final_core = darshan_core;
    darshan_core = NULL;
386 387
    DARSHAN_CORE_UNLOCK();

388
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
389 390 391 392 393
    /* remove the temporary mmap log files */
    /* NOTE: this unlink is not immediate as it must wait for the mapping
     * to no longer be referenced, which in our case happens when the
     * executable exits. If the application terminates mid-shutdown, then
     * there will be no mmap files and no final log file.
394
     */
395
    unlink(final_core->mmap_log_name);
396
#endif
Shane Snyder's avatar
Shane Snyder committed
397

398 399 400
    final_core->log_job_p->end_time = time(NULL);

    /* reduce to report first start and last end time across all ranks at rank 0 */
401
    PMPI_Reduce(&final_core->log_job_p->start_time, &first_start_time,
402
        1, MPI_INT64_T, MPI_MIN, 0, MPI_COMM_WORLD);
403
    PMPI_Reduce(&final_core->log_job_p->end_time, &last_end_time,
404 405
        1, MPI_INT64_T, MPI_MAX, 0, MPI_COMM_WORLD);
    if(my_rank == 0)
Shane Snyder's avatar
Shane Snyder committed
406
    {
407 408
        final_core->log_job_p->start_time = first_start_time;
        final_core->log_job_p->end_time = last_end_time;
Shane Snyder's avatar
Shane Snyder committed
409
    }
410

411
    final_core->comp_buf = malloc(darshan_mod_mem_quota);
412 413 414 415 416
    if(!(final_core->comp_buf))
    {
        darshan_core_cleanup(final_core);
        return;
    }
417

418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435
    logfile_name = malloc(PATH_MAX);
    if(!logfile_name)
    {
        darshan_core_cleanup(final_core);
        return;
    }

    /* set the log file name on rank 0 */
    if(my_rank == 0)
    {
        /* use human readable start time format in log filename */
        start_time_tmp = final_core->log_job_p->start_time;
        start_tm = localtime(&start_time_tmp);

        darshan_get_logfile_name(logfile_name, final_core->log_job_p->jobid, start_tm);
    }

    /* broadcast log file name */
436
    PMPI_Bcast(logfile_name, PATH_MAX, MPI_CHAR, 0,
437 438 439 440 441
        MPI_COMM_WORLD);

    if(strlen(logfile_name) == 0)
    {
        /* failed to generate log file name */
442
        darshan_core_fprintf(stderr, "darshan library warning: unable to determine log file path\n");
443 444 445 446
        free(logfile_name);
        darshan_core_cleanup(final_core);
        return;
    }
447

448 449 450 451 452
    /* set which modules were registered locally, and call into
     * them to disable further instrumentation and to perform any
     * other pre-shutdown steps
     */
    for(i = 0; i < DARSHAN_MAX_MODS; i++)
453
    {
454 455
        if(final_core->mod_array[i])
            local_mod_use[i] = 1;
456 457 458
    }

    /* reduce the number of times a module was opened globally and bcast to everyone */
459
    PMPI_Allreduce(local_mod_use, global_mod_use_count,
460 461 462 463 464 465
        DARSHAN_MAX_MODS, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    /* get a list of records which are shared across all processes */
    darshan_get_shared_records(final_core, &shared_recs, &shared_rec_cnt);

    if(internal_timing_flag)
466
        open1 = PMPI_Wtime();
467 468 469
    /* collectively open the darshan log file */
    ret = darshan_log_open_all(logfile_name, &log_fh);
    if(internal_timing_flag)
470
        open2 = PMPI_Wtime();
471 472

    /* error out if unable to open log file */
473
    PMPI_Allreduce(&ret, &all_ret, 1, MPI_INT,
474 475 476 477 478
        MPI_LOR, MPI_COMM_WORLD);
    if(all_ret != 0)
    {
        if(my_rank == 0)
        {
479
            darshan_core_fprintf(stderr, "darshan library warning: unable to create log file %s\n",
480 481 482 483 484 485 486 487
                logfile_name);
        }
        free(logfile_name);
        darshan_core_cleanup(final_core);
        return;
    }

    if(internal_timing_flag)
488
        job1 = PMPI_Wtime();
489 490 491
    /* rank 0 is responsible for writing the compressed darshan job information */
    if(my_rank == 0)
    {
492
        void *pointers[2] = {final_core->log_job_p, final_core->log_exemnt_p};
493 494 495 496 497 498 499 500
        int lengths[2] = {sizeof(struct darshan_job), strlen(final_core->log_exemnt_p)};
        int comp_buf_sz = 0;

        /* compress the job info and the trailing mount/exe data */
        all_ret = darshan_deflate_buffer(pointers, lengths, 2,
            final_core->comp_buf, &comp_buf_sz);
        if(all_ret)
        {
501
            darshan_core_fprintf(stderr, "darshan library warning: unable to compress job data\n");
502 503 504 505 506 507
            unlink(logfile_name);
        }
        else
        {
            /* write the job information, preallocing space for the log header */
            gz_fp += sizeof(struct darshan_header);
508
            all_ret = PMPI_File_write_at(log_fh, gz_fp,
509 510 511
                final_core->comp_buf, comp_buf_sz, MPI_BYTE, &status);
            if(all_ret != MPI_SUCCESS)
            {
512
                darshan_core_fprintf(stderr,
513 514 515 516 517 518 519 520 521 522
                        "darshan library warning: unable to write job data to log file %s\n",
                        logfile_name);
                unlink(logfile_name);

            }
            gz_fp += comp_buf_sz;
        }
    }

    /* error out if unable to write job information */
523
    PMPI_Bcast(&all_ret, 1, MPI_INT, 0, MPI_COMM_WORLD);
524 525 526 527 528 529 530
    if(all_ret != 0)
    {
        free(logfile_name);
        darshan_core_cleanup(final_core);
        return;
    }
    if(internal_timing_flag)
531
        job2 = PMPI_Wtime();
532 533

    if(internal_timing_flag)
534
        rec1 = PMPI_Wtime();
535
    /* write the record name->id hash to the log file */
536
    final_core->log_hdr_p->name_map.off = gz_fp;
537
    ret = darshan_log_write_name_record_hash(log_fh, final_core, &gz_fp);
538
    final_core->log_hdr_p->name_map.len = gz_fp - final_core->log_hdr_p->name_map.off;
539

540
    /* error out if unable to write the name record hash */
541
    PMPI_Allreduce(&ret, &all_ret, 1, MPI_INT,
542 543 544 545 546
        MPI_LOR, MPI_COMM_WORLD);
    if(all_ret != 0)
    {
        if(my_rank == 0)
        {
547
            darshan_core_fprintf(stderr,
548 549 550 551 552 553 554 555 556
                "darshan library warning: unable to write record hash to log file %s\n",
                logfile_name);
            unlink(logfile_name);
        }
        free(logfile_name);
        darshan_core_cleanup(final_core);
        return;
    }
    if(internal_timing_flag)
557
        rec2 = PMPI_Wtime();
558 559 560 561 562 563 564 565

    mod_shared_recs = malloc(shared_rec_cnt * sizeof(darshan_record_id));
    assert(mod_shared_recs);

    /* loop over globally used darshan modules and:
     *      - get final output buffer
     *      - compress (zlib) provided output buffer
     *      - append compressed buffer to log file
566
     *      - add module map info (file offset/length) to log header
567 568 569 570 571
     *      - shutdown the module
     */
    for(i = 0; i < DARSHAN_MAX_MODS; i++)
    {
        struct darshan_core_module* this_mod = final_core->mod_array[i];
572
        struct darshan_core_name_record_ref *ref = NULL;
573 574 575 576 577 578 579 580 581
        int mod_shared_rec_cnt = 0;
        void* mod_buf = NULL;
        int mod_buf_sz = 0;
        int j;

        if(global_mod_use_count[i] == 0)
        {
            if(my_rank == 0)
            {
582 583
                final_core->log_hdr_p->mod_map[i].off = 0;
                final_core->log_hdr_p->mod_map[i].len = 0;
584 585 586 587 588
            }
            continue;
        }

        if(internal_timing_flag)
589
            mod1[i] = PMPI_Wtime();
590

591
        /* set the shared record list for this module */
592 593
        for(j = 0; j < shared_rec_cnt; j++)
        {
594
            HASH_FIND(hlink, final_core->name_hash, &shared_recs[j],
595 596 597 598 599 600 601 602 603 604 605
                sizeof(darshan_record_id), ref);
            assert(ref);
            if(DARSHAN_MOD_FLAG_ISSET(ref->global_mod_flags, i))
            {
                mod_shared_recs[mod_shared_rec_cnt++] = shared_recs[j];
            }
        }

        /* if module is registered locally, get the corresponding output buffer
         * 
         * NOTE: this function can be used to run collective operations across
606
         * modules, if there are records shared globally.
607 608 609
         */
        if(this_mod)
        {
610 611
            mod_buf = final_core->mod_array[i]->rec_buf_start;
            mod_buf_sz = final_core->mod_array[i]->rec_buf_p - mod_buf;
612
            this_mod->mod_shutdown_func(MPI_COMM_WORLD, mod_shared_recs,
613 614 615 616
                mod_shared_rec_cnt, &mod_buf, &mod_buf_sz);
        }

        /* append this module's data to the darshan log */
617
        final_core->log_hdr_p->mod_map[i].off = gz_fp;
618
        ret = darshan_log_append_all(log_fh, final_core, mod_buf, mod_buf_sz, &gz_fp);
619 620
        final_core->log_hdr_p->mod_map[i].len =
            gz_fp - final_core->log_hdr_p->mod_map[i].off;
621

622 623 624 625 626 627
        /* XXX: DXT manages its own module memory buffers, so we need to
         * explicitly free them
         */
        if(i == DXT_POSIX_MOD || i == DXT_MPIIO_MOD)
            free(mod_buf);

628
        /* error out if the log append failed */
629
        PMPI_Allreduce(&ret, &all_ret, 1, MPI_INT,
630 631 632 633 634
            MPI_LOR, MPI_COMM_WORLD);
        if(all_ret != 0)
        {
            if(my_rank == 0)
            {
635
                darshan_core_fprintf(stderr,
636 637 638 639 640 641 642 643 644 645
                    "darshan library warning: unable to write %s module data to log file %s\n",
                    darshan_module_names[i], logfile_name);
                unlink(logfile_name);
            }
            free(logfile_name);
            darshan_core_cleanup(final_core);
            return;
        }

        if(internal_timing_flag)
646
            mod2[i] = PMPI_Wtime();
647 648 649
    }

    if(internal_timing_flag)
650
        header1 = PMPI_Wtime();
651
    /* write out log header, after running 2 reductions on header variables:
652
     *  1) reduce 'partial_flag' variable to determine which modules ran out
653
     *     of memory for storing data
654 655 656 657 658 659
     *  2) reduce 'mod_ver' array to determine which log format version each
     *     module used for this output log
     */
    if(my_rank == 0)
    {
        /* rank 0 is responsible for writing the log header */
660 661
        final_core->log_hdr_p->comp_type = DARSHAN_ZLIB_COMP;

662
        PMPI_Reduce(
663 664
            MPI_IN_PLACE, &(final_core->log_hdr_p->partial_flag),
            1, MPI_UINT32_T, MPI_BOR, 0, MPI_COMM_WORLD);
665
        PMPI_Reduce(
666 667
            MPI_IN_PLACE, &(final_core->log_hdr_p->mod_ver),
            DARSHAN_MAX_MODS, MPI_UINT32_T, MPI_MAX, 0, MPI_COMM_WORLD);
668

669
        all_ret = PMPI_File_write_at(log_fh, 0, final_core->log_hdr_p,
670 671 672
            sizeof(struct darshan_header), MPI_BYTE, &status);
        if(all_ret != MPI_SUCCESS)
        {
673
            darshan_core_fprintf(stderr, "darshan library warning: unable to write header to log file %s\n",
674 675 676 677
                    logfile_name);
            unlink(logfile_name);
        }
    }
678 679
    else
    {
680
        PMPI_Reduce(
681 682
            &(final_core->log_hdr_p->partial_flag), &(final_core->log_hdr_p->partial_flag),
            1, MPI_UINT32_T, MPI_BOR, 0, MPI_COMM_WORLD);
683
        PMPI_Reduce(
684 685 686
            &(final_core->log_hdr_p->mod_ver), &(final_core->log_hdr_p->mod_ver),
            DARSHAN_MAX_MODS, MPI_UINT32_T, MPI_MAX, 0, MPI_COMM_WORLD);
    }
687 688

    /* error out if unable to write log header */
689
    PMPI_Bcast(&all_ret, 1, MPI_INT, 0, MPI_COMM_WORLD);
690 691 692 693 694 695 696
    if(all_ret != 0)
    {
        free(logfile_name);
        darshan_core_cleanup(final_core);
        return;
    }
    if(internal_timing_flag)
697
        header2 = PMPI_Wtime();
698

699
    PMPI_File_close(&log_fh);
700 701 702 703 704 705 706

    /* if we got this far, there are no errors, so rename from *.darshan_partial
     * to *-<logwritetime>.darshan, which indicates that this log file is
     * complete and ready for analysis
     */
    if(my_rank == 0)
    {
707
        mode_t chmod_mode = S_IRUSR;
708
#ifdef __DARSHAN_GROUP_READABLE_LOGS
709
        chmod_mode |= S_IRGRP;
710
#endif
711 712 713 714

        if(getenv("DARSHAN_LOGFILE"))
        {
            chmod(logfile_name, chmod_mode);
715 716 717 718 719 720 721 722 723 724 725
        }
        else
        {
            char* tmp_index;
            double end_log_time;
            char* new_logfile_name;

            new_logfile_name = malloc(PATH_MAX);
            if(new_logfile_name)
            {
                new_logfile_name[0] = '\0';
726
                end_log_time = PMPI_Wtime();
727 728 729 730 731
                strcat(new_logfile_name, logfile_name);
                tmp_index = strstr(new_logfile_name, ".darshan_partial");
                sprintf(tmp_index, "_%d.darshan", (int)(end_log_time-start_log_time+1));
                rename(logfile_name, new_logfile_name);
                /* set permissions on log file */
732
                chmod(new_logfile_name, chmod_mode);
733 734 735 736 737 738
                free(new_logfile_name);
            }
        }
    }

    free(logfile_name);
739 740
    free(shared_recs);
    free(mod_shared_recs);
741
    darshan_core_cleanup(final_core);
742

743
    if(internal_timing_flag)
744
    {
745 746 747 748 749
        double open_tm, open_slowest;
        double header_tm, header_slowest;
        double job_tm, job_slowest;
        double rec_tm, rec_slowest;
        double mod_tm[DARSHAN_MAX_MODS], mod_slowest[DARSHAN_MAX_MODS];
750 751
        double all_tm, all_slowest;

752
        tm_end = PMPI_Wtime();
753

754 755 756 757
        open_tm = open2 - open1;
        header_tm = header2 - header1;
        job_tm = job2 - job1;
        rec_tm = rec2 - rec1;
758
        all_tm = tm_end - start_log_time;
759 760 761 762
        for(i = 0;i < DARSHAN_MAX_MODS; i++)
        {
            mod_tm[i] = mod2[i] - mod1[i];
        }
763

764
        PMPI_Reduce(&open_tm, &open_slowest, 1,
765
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
766
        PMPI_Reduce(&header_tm, &header_slowest, 1,
767
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
768
        PMPI_Reduce(&job_tm, &job_slowest, 1,
769
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
770
        PMPI_Reduce(&rec_tm, &rec_slowest, 1,
771
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
772
        PMPI_Reduce(&all_tm, &all_slowest, 1,
773
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
774
        PMPI_Reduce(mod_tm, mod_slowest, DARSHAN_MAX_MODS,
775
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
776 777 778

        if(my_rank == 0)
        {
779 780 781 782 783
            darshan_core_fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
            darshan_core_fprintf(stderr, "darshan:log_open\t%d\t%f\n", nprocs, open_slowest);
            darshan_core_fprintf(stderr, "darshan:job_write\t%d\t%f\n", nprocs, job_slowest);
            darshan_core_fprintf(stderr, "darshan:hash_write\t%d\t%f\n", nprocs, rec_slowest);
            darshan_core_fprintf(stderr, "darshan:header_write\t%d\t%f\n", nprocs, header_slowest);
784 785 786
            for(i = 0; i < DARSHAN_MAX_MODS; i++)
            {
                if(global_mod_use_count[i])
787
                    darshan_core_fprintf(stderr, "darshan:%s_shutdown\t%d\t%f\n", darshan_module_names[i],
788 789
                        nprocs, mod_slowest[i]);
            }
790
            darshan_core_fprintf(stderr, "darshan:core_shutdown\t%d\t%f\n", nprocs, all_slowest);
791
        }
792
    }
793

794 795
    return;
}
796

Shane Snyder's avatar
Shane Snyder committed
797
/* *********************************** */
798

799
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
800 801 802 803 804 805 806
static void *darshan_init_mmap_log(struct darshan_core_runtime* core, int jobid)
{
    int ret;
    int mmap_fd;
    int mmap_size;
    int sys_page_size;
    char cuser[L_cuserid] = {0};
807 808 809
    uint64_t hlevel;
    char hname[HOST_NAME_MAX];
    uint64_t logmod;
810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829
    char *envstr;
    char *mmap_log_path;
    void *mmap_p;

    sys_page_size = sysconf(_SC_PAGESIZE);
    assert(sys_page_size > 0);

    mmap_size = sizeof(struct darshan_header) + DARSHAN_JOB_RECORD_SIZE +
        + DARSHAN_NAME_RECORD_BUF_SIZE + darshan_mod_mem_quota;
    if(mmap_size % sys_page_size)
        mmap_size = ((mmap_size / sys_page_size) + 1) * sys_page_size;

    envstr = getenv(DARSHAN_MMAP_LOG_PATH_OVERRIDE);
    if(envstr)
        mmap_log_path = envstr;
    else
        mmap_log_path = DARSHAN_DEF_MMAP_LOG_PATH;

    darshan_get_user_name(cuser);

830 831 832 833 834 835 836
    /* generate a random number to help differentiate the temporary log */
    /* NOTE: job id is not sufficient for constructing a unique log file name,
     * since a job could be composed of multiple application runs, so we also
     * add a random number component to the log name
     */
    if(my_rank == 0)
    {
837
        hlevel=PMPI_Wtime() * 1000000;
838 839 840
        (void)gethostname(hname, sizeof(hname));
        logmod = darshan_hash((void*)hname,strlen(hname),hlevel);
    }
841
    PMPI_Bcast(&logmod, 1, MPI_UINT64_T, 0, MPI_COMM_WORLD);
842

843 844 845 846
    /* construct a unique temporary log file name for this process
     * to write mmap log data to
     */
    snprintf(core->mmap_log_name, PATH_MAX,
847 848
        "/%s/%s_%s_id%d_mmap-log-%" PRIu64 "-%d.darshan",
        mmap_log_path, cuser, __progname, jobid, logmod, my_rank);
849 850 851 852 853

    /* create the temporary mmapped darshan log */
    mmap_fd = open(core->mmap_log_name, O_CREAT|O_RDWR|O_EXCL , 0644);
    if(mmap_fd < 0)
    {
854
        darshan_core_fprintf(stderr, "darshan library warning: "
855 856 857 858 859 860 861 862 863
            "unable to create darshan log file %s\n", core->mmap_log_name);
        return(NULL);
    }

    /* TODO: ftruncate or just zero fill? */
    /* allocate the necessary space in the log file */
    ret = ftruncate(mmap_fd, mmap_size);
    if(ret < 0)
    {
864
        darshan_core_fprintf(stderr, "darshan library warning: "
865 866 867 868 869 870 871 872 873 874 875 876
            "unable to allocate darshan log file %s\n", core->mmap_log_name);
        close(mmap_fd);
        unlink(core->mmap_log_name);
        return(NULL);
    }

    /* create the memory map for darshan's data structures so they are
     * persisted to file as the application executes
     */
    mmap_p = mmap(NULL, mmap_size, PROT_WRITE, MAP_SHARED, mmap_fd, 0);
    if(mmap_p == MAP_FAILED)
    {
877
        darshan_core_fprintf(stderr, "darshan library warning: "
878 879 880 881 882 883 884 885 886 887 888
            "unable to mmap darshan log file %s\n", core->mmap_log_name);
        close(mmap_fd);
        unlink(core->mmap_log_name);
        return(NULL);
    }

    /* close darshan log file (this does *not* unmap the log file) */
    close(mmap_fd);

    return(mmap_p);
}
889
#endif
890

891
/* record any hints used to write the darshan log in the job data */
892
static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* core)
893 894
{
    char* hints;
895
    char* job_hints;
896 897 898 899 900 901
    int meta_remain = 0;
    char* m;

    /* check environment variable to see if the default MPI file hints have
     * been overridden
     */
902
    hints = getenv(DARSHAN_LOG_HINTS_OVERRIDE);
903 904
    if(!hints)
    {
905
        hints = __DARSHAN_LOG_HINTS;
906 907 908 909 910
    }

    if(!hints || strlen(hints) < 1)
        return;

911 912
    job_hints = strdup(hints);
    if(!job_hints)
913 914 915
        return;

    meta_remain = DARSHAN_JOB_METADATA_LEN -
916
        strlen(core->log_job_p->metadata) - 1;
917 918
    if(meta_remain >= (strlen(PACKAGE_VERSION) + 9))
    {
919
        sprintf(core->log_job_p->metadata, "lib_ver=%s\n", PACKAGE_VERSION);
920 921
        meta_remain -= (strlen(PACKAGE_VERSION) + 9);
    }
922
    if(meta_remain >= (3 + strlen(job_hints)))
923
    {
924
        m = core->log_job_p->metadata + strlen(core->log_job_p->metadata);
925
        /* We have room to store the hints in the metadata portion of
926
         * the job structure.  We just prepend an h= to the hints list.  The
927 928 929
         * metadata parser will ignore = characters that appear in the value
         * portion of the metadata key/value pair.
         */
930
        sprintf(m, "h=%s\n", job_hints);
931
    }
932
    free(job_hints);
933 934 935 936

    return;
}

937 938 939 940 941 942 943 944 945 946 947 948 949 950
/* comparison function over two struct mnt_data entries, taking the
 * (const void*, const void*) arguments a qsort()-style sort passes in
 *
 * entries are ordered by the length of their path, longest first:
 * returns -1 if a's path is longer than b's, 1 if it is shorter, and
 * 0 if both paths have the same length
 */
static int mnt_data_cmp(const void* a, const void* b)
{
    const struct mnt_data *lhs = (const struct mnt_data*)a;
    const struct mnt_data *rhs = (const struct mnt_data*)b;
    size_t lhs_len = strlen(lhs->path);
    size_t rhs_len = strlen(rhs->path);

    if(lhs_len == rhs_len)
        return(0);

    return((lhs_len > rhs_len) ? -1 : 1);
}

/* adds an entry to table of mounted file systems */
951
static void add_entry(char* buf, int* space_left, struct mntent* entry)
952
{
953
    int i;
954 955 956 957
    int ret;
    char tmp_mnt[256];
    struct statfs statfsbuf;

958 959 960 961 962 963 964 965 966 967
    /* avoid adding the same mount points multiple times -- to limit
     * storage space and potential statfs, ioctl, etc calls
     */
    for(i = 0; i < mnt_data_count; i++)
    {
        if((strncmp(mnt_data_array[i].path, entry->mnt_dir, DARSHAN_MAX_MNT_PATH) == 0) &&
           (strncmp(mnt_data_array[i].type, entry->mnt_type, DARSHAN_MAX_MNT_PATH) == 0))
            return;
    }

968 969 970 971 972 973 974 975 976 977 978 979
    strncpy(mnt_data_array[mnt_data_count].path, entry->mnt_dir,
        DARSHAN_MAX_MNT_PATH-1);
    strncpy(mnt_data_array[mnt_data_count].type, entry->mnt_type,
        DARSHAN_MAX_MNT_TYPE-1);
    /* NOTE: we now try to detect the preferred block size for each file 
     * system using fstatfs().  On Lustre we assume a size of 1 MiB 
     * because fstatfs() reports 4 KiB. 
     */
#ifndef LL_SUPER_MAGIC
#define LL_SUPER_MAGIC 0x0BD00BD0
#endif
    ret = statfs(entry->mnt_dir, &statfsbuf);
980
    mnt_data_array[mnt_data_count].fs_info.fs_type = statfsbuf.f_type;
981
    if(ret == 0 && statfsbuf.f_type != LL_SUPER_MAGIC)
982
        mnt_data_array[mnt_data_count].fs_info.block_size = statfsbuf.f_bsize;
983
    else if(ret == 0 && statfsbuf.f_type == LL_SUPER_MAGIC)
984
        mnt_data_array[mnt_data_count].fs_info.block_size = 1024*1024;
985
    else
986
        mnt_data_array[mnt_data_count].fs_info.block_size = 4096;
987

988
#ifdef DARSHAN_LUSTRE
989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007
    /* attempt to retrieve OST and MDS counts from Lustre */
    mnt_data_array[mnt_data_count].fs_info.ost_count = -1;
    mnt_data_array[mnt_data_count].fs_info.mdt_count = -1;
    if ( statfsbuf.f_type == LL_SUPER_MAGIC )
    {
        int n_ost, n_mdt;
        int ret_ost, ret_mdt;
        DIR *mount_dir;

        mount_dir = opendir( entry->mnt_dir );
        if ( mount_dir  ) 
        {
            /* n_ost and n_mdt are used for both input and output to ioctl */
            n_ost = 0;
            n_mdt = 1;

            ret_ost = ioctl( dirfd(mount_dir), LL_IOC_GETOBDCOUNT, &n_ost );
            ret_mdt = ioctl( dirfd(mount_dir), LL_IOC_GETOBDCOUNT, &n_mdt );

1008
            if ( !(ret_ost < 0 || ret_mdt < 0) )
1009 1010 1011 1012 1013 1014 1015
            {
                mnt_data_array[mnt_data_count].fs_info.ost_count = n_ost;
                mnt_data_array[mnt_data_count].fs_info.mdt_count = n_mdt;
            }
            closedir( mount_dir );
        }
    }
1016
#endif
1017

1018
    /* store mount information with the job-level metadata in darshan log */
1019
    ret = snprintf(tmp_mnt, 256, "\n%s\t%s",
1020 1021 1022
        entry->mnt_type, entry->mnt_dir);
    if(ret < 256 && strlen(tmp_mnt) <= (*space_left))
    {
1023
        strcat(buf, tmp_mnt);
1024 1025 1026 1027 1028 1029 1030
        (*space_left) -= strlen(tmp_mnt);
    }

    mnt_data_count++;
    return;
}

1031
/* darshan_get_exe_and_mounts()
1032 1033
 *
 * collects command line and list of mounted file systems into a string that
1034
 * will be stored with the job-level metadata
1035
 */
1036
static void darshan_get_exe_and_mounts(struct darshan_core_runtime *core,
1037
    int argc, char **argv)
1038 1039 1040 1041
{
    FILE* tab;
    struct mntent *entry;
    char* exclude;
1042 1043 1044
    char* truncate_string = "<TRUNCATED>";
    int truncate_offset;
    int space_left = DARSHAN_EXE_LEN;
1045 1046 1047
    FILE *fh;
    int i, ii;
    char cmdl[DARSHAN_EXE_LEN];
1048 1049
    int tmp_index = 0;
    int skip = 0;
1050 1051 1052
    char* env_exclusions;
    char* string;
    char* token;
1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071

    /* skip these fs types */
    static char* fs_exclusions[] = {
        "tmpfs",
        "proc",
        "sysfs",
        "devpts",
        "binfmt_misc",
        "fusectl",
        "debugfs",
        "securityfs",
        "nfsd",
        "none",
        "rpc_pipefs",
        "hugetlbfs",
        "cgroup",
        NULL
    };

1072 1073 1074 1075 1076 1077 1078 1079 1080
    /* Check if user has set the env variable DARSHAN_EXCLUDE_DIRS */
    env_exclusions = getenv("DARSHAN_EXCLUDE_DIRS");
    if(env_exclusions)
    {
        fs_exclusions[0]=NULL;
        /* if DARSHAN_EXCLUDE_DIRS=none, do not exclude any dir */
        if(strncmp(env_exclusions,"none",strlen(env_exclusions))>=0)
        {
            if (my_rank == 0) 
1081
                darshan_core_fprintf(stderr, "Darshan info: no system dirs will be excluded\n");
1082 1083 1084 1085 1086
            darshan_path_exclusions[0]=NULL;
        }
        else
        {
            if (my_rank == 0) 
1087
                darshan_core_fprintf(stderr, "Darshan info: the following system dirs will be excluded: %s\n",
Shane Snyder's avatar
Shane Snyder committed
1088
                    env_exclusions);
1089
            string = strdup(env_exclusions);
1090 1091
            i = 0;
            /* get the comma separated number of directories */
1092 1093
            token = strtok(string, ",");
            while (token != NULL)
1094
            {
1095 1096
                i++;
                token = strtok(NULL, ",");
1097 1098
            }
            user_darshan_path_exclusions=(char **)malloc((i+1)*sizeof(char *));
1099 1100
            assert(user_darshan_path_exclusions);

1101
            i = 0;
1102 1103 1104
            strcpy(string, env_exclusions);
            token = strtok(string, ",");
            while (token != NULL)
1105 1106
            {
                user_darshan_path_exclusions[i]=(char *)malloc(strlen(token)+1);
1107
                assert(user_darshan_path_exclusions[i]);
1108 1109
                strcpy(user_darshan_path_exclusions[i],token);
                i++;
1110
                token = strtok(NULL, ",");
1111 1112
            }
            user_darshan_path_exclusions[i]=NULL;
1113
            free(string);
1114 1115 1116
        }
    }

1117 1118 1119
    /* record exe and arguments */
    for(i=0; i<argc; i++)
    {
1120 1121
        strncat(core->log_exemnt_p, argv[i], space_left);
        space_left = DARSHAN_EXE_LEN-strlen(core->log_exemnt_p);
1122 1123
        if(i <