darshan-core.c 39.7 KB
Newer Older
1
/*
Shane Snyder's avatar
Shane Snyder committed
2
3
4
 * Copyright (C) 2015 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
5
6
 */

7
#define _XOPEN_SOURCE 500
8
#define _GNU_SOURCE
9

10
11
12
13
14
15
#include "darshan-runtime-config.h"

#include <stdio.h>
#ifdef HAVE_MNTENT_H
#include <mntent.h>
#endif
16
17
18
19
20
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include <pthread.h>
21
#include <fcntl.h>
22
23
#include <sys/types.h>
#include <sys/stat.h>
24
#include <sys/mman.h>
25
#include <sys/vfs.h>
26
#include <zlib.h>
27
#include <mpi.h>
28
#include <assert.h>
29

30
#include "uthash.h"
Shane Snyder's avatar
Shane Snyder committed
31
#include "darshan.h"
32
#include "darshan-core.h"
Shane Snyder's avatar
Shane Snyder committed
33
#include "darshan-dynamic.h"
34

35
extern char* __progname;
36
extern char* __progname_full;
37

38
/* internal variable delcarations */
39
static struct darshan_core_runtime *darshan_core = NULL;
40
static pthread_mutex_t darshan_core_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
41
static int my_rank = -1;
42
static int nprocs = -1;
43
static int darshan_mem_alignment = 1;
44

45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/* paths prefixed with the following directories are not traced by darshan */
char *darshan_path_exclusions[] = {
    "/etc/", "/dev/", "/usr/", "/bin/", "/boot/",
    "/lib/", "/opt/", "/sbin/", "/sys/", "/proc/",
    NULL /* sentinel: marks the end of the list */
};

60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#ifdef DARSHAN_BGQ
extern void bgq_runtime_initialize();
#endif

/* array of init functions for modules which need to be statically
 * initialized by darshan at startup time
 */
void (*mod_static_init_fns[])(void) =
{
    /* entries are invoked in order until the NULL terminator is reached */
#ifdef DARSHAN_BGQ
    bgq_runtime_initialize,
#endif
    NULL
};

Shane Snyder's avatar
Shane Snyder committed
75
76
77
#define DARSHAN_CORE_LOCK() pthread_mutex_lock(&darshan_core_mutex)
#define DARSHAN_CORE_UNLOCK() pthread_mutex_unlock(&darshan_core_mutex)

78
79
80
81
82
83
/* FS mount information */
#define DARSHAN_MAX_MNTS 64
#define DARSHAN_MAX_MNT_PATH 256
#define DARSHAN_MAX_MNT_TYPE 32

/* one record per mounted file system that darshan keeps track of */
struct mnt_data
{
    int block_size;                  /* block size recorded for this mount */
    char path[DARSHAN_MAX_MNT_PATH]; /* mount point directory */
    char type[DARSHAN_MAX_MNT_TYPE]; /* file system type string */
};

/* table of known mounts; only the first mnt_data_count entries are valid */
static struct mnt_data mnt_data_array[DARSHAN_MAX_MNTS];
static int mnt_data_count = 0;

91
92
93
94
/* prototypes for internal helper functions */
static void darshan_get_logfile_name(
    char* logfile_name, int jobid, struct tm* start_tm);
static void darshan_log_record_hints_and_ver(
95
96
    struct darshan_core_runtime* core);
static void darshan_get_exe_and_mounts_root(
97
98
99
    struct darshan_core_runtime *core, int argc, char **argv);
static void darshan_get_exe_and_mounts(
    struct darshan_core_runtime *core, int argc, char **argv);
100
101
static void darshan_block_size_from_path(
    const char *path, int *block_size);
102
static void darshan_get_shared_records(
103
104
    struct darshan_core_runtime *core, darshan_record_id **shared_recs,
    int *shared_rec_cnt);
105
static int darshan_log_open_all(
106
    char *logfile_name, MPI_File *log_fh);
107
static int darshan_deflate_buffer(
Shane Snyder's avatar
Shane Snyder committed
108
109
    void **pointers, int *lengths, int count, char *comp_buf,
    int *comp_buf_length);
110
static int darshan_log_write_record_hash(
111
    MPI_File log_fh, struct darshan_core_runtime *core,
112
113
114
    uint64_t *inout_off);
static int darshan_log_append_all(
    MPI_File log_fh, struct darshan_core_runtime *core, void *buf,
Shane Snyder's avatar
Shane Snyder committed
115
    int count, uint64_t *inout_off);
Shane Snyder's avatar
Shane Snyder committed
116
117
static void darshan_core_cleanup(
    struct darshan_core_runtime* core);
118

119
120
/* *********************************** */

Shane Snyder's avatar
Shane Snyder committed
121
void darshan_core_initialize(int argc, char **argv)
122
{
123
    struct darshan_core_runtime *init_core = NULL;
124
125
    int internal_timing_flag = 0;
    double init_start, init_time, init_max;
126
    char mmap_log_name[PATH_MAX];
127
128
129
    int mmap_fd;
    int mmap_size;
    int sys_page_size;
130
    char *envstr;
131
132
    char *jobid_str;
    int jobid;
133
134
    int ret;
    int tmpval;
135
    int i;
136
137

    DARSHAN_MPI_CALL(PMPI_Comm_size)(MPI_COMM_WORLD, &nprocs);
138
    DARSHAN_MPI_CALL(PMPI_Comm_rank)(MPI_COMM_WORLD, &my_rank);
139
140
141
142

    if(getenv("DARSHAN_INTERNAL_TIMING"))
        internal_timing_flag = 1;

143
    if(internal_timing_flag)
144
145
146
        init_start = DARSHAN_MPI_CALL(PMPI_Wtime)();

    /* setup darshan runtime if darshan is enabled and hasn't been initialized already */
147
    if(!getenv("DARSHAN_DISABLE") && !darshan_core)
148
    {
149
        #if (__DARSHAN_MEM_ALIGNMENT < 1)
150
151
            #error Darshan must be configured with a positive value for --with-mem-align
        #endif
152
        envstr = getenv(DARSHAN_MEM_ALIGNMENT_OVERRIDE);
153
154
155
156
157
158
159
160
161
162
163
        if(envstr)
        {
            ret = sscanf(envstr, "%d", &tmpval);
            /* silently ignore if the env variable is set poorly */
            if(ret == 1 && tmpval > 0)
            {
                darshan_mem_alignment = tmpval;
            }
        }
        else
        {
164
            darshan_mem_alignment = __DARSHAN_MEM_ALIGNMENT;
165
166
167
168
169
170
171
        }

        /* avoid floating point errors on faulty input */
        if (darshan_mem_alignment < 1)
        {
            darshan_mem_alignment = 1;
        }
172

173
174
175
        /* allocate structure to track darshan core runtime information */
        init_core = malloc(sizeof(*init_core));
        if(init_core)
176
        {
177
178
179
            memset(init_core, 0, sizeof(*init_core));
            init_core->wtime_offset = DARSHAN_MPI_CALL(PMPI_Wtime)();

180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
            /* Use DARSHAN_JOBID_OVERRIDE for the env var for __DARSHAN_JOBID */
            envstr = getenv(DARSHAN_JOBID_OVERRIDE);
            if(!envstr)
            {
                envstr = __DARSHAN_JOBID;
            }

            /* find a job id */
            jobid_str = getenv(envstr);
            if(jobid_str)
            {
                /* in cobalt we can find it in env var */
                ret = sscanf(jobid_str, "%d", &jobid);
            }
            if(!jobid_str || ret != 1)
            {
                /* use pid as fall back */
                jobid = getpid();
            }

200
201
            sys_page_size = sysconf(_SC_PAGESIZE);
            assert(sys_page_size > 0);
202

203
204
            /* XXX: MMAP */
            mmap_size = sizeof(struct darshan_header) + DARSHAN_JOB_RECORD_SIZE + DARSHAN_MOD_MEM_MAX;
205
206
            if(mmap_size % sys_page_size)
                mmap_size = ((mmap_size / sys_page_size) + 1) * sys_page_size;
207

208
209
210
211
212
213
214
            /* construct a unique temporary log file name for this process
             * to write mmap log data to
             */
            snprintf(mmap_log_name, PATH_MAX, "/tmp/darshan_job%d.%d",
                jobid, my_rank);

            /* create the temporary mmapped darshan log */
215
216
            mmap_fd = open(mmap_log_name, O_CREAT|O_RDWR|O_EXCL , 0644);
            if(mmap_fd < 0)
217
            {
218
219
220
221
                fprintf(stderr, "darshan library warning: "
                    "unable to create darshan log file %s\n", mmap_log_name);
                free(init_core);
                return;
222
223
            }

224
            /* TODO: what's more expensive? truncate or write zeros? perf test this call and later accesses */
225
226
227
228
229
230
231
232
233
234
235
236
237
238
            /* allocate the necessary space in the log file */
            ret = ftruncate(mmap_fd, mmap_size);
            if(ret < 0)
            {
                fprintf(stderr, "darshan library warning: "
                    "unable to allocate darshan log file %s\n", mmap_log_name);
                free(init_core);
                close(mmap_fd);
                unlink(mmap_log_name);
                return;
            }

            /* memory map buffers for getting at least some summary i/o data
             * into a log file if darshan does not shut down properly
239
             */
240
241
242
            init_core->mmap_p = mmap(NULL, mmap_size, PROT_WRITE, MAP_SHARED,
                mmap_fd, 0);
            if(init_core->mmap_p == MAP_FAILED)
243
            {
244
245
246
247
248
249
                fprintf(stderr, "darshan library warning: "
                    "unable to mmap darshan log file %s\n", mmap_log_name);
                free(init_core);
                close(mmap_fd);
                unlink(mmap_log_name);
                return;
250
251
            }

252
253
254
            /* close darshan log file (this does *not* unmap the log file) */
            close(mmap_fd);

255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
            /* set the memory pointers for each log file region */
            init_core->log_hdr_p = (struct darshan_header *)
                (init_core->mmap_p);
            init_core->log_job_p = (struct darshan_job *)
                (init_core->log_hdr_p + sizeof(struct darshan_header));
            init_core->log_exemnt_p = (char *)
                (((char *)init_core->log_job_p) + sizeof(struct darshan_job));
            /* TODO: file hash & module memory */

            /* XXX: MMAP */

            /* set known header fields for the log file */
            strcpy(init_core->log_hdr_p->version_string, DARSHAN_LOG_VERSION);
            init_core->log_hdr_p->magic_nr = DARSHAN_MAGIC_NR;
            init_core->log_hdr_p->comp_type = DARSHAN_NO_COMP;
270

271
272
273
274
275
            /* set known job-level metadata fields for the log file */
            init_core->log_job_p->uid = getuid();
            init_core->log_job_p->start_time = time(NULL);
            init_core->log_job_p->nprocs = nprocs;
            init_core->log_job_p->jobid = (int64_t)jobid;
276
277
278
279
280
281

            /* if we are using any hints to write the log file, then record those
             * hints with the darshan job information
             */
            darshan_log_record_hints_and_ver(init_core);

282
            /* collect information about command line and mounted file systems */
283
            darshan_get_exe_and_mounts(init_core, argc, argv);
284

Shane Snyder's avatar
Shane Snyder committed
285
            /* bootstrap any modules with static initialization routines */
286
287
288
289
290
291
292
293
            i = 0;
            while(mod_static_init_fns[i])
            {
                (*mod_static_init_fns[i])();
                i++;
            }

            darshan_core = init_core;
294
        }
295
296
    }

297
298
299
300
301
    if(internal_timing_flag)
    {
        init_time = DARSHAN_MPI_CALL(PMPI_Wtime)() - init_start;
        DARSHAN_MPI_CALL(PMPI_Reduce)(&init_time, &init_max, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
302
        if(my_rank == 0)
303
        {
304
305
            fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
            fprintf(stderr, "darshan:init\t%d\t%f\n", nprocs, init_max);
306
307
308
309
310
311
        }
    }

    return;
}

Shane Snyder's avatar
Shane Snyder committed
312
void darshan_core_shutdown()
313
{
314
    int i;
315
    struct darshan_core_runtime *final_core;
316
    int internal_timing_flag = 0;
317
    double start_log_time;
318
    double tm_end;
319
320
321
322

    if(getenv("DARSHAN_INTERNAL_TIMING"))
        internal_timing_flag = 1;

323
324
    if(internal_timing_flag)
        start_log_time = DARSHAN_MPI_CALL(PMPI_Wtime)();
325

Shane Snyder's avatar
Shane Snyder committed
326
    /* disable darhan-core while we shutdown */
327
    DARSHAN_CORE_LOCK();
328
    if(!darshan_core)
329
    {
330
        DARSHAN_CORE_UNLOCK();
331
332
        return;
    }
333
334
    final_core = darshan_core;
    darshan_core = NULL;
Shane Snyder's avatar
Shane Snyder committed
335
336
337
338
339

    for(i = 0; i < DARSHAN_MAX_MODS; i++)
    {
        if(final_core->mod_array[i])
        {
340
            final_core->mod_array[i]->mod_funcs.begin_shutdown();
Shane Snyder's avatar
Shane Snyder committed
341
342
        }
    }
343
    DARSHAN_CORE_UNLOCK();
344

345
    final_core->log_job_p->end_time = time(NULL);
346

347
    darshan_core_cleanup(final_core);
348

349
    if(internal_timing_flag)
350
    {
351
352
353
354
355
356
357
358
359
360
361
        double all_tm, all_slowest;

        tm_end = DARSHAN_MPI_CALL(PMPI_Wtime)();

        all_tm = tm_end - start_log_time;

        DARSHAN_MPI_CALL(PMPI_Reduce)(&all_tm, &all_slowest, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

        if(my_rank == 0)
        {
362
363
            fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
            fprintf(stderr, "darshan:core_shutdown\t%d\t%f\n", nprocs, all_slowest);
364
        }
365
    }
366

367
368
    return;
}
369

Shane Snyder's avatar
Shane Snyder committed
370
/* *********************************** */
371

372
/* construct the darshan log file name */
373
static void darshan_get_logfile_name(char* logfile_name, int jobid, struct tm* start_tm)
374
{
Shane Snyder's avatar
Shane Snyder committed
375
    char* user_logfile_name;
376
377
378
    char* logpath;
    char* logname_string;
    char* logpath_override = NULL;
379
#ifdef __DARSHAN_LOG_ENV
380
381
382
383
384
385
386
387
388
    char env_check[256];
    char* env_tok;
#endif
    uint64_t hlevel;
    char hname[HOST_NAME_MAX];
    uint64_t logmod;
    char cuser[L_cuserid] = {0};
    int ret;

Shane Snyder's avatar
Shane Snyder committed
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
    /* first, check if user specifies a complete logpath to use */
    user_logfile_name = getenv("DARSHAN_LOGFILE");
    if(user_logfile_name)
    {
        if(strlen(user_logfile_name) >= (PATH_MAX-1))
        {
            fprintf(stderr, "darshan library warning: user log file name too long.\n");
            logfile_name[0] = '\0';
        }
        else
        {
            strcpy(logfile_name, user_logfile_name);
        }
    }
    else
404
    {
Shane Snyder's avatar
Shane Snyder committed
405
406
        /* otherwise, generate the log path automatically */

407
408
        /* Use DARSHAN_LOG_PATH_OVERRIDE for the value or __DARSHAN_LOG_PATH */
        logpath = getenv(DARSHAN_LOG_PATH_OVERRIDE);
Shane Snyder's avatar
Shane Snyder committed
409
410
        if(!logpath)
        {
411
412
#ifdef __DARSHAN_LOG_PATH
            logpath = __DARSHAN_LOG_PATH;
413
#endif
Shane Snyder's avatar
Shane Snyder committed
414
        }
415

Shane Snyder's avatar
Shane Snyder committed
416
417
418
419
420
421
422
423
424
425
        /* get the username for this job.  In order we will try each of the
         * following until one of them succeeds:
         *
         * - cuserid()
         * - getenv("LOGNAME")
         * - snprintf(..., geteuid());
         *
         * Note that we do not use getpwuid() because it generally will not
         * work in statically compiled binaries.
         */
426
427

#ifndef DARSHAN_DISABLE_CUSERID
Shane Snyder's avatar
Shane Snyder committed
428
        cuserid(cuser);
429
430
#endif

Shane Snyder's avatar
Shane Snyder committed
431
432
        /* if cuserid() didn't work, then check the environment */
        if(strcmp(cuser, "") == 0)
433
        {
Shane Snyder's avatar
Shane Snyder committed
434
435
436
437
438
            logname_string = getenv("LOGNAME");
            if(logname_string)
            {
                strncpy(cuser, logname_string, (L_cuserid-1));
            }
439
440
        }

Shane Snyder's avatar
Shane Snyder committed
441
442
443
444
445
446
        /* if cuserid() and environment both fail, then fall back to uid */
        if(strcmp(cuser, "") == 0)
        {
            uid_t uid = geteuid();
            snprintf(cuser, sizeof(cuser), "%u", uid);
        }
447

Shane Snyder's avatar
Shane Snyder committed
448
449
450
451
        /* generate a random number to help differentiate the log */
        hlevel=DARSHAN_MPI_CALL(PMPI_Wtime)() * 1000000;
        (void)gethostname(hname, sizeof(hname));
        logmod = darshan_hash((void*)hname,strlen(hname),hlevel);
452

Shane Snyder's avatar
Shane Snyder committed
453
454
455
456
        /* see if darshan was configured using the --with-logpath-by-env
         * argument, which allows the user to specify an absolute path to
         * place logs via an env variable.
         */
457
#ifdef __DARSHAN_LOG_ENV
Shane Snyder's avatar
Shane Snyder committed
458
        /* just silently skip if the environment variable list is too big */
459
        if(strlen(__DARSHAN_LOG_ENV) < 256)
460
        {
Shane Snyder's avatar
Shane Snyder committed
461
            /* copy env variable list to a temporary buffer */
462
            strcpy(env_check, __DARSHAN_LOG_ENV);
Shane Snyder's avatar
Shane Snyder committed
463
464
465
            /* tokenize the comma-separated list */
            env_tok = strtok(env_check, ",");
            if(env_tok)
466
            {
Shane Snyder's avatar
Shane Snyder committed
467
                do
468
                {
Shane Snyder's avatar
Shane Snyder committed
469
470
471
472
473
474
475
476
477
                    /* check each env variable in order */
                    logpath_override = getenv(env_tok);
                    if(logpath_override)
                    {
                        /* stop as soon as we find a match */
                        break;
                    }
                }while((env_tok = strtok(NULL, ",")));
            }
478
479
480
        }
#endif

Shane Snyder's avatar
Shane Snyder committed
481
        if(logpath_override)
482
        {
Shane Snyder's avatar
Shane Snyder committed
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
            ret = snprintf(logfile_name, PATH_MAX,
                "%s/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
                logpath_override,
                cuser, __progname, jobid,
                (start_tm->tm_mon+1),
                start_tm->tm_mday,
                (start_tm->tm_hour*60*60 + start_tm->tm_min*60 + start_tm->tm_sec),
                logmod);
            if(ret == (PATH_MAX-1))
            {
                /* file name was too big; squish it down */
                snprintf(logfile_name, PATH_MAX,
                    "%s/id%d.darshan_partial",
                    logpath_override, jobid);
            }
498
        }
Shane Snyder's avatar
Shane Snyder committed
499
        else if(logpath)
500
        {
Shane Snyder's avatar
Shane Snyder committed
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
            ret = snprintf(logfile_name, PATH_MAX,
                "%s/%d/%d/%d/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
                logpath, (start_tm->tm_year+1900),
                (start_tm->tm_mon+1), start_tm->tm_mday,
                cuser, __progname, jobid,
                (start_tm->tm_mon+1),
                start_tm->tm_mday,
                (start_tm->tm_hour*60*60 + start_tm->tm_min*60 + start_tm->tm_sec),
                logmod);
            if(ret == (PATH_MAX-1))
            {
                /* file name was too big; squish it down */
                snprintf(logfile_name, PATH_MAX,
                    "%s/id%d.darshan_partial",
                    logpath, jobid);
            }
        }
        else
        {
            logfile_name[0] = '\0';
521
522
523
524
        }
    }

    return;
525
526
}

527
/* record any hints used to write the darshan log in the job data */
528
static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* core)
529
530
{
    char* hints;
531
    char* job_hints;
532
533
534
535
536
537
    int meta_remain = 0;
    char* m;

    /* check environment variable to see if the default MPI file hints have
     * been overridden
     */
538
    hints = getenv(DARSHAN_LOG_HINTS_OVERRIDE);
539
540
    if(!hints)
    {
541
        hints = __DARSHAN_LOG_HINTS;
542
543
544
545
546
    }

    if(!hints || strlen(hints) < 1)
        return;

547
548
    job_hints = strdup(hints);
    if(!job_hints)
549
550
551
        return;

    meta_remain = DARSHAN_JOB_METADATA_LEN -
552
        strlen(core->log_job_p->metadata) - 1;
553
554
    if(meta_remain >= (strlen(PACKAGE_VERSION) + 9))
    {
555
        sprintf(core->log_job_p->metadata, "lib_ver=%s\n", PACKAGE_VERSION);
556
557
        meta_remain -= (strlen(PACKAGE_VERSION) + 9);
    }
558
    if(meta_remain >= (3 + strlen(job_hints)))
559
    {
560
        m = core->log_job_p->metadata + strlen(core->log_job_p->metadata);
561
        /* We have room to store the hints in the metadata portion of
562
         * the job structure.  We just prepend an h= to the hints list.  The
563
564
565
         * metadata parser will ignore = characters that appear in the value
         * portion of the metadata key/value pair.
         */
566
        sprintf(m, "h=%s\n", job_hints);
567
    }
568
    free(job_hints);
569
570
571
572

    return;
}

573
574
575
576
577
578
579
580
581
582
583
584
585
586
/* qsort() comparator for struct mnt_data: orders entries by descending
 * path length (longest path first); equal lengths compare as equal
 */
static int mnt_data_cmp(const void* a, const void* b)
{
    const struct mnt_data *lhs = a;
    const struct mnt_data *rhs = b;
    size_t lhs_len = strlen(lhs->path);
    size_t rhs_len = strlen(rhs->path);

    /* -1 when lhs is longer, 1 when it is shorter, 0 otherwise */
    return (lhs_len < rhs_len) - (lhs_len > rhs_len);
}

/* adds an entry to table of mounted file systems */
587
static void add_entry(char* buf, int* space_left, struct mntent *entry)
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
{
    int ret;
    char tmp_mnt[256];
    struct statfs statfsbuf;

    strncpy(mnt_data_array[mnt_data_count].path, entry->mnt_dir,
        DARSHAN_MAX_MNT_PATH-1);
    strncpy(mnt_data_array[mnt_data_count].type, entry->mnt_type,
        DARSHAN_MAX_MNT_TYPE-1);
    /* NOTE: we now try to detect the preferred block size for each file 
     * system using fstatfs().  On Lustre we assume a size of 1 MiB 
     * because fstatfs() reports 4 KiB. 
     */
#ifndef LL_SUPER_MAGIC
#define LL_SUPER_MAGIC 0x0BD00BD0
#endif
    ret = statfs(entry->mnt_dir, &statfsbuf);
    if(ret == 0 && statfsbuf.f_type != LL_SUPER_MAGIC)
        mnt_data_array[mnt_data_count].block_size = statfsbuf.f_bsize;
    else if(ret == 0 && statfsbuf.f_type == LL_SUPER_MAGIC)
        mnt_data_array[mnt_data_count].block_size = 1024*1024;
    else
        mnt_data_array[mnt_data_count].block_size = 4096;

612
    /* store mount information with the job-level metadata in darshan log */
613
    ret = snprintf(tmp_mnt, 256, "\n%s\t%s",
614
615
616
        entry->mnt_type, entry->mnt_dir);
    if(ret < 256 && strlen(tmp_mnt) <= (*space_left))
    {
617
        strcat(buf, tmp_mnt);
618
619
620
621
622
623
624
625
626
627
        (*space_left) -= strlen(tmp_mnt);
    }

    mnt_data_count++;
    return;
}

/* darshan_get_exe_and_mounts_root()
 *
 * collects command line and list of mounted file systems into a string that
628
 * will be stored with the job-level metadata
629
630
 */
static void darshan_get_exe_and_mounts_root(struct darshan_core_runtime *core,
631
    int argc, char **argv)
632
633
634
635
{
    FILE* tab;
    struct mntent *entry;
    char* exclude;
636
637
638
639
    char* truncate_string = "<TRUNCATED>";
    int truncate_offset;
    int space_left = DARSHAN_EXE_LEN;
    int i;
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
    int tmp_index = 0;
    int skip = 0;

    /* skip these fs types */
    static char* fs_exclusions[] = {
        "tmpfs",
        "proc",
        "sysfs",
        "devpts",
        "binfmt_misc",
        "fusectl",
        "debugfs",
        "securityfs",
        "nfsd",
        "none",
        "rpc_pipefs",
        "hugetlbfs",
        "cgroup",
        NULL
    };

661
662
663
    /* record exe and arguments */
    for(i=0; i<argc; i++)
    {
664
665
        strncat(core->log_exemnt_p, argv[i], space_left);
        space_left = DARSHAN_EXE_LEN-strlen(core->log_exemnt_p);
666
667
        if(i < (argc-1))
        {
668
669
            strncat(core->log_exemnt_p, " ", space_left);
            space_left = DARSHAN_EXE_LEN-strlen(core->log_exemnt_p);
670
671
672
673
674
675
676
677
        }
    }

    /* if we don't see any arguments, then use glibc symbol to get
     * program name at least (this happens in fortran)
     */
    if(argc == 0)
    {
678
679
680
681
        strncat(core->log_exemnt_p, __progname_full, space_left);
        space_left = DARSHAN_EXE_LEN-strlen(core->log_exemnt_p);
        strncat(core->log_exemnt_p, " <unknown args>", space_left);
        space_left = DARSHAN_EXE_LEN-strlen(core->log_exemnt_p);
682
683
684
685
686
687
    }

    if(space_left == 0)
    {
        /* we ran out of room; mark that string was truncated */
        truncate_offset = DARSHAN_EXE_LEN - strlen(truncate_string);
688
        sprintf(&(core->log_exemnt_p[truncate_offset]), "%s",
689
690
691
            truncate_string);
    }

692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
    /* we make two passes through mounted file systems; in the first pass we
     * grab any non-nfs mount points, then on the second pass we grab nfs
     * mount points
     */

    tab = setmntent("/etc/mtab", "r");
    if(!tab)
        return;
    /* loop through list of mounted file systems */
    while(mnt_data_count<DARSHAN_MAX_MNTS && (entry = getmntent(tab)) != NULL)
    {
        /* filter out excluded fs types */
        tmp_index = 0;
        skip = 0;
        while((exclude = fs_exclusions[tmp_index]))
        {
            if(!(strcmp(exclude, entry->mnt_type)))
            {
                skip =1;
                break;
            }
            tmp_index++;
        }

        if(skip || (strcmp(entry->mnt_type, "nfs") == 0))
            continue;

719
        add_entry(core->log_exemnt_p, &space_left, entry);
720
721
722
723
724
725
726
727
728
729
730
731
    }
    endmntent(tab);

    tab = setmntent("/etc/mtab", "r");
    if(!tab)
        return;
    /* loop through list of mounted file systems */
    while(mnt_data_count<DARSHAN_MAX_MNTS && (entry = getmntent(tab)) != NULL)
    {
        if(strcmp(entry->mnt_type, "nfs") != 0)
            continue;

732
        add_entry(core->log_exemnt_p, &space_left, entry);
733
734
735
736
737
738
739
740
741
742
743
744
745
746
    }
    endmntent(tab);

    /* Sort mount points in order of longest path to shortest path.  This is
     * necessary so that if we try to match file paths to mount points later
     * we don't match on "/" every time.
     */
    qsort(mnt_data_array, mnt_data_count, sizeof(mnt_data_array[0]), mnt_data_cmp);
    return;
}

/* darshan_get_exe_and_mounts()
 *
 * collects command line and list of mounted file systems into a string that
747
 * will be stored with the job-level metadata
748
 */
749
750
static void darshan_get_exe_and_mounts(struct darshan_core_runtime *core,
    int argc, char **argv)
751
752
753
{
    if(my_rank == 0)
    {
754
        darshan_get_exe_and_mounts_root(core, argc, argv);
755
756
757
758
759
760
761
762
763
    }

    /* broadcast mount count to all nodes */
    DARSHAN_MPI_CALL(PMPI_Bcast)(&mnt_data_count, 1, MPI_INT, 0,
        MPI_COMM_WORLD);
    /* broadcast mount data to all nodes */
    DARSHAN_MPI_CALL(PMPI_Bcast)(mnt_data_array,
        mnt_data_count*sizeof(mnt_data_array[0]), MPI_BYTE, 0, MPI_COMM_WORLD);

764
    return;
765
766
}

767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
static void darshan_block_size_from_path(const char *path, int *block_size)
{
    int i;
    *block_size = -1;

    for(i=0; i<mnt_data_count; i++)
    {
        if(!(strncmp(mnt_data_array[i].path, path, strlen(mnt_data_array[i].path))))
        {
            *block_size = mnt_data_array[i].block_size;
            return;
        }
    }

    return;
}

784
static void darshan_get_shared_records(struct darshan_core_runtime *core,
785
    darshan_record_id **shared_recs, int *shared_rec_cnt)
786
{
787
788
    int i, j;
    int tmp_cnt = core->rec_count;
789
    struct darshan_core_record_ref *tmp, *ref;
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
    darshan_record_id *id_array;
    uint64_t *mod_flags;
    uint64_t *global_mod_flags;

    /* broadcast root's number of records to all other processes */
    DARSHAN_MPI_CALL(PMPI_Bcast)(&tmp_cnt, 1, MPI_INT, 0, MPI_COMM_WORLD);

    /* use root record count to allocate data structures */
    id_array = malloc(tmp_cnt * sizeof(darshan_record_id));
    mod_flags = malloc(tmp_cnt * sizeof(uint64_t));
    global_mod_flags = malloc(tmp_cnt * sizeof(uint64_t));
    *shared_recs = malloc(tmp_cnt * sizeof(darshan_record_id));
    assert(id_array && mod_flags && global_mod_flags && *shared_recs);

    memset(mod_flags, 0, tmp_cnt * sizeof(uint64_t));
    memset(global_mod_flags, 0, tmp_cnt * sizeof(uint64_t));
    memset(*shared_recs, 0, tmp_cnt * sizeof(darshan_record_id));
807
808
809
810

    /* first, determine list of records root process has opened */
    if(my_rank == 0)
    {
811
        i = 0;
812
        HASH_ITER(hlink, core->rec_hash, ref, tmp)
813
        {
814
            id_array[i++] = ref->rec.id;           
815
816
817
818
        }
    }

    /* broadcast root's list of records to all other processes */
819
    DARSHAN_MPI_CALL(PMPI_Bcast)(id_array, (tmp_cnt * sizeof(darshan_record_id)),
820
821
822
        MPI_BYTE, 0, MPI_COMM_WORLD);

    /* everyone looks to see if they opened the same records as root */
823
    for(i=0; i<tmp_cnt; i++)
824
    {
825
826
        HASH_FIND(hlink, core->rec_hash, &id_array[i], sizeof(darshan_record_id), ref);
        if(ref)
827
        {
828
829
            /* we opened that record too, save the mod_flags */
            mod_flags[i] = ref->mod_flags;
830
831
832
        }
    }

833
834
835
    /* now allreduce so everyone agrees which files are shared and
     * which modules accessed them collectively
     */
836
837
    DARSHAN_MPI_CALL(PMPI_Allreduce)(mod_flags, global_mod_flags, tmp_cnt,
        MPI_UINT64_T, MPI_BAND, MPI_COMM_WORLD);
838

839
840
    j = 0;
    for(i=0; i<tmp_cnt; i++)
841
    {
842
        if(global_mod_flags[i] != 0)
843
        {
844
            (*shared_recs)[j++] = id_array[i];
845
846
847
848
849
850
851
852

            /* set global_mod_flags so we know which modules collectively
             * accessed this module. we need this info to support shared
             * file reductions
             */
            HASH_FIND(hlink, core->rec_hash, &id_array[i], sizeof(darshan_record_id), ref);
            assert(ref);
            ref->global_mod_flags = global_mod_flags[i];
853
854
        }
    }
855
    *shared_rec_cnt = j;
856

857
858
859
    return;
}

860
/* darshan_log_open_all()
 *
 * Collectively creates the darshan log file for writing, applying any MPI
 * file hints configured for the log.
 *
 * logfile_name: path of the log file; it is opened with MPI_MODE_EXCL, so
 *               the open fails if the file already exists
 * log_fh:       out; MPI file handle of the opened log
 *
 * Hints are read from the DARSHAN_LOG_HINTS_OVERRIDE environment variable
 * or, if it is unset, from __DARSHAN_LOG_HINTS, as a semicolon-separated
 * list of key=value pairs.
 *
 * Returns 0 on success, -1 if the file could not be opened.
 */
static int darshan_log_open_all(char *logfile_name, MPI_File *log_fh)
{
    char *hints;
    char *tok_str;
    char *key;
    char *value;
    char *saveptr = NULL;
    int ret;
    MPI_Info info;

    MPI_Info_create(&info);

    /* check environment variable to see if the default MPI file hints have
     * been overridden
     */
    hints = getenv(DARSHAN_LOG_HINTS_OVERRIDE);
    if(!hints)
    {
        hints = __DARSHAN_LOG_HINTS;
    }

    if(hints && strlen(hints) > 0)
    {
        /* tokenizing modifies the string, so work on a private copy */
        tok_str = strdup(hints);
        if(tok_str)
        {
            /* split string on semicolon */
            for(key = strtok_r(tok_str, ";", &saveptr);
                key != NULL;
                key = strtok_r(NULL, ";", &saveptr))
            {
                /* look for = sign splitting key/value pairs */
                value = strchr(key, '=');
                if(!value)
                    continue;

                /* break key and value into separate null terminated strings */
                value[0] = '\0';
                value++;
                if(strlen(key) > 0)
                    MPI_Info_set(info, key, value);
            }
            free(tok_str);
        }
    }

    /* open the darshan log file for writing */
    ret = DARSHAN_MPI_CALL(PMPI_File_open)(MPI_COMM_WORLD, logfile_name,
        MPI_MODE_CREATE | MPI_MODE_WRONLY | MPI_MODE_EXCL, info, log_fh);

    /* the info object is no longer needed whether or not the open worked */
    MPI_Info_free(&info);

    if(ret != MPI_SUCCESS)
        return(-1);

    return(0);
}

921
static int darshan_deflate_buffer(void **pointers, int *lengths, int count,
Shane Snyder's avatar
Shane Snyder committed
922
    char *comp_buf, int *comp_buf_length)
923
924
925
926
927
928
{
    int ret = 0;
    int i;
    int total_target = 0;
    z_stream tmp_stream;

929
930
931
932
933
934
935
936
937
938
939
    /* just return if there is no data */
    for(i = 0; i < count; i++)
    {
        total_target += lengths[i];
    }
    if(total_target)
    {
        total_target = 0;
    }
    else
    {
Shane Snyder's avatar
Shane Snyder committed
940
        *comp_buf_length = 0;
941
942
943
        return(0);
    }

944
945
946
947
948
949
950
    memset(&tmp_stream, 0, sizeof(tmp_stream));
    tmp_stream.zalloc = Z_NULL;
    tmp_stream.zfree = Z_NULL;
    tmp_stream.opaque = Z_NULL;

    /* initialize the zlib compression parameters */
    /* TODO: check these parameters? */
Shane Snyder's avatar
Shane Snyder committed
951
952
953
//    ret = deflateInit2(&tmp_stream, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
//        15 + 16, 8, Z_DEFAULT_STRATEGY);
    ret = deflateInit(&tmp_stream, Z_DEFAULT_COMPRESSION);
954
955
956
957
958
    if(ret != Z_OK)
    {
        return(-1);
    }

Shane Snyder's avatar
Shane Snyder committed
959
    tmp_stream.next_out = (unsigned char *)comp_buf;
960
    tmp_stream.avail_out = DARSHAN_COMP_BUF_SIZE;
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002

    /* loop over the input pointers */
    for(i = 0; i < count; i++)
    {
        total_target += lengths[i];
        tmp_stream.next_in = pointers[i];
        tmp_stream.avail_in = lengths[i];
        /* while we have not finished consuming all of the data available to
         * this point in the loop
         */
        while(tmp_stream.total_in < total_target)
        {
            if(tmp_stream.avail_out == 0)
            {
                /* We ran out of buffer space for compression.  In theory,
                 * we could start using some of the file_array buffer space
                 * without having to malloc again.  In practice, this case 
                 * is going to be practically impossible to hit.
                 */
                deflateEnd(&tmp_stream);
                return(-1);
            }

            /* compress data */
            ret = deflate(&tmp_stream, Z_NO_FLUSH);
            if(ret != Z_OK)
            {
                deflateEnd(&tmp_stream);
                return(-1);
            }
        }
    }

    /* flush compression and end */
    ret = deflate(&tmp_stream, Z_FINISH);
    if(ret != Z_STREAM_END)
    {
        deflateEnd(&tmp_stream);
        return(-1);
    }
    deflateEnd(&tmp_stream);

Shane Snyder's avatar
Shane Snyder committed
1003
    *comp_buf_length = tmp_stream.total_out;
1004
1005
1006
    return(0);
}

1007
1008
1009
/* NOTE: the map written to file may contain duplicate id->name entries if a
 *       record is opened by multiple ranks, but not all ranks
 */
1010
static int darshan_log_write_record_hash(MPI_File log_fh, struct darshan_core_runtime *core,
1011
    uint64_t *inout_off)
1012
{
1013
1014
1015
1016
    int ret;
    struct darshan_core_record_ref *ref, *tmp;
    uint32_t name_len;
    size_t record_sz;
1017
    size_t hash_buf_sz = 0;
1018
1019
    char *hash_buf;
    char *hash_buf_off;
1020

1021
    /* allocate a buffer to store at most 64 bytes for each registered record */
1022
    /* NOTE: this buffer may be reallocated if estimate is too small */
1023
    hash_buf_sz = core->rec_count * 64;
1024
1025
    hash_buf = malloc(hash_buf_sz);
    if(!hash_buf)
1026
    {
Shane Snyder's avatar
Shane Snyder committed
1027
        return(-1);
1028
1029
    }

1030
    /* serialize the record hash into a buffer for writing */
1031
    hash_buf_off = hash_buf;
1032
    HASH_ITER(hlink, core->rec_hash, ref, tmp)
1033
    {
1034
1035
1036
1037
        /* to avoid duplicate records, only rank 0 will write shared records */
        if(my_rank > 0 && ref->global_mod_flags)
            continue;

1038
        name_len = strlen(ref->rec.name);
1039
        record_sz = sizeof(darshan_record_id) + sizeof(uint32_t) + name_len;
1040
        /* make sure there is room in the buffer for this record */
1041
        if((hash_buf_off + record_sz) > (hash_buf + hash_buf_sz))
1042
        {
1043
            char *tmp_buf;
1044
1045
            size_t old_buf_sz;

1046
            /* if no room, reallocate the hash buffer at twice the current size */
1047
1048
1049
            old_buf_sz = hash_buf_off - hash_buf;
            hash_buf_sz *= 2;
            tmp_buf = malloc(hash_buf_sz);
1050
1051
            if(!tmp_buf)
            {
1052
                free(hash_buf);
Shane Snyder's avatar
Shane Snyder committed
1053
                return(-1);
1054
1055
            }

1056
1057
1058
1059
            memcpy(tmp_buf, hash_buf, old_buf_sz);
            free(hash_buf);
            hash_buf = tmp_buf;
            hash_buf_off = hash_buf + old_buf_sz;
1060
1061
        }

1062
1063
        /* now serialize the record into the hash buffer.
         * NOTE: darshan record hash serialization method: 
1064
1065
         *          ... darshan_record_id | (uint32_t) path_len | path ...
         */
1066
1067
1068
1069
1070
1071
        *((darshan_record_id *)hash_buf_off) = ref->rec.id;
        hash_buf_off += sizeof(darshan_record_id);
        *((uint32_t *)hash_buf_off) = name_len;
        hash_buf_off += sizeof(uint32_t);
        memcpy(hash_buf_off, ref->rec.name, name_len);
        hash_buf_off += name_len;
1072
    }
Shane Snyder's avatar
Shane Snyder committed
1073
    hash_buf_sz = hash_buf_off - hash_buf;
1074

1075
    /* collectively write out the record hash to the darshan log */
Shane Snyder's avatar
Shane Snyder committed
1076
    ret = darshan_log_append_all(log_fh, core, hash_buf, hash_buf_sz, inout_off);
1077

1078
    free(hash_buf);
1079

1080
    return(ret);
Shane Snyder's avatar
Shane Snyder committed
1081
1082
}

1083
1084
/* NOTE: inout_off contains the starting offset of this append at the beginning
 *       of the call, and contains the ending offset at the end of the call.
Shane Snyder's avatar
Shane Snyder committed
1085
 *       This variable is only valid on the root rank (rank 0).
Shane Snyder's avatar
Shane Snyder committed
1086
 */
1087
static int darshan_log_append_all(MPI_File log_fh, struct darshan_core_runtime *core,
Shane Snyder's avatar
Shane Snyder committed
1088
    void *buf, int count, uint64_t *inout_off)
Shane Snyder's avatar
Shane Snyder committed
1089
1090
1091
{
    MPI_Offset send_off, my_off;
    MPI_Status status;
1092
    int comp_buf_sz = 0;
Shane Snyder's avatar
Shane Snyder committed
1093
1094
    int ret;

1095
    /* compress the input buffer */
Shane Snyder's avatar
Shane Snyder committed
1096
    ret = darshan_deflate_buffer((void **)&buf, &count, 1,
1097
1098
1099
1100
        core->comp_buf, &comp_buf_sz);
    if(ret < 0)
        comp_buf_sz = 0;

1101
    /* figure out where everyone is writing using scan */
1102
    send_off = comp_buf_sz;
Shane Snyder's avatar
Shane Snyder committed
1103
    if(my_rank == 0)
1104
    {
1105
        send_off += *inout_off; /* rank 0 knows the beginning offset */
1106
    }
Shane Snyder's avatar
Shane Snyder committed
1107
1108
1109
1110

    DARSHAN_MPI_CALL(PMPI_Scan)(&send_off, &my_off, 1, MPI_OFFSET,
        MPI_SUM, MPI_COMM_WORLD);
    /* scan in inclusive; subtract local size back out */
1111
    my_off -= comp_buf_sz;
Shane Snyder's avatar
Shane Snyder committed
1112

1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
    if(ret == 0)
    {
        /* no compression errors, proceed with the collective write */
        ret = DARSHAN_MPI_CALL(PMPI_File_write_at_all)(log_fh, my_off,
            core->comp_buf, comp_buf_sz, MPI_BYTE, &status);
    }
    else
    {
        /* error during compression. preserve and return error to caller,
         * but participate in collective write to avoid deadlock.
         */
        (void)DARSHAN_MPI_CALL(PMPI_File_write_at_all)(log_fh, my_off,
            core->comp_buf, comp_buf_sz, MPI_BYTE, &status);
    }
Shane Snyder's avatar
Shane Snyder committed
1127

1128
1129
    if(nprocs > 1)
    {
Shane Snyder's avatar
Shane Snyder committed
1130
        /* send the ending offset from rank (n-1) to rank 0 */
1131
1132
        if(my_rank == (nprocs-1))
        {
1133
            my_off += comp_buf_sz;
1134
1135
1136
            DARSHAN_MPI_CALL(PMPI_Send)(&my_off, 1, MPI_OFFSET, 0, 0,
                MPI_COMM_WORLD);
        }
Shane Snyder's avatar
Shane Snyder committed
1137
        if(my_rank == 0)
1138
1139
1140
1141
        {
            DARSHAN_MPI_CALL(PMPI_Recv)(&my_off, 1, MPI_OFFSET, (nprocs-1), 0,
                MPI_COMM_WORLD, &status);

1142
            *inout_off = my_off;
1143
1144
1145
1146
        }
    }
    else
    {
1147
        *inout_off = my_off + comp_buf_sz;
1148
    }
Shane Snyder's avatar
Shane Snyder committed
1149

1150
1151
    if(ret != 0)
        return(-1);
Shane Snyder's avatar
Shane Snyder committed
1152
    return(0);
1153
1154
}

Shane Snyder's avatar
Shane Snyder committed
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
/* Tear down the darshan core runtime structure at shutdown.
 *
 * Frees every record reference in core->rec_hash together with its name
 * string, then each populated slot of core->mod_array, and finally the
 * core structure itself.  'core' must not be used after this call.
 */
static void darshan_core_cleanup(struct darshan_core_runtime* core)
{
    struct darshan_core_record_ref *entry, *next_entry;
    int mod_idx;

    /* unlink each record from the hash before releasing its memory */
    HASH_ITER(hlink, core->rec_hash, entry, next_entry)
    {
        HASH_DELETE(hlink, core->rec_hash, entry);
        free(entry->rec.name);
        free(entry);
    }

    /* free(NULL) is a no-op, so unused module slots need no special case */
    for(mod_idx = 0; mod_idx < DARSHAN_MAX_MODS; mod_idx++)
    {
        free(core->mod_array[mod_idx]);
        core->mod_array[mod_idx] = NULL;
    }

    free(core);
}

1182
/* ********************************************************* */
1183
1184

void darshan_core_register_module(
1185
    darshan_module_id mod_id,