darshan-core.c 54.9 KB
Newer Older
1
/*
 * Copyright (C) 2015 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
 */

7
#define _XOPEN_SOURCE 500
8
#define _GNU_SOURCE
9

10
11
12
13
14
15
#include "darshan-runtime-config.h"

#include <stdio.h>
#ifdef HAVE_MNTENT_H
#include <mntent.h>
#endif
16
17
18
19
20
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include <pthread.h>
21
22
#include <dirent.h>
#include <sys/ioctl.h>
23
24
25
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/vfs.h>
26
#include <zlib.h>
27
#include <mpi.h>
28
#include <assert.h>
29

30
#include "uthash.h"
Shane Snyder's avatar
Shane Snyder committed
31
#include "darshan.h"
32
#include "darshan-core.h"
Shane Snyder's avatar
Shane Snyder committed
33
#include "darshan-dynamic.h"
34

35
36
37
/* XXX stick this into autoconf .h */
#include <lustre/lustreapi.h>

38
extern char* __progname;
39
extern char* __progname_full;
40

41
/* internal variable delcarations */
42
static struct darshan_core_runtime *darshan_core = NULL;
43
static pthread_mutex_t darshan_core_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
44
static int my_rank = -1;
45
static int nprocs = -1;
46
static int darshan_mem_alignment = 1;
47

48
49
50
51
52
53
54
55
56
57
58
59
/* paths prefixed with the following directories are not traced by darshan;
 * the list is terminated by a NULL entry
 */
char* darshan_path_exclusions[] = {
    "/etc/",
    "/dev/",
    "/usr/",
    "/bin/",
    "/boot/",
    "/lib/",
    "/opt/",
    "/sbin/",
    "/sys/",
    "/proc/",
    "/var/",
    NULL
};

64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#if defined(DARSHAN_BGQ)
extern void bgq_runtime_initialize();
#endif

/* Table of module init routines that darshan itself invokes at startup
 * time (as opposed to modules that initialize themselves).  Entries are
 * compiled in conditionally and the table always ends with a NULL
 * sentinel, so it can be walked without knowing its length.
 */
void (*mod_static_init_fns[])(void) =
{
#if defined(DARSHAN_BGQ)
    bgq_runtime_initialize,
#endif
    NULL
};

Shane Snyder's avatar
Shane Snyder committed
79
80
81
/* acquire / release the darshan-core mutex; each macro evaluates to the
 * return value of the underlying pthread call
 */
#define DARSHAN_CORE_LOCK()   (pthread_mutex_lock(&darshan_core_mutex))
#define DARSHAN_CORE_UNLOCK() (pthread_mutex_unlock(&darshan_core_mutex))

82
83
84
85
86
87
88
89
/* FS mount information */
#define DARSHAN_MAX_MNTS 64
#define DARSHAN_MAX_MNT_PATH 256
#define DARSHAN_MAX_MNT_TYPE 32
struct mnt_data
{
    char path[DARSHAN_MAX_MNT_PATH];
    char type[DARSHAN_MAX_MNT_TYPE];
90
    struct darshan_fs_info fs_info;
91
92
93
94
};
static struct mnt_data mnt_data_array[DARSHAN_MAX_MNTS];
static int mnt_data_count = 0;

95
96
97
98
/* prototypes for internal helper functions */
static void darshan_get_logfile_name(
    char* logfile_name, int jobid, struct tm* start_tm);
static void darshan_log_record_hints_and_ver(
99
100
101
102
103
104
    struct darshan_core_runtime* core);
static void darshan_get_exe_and_mounts_root(
    struct darshan_core_runtime *core, char* trailing_data,
    int space_left);
static char* darshan_get_exe_and_mounts(
    struct darshan_core_runtime *core);
105
106
static void darshan_fs_info_from_path(
    const char *path, struct darshan_fs_info *fs_info);
107
static void darshan_get_shared_records(
108
109
    struct darshan_core_runtime *core, darshan_record_id **shared_recs,
    int *shared_rec_cnt);
110
static int darshan_log_open_all(
111
    char *logfile_name, MPI_File *log_fh);
112
static int darshan_deflate_buffer(
Shane Snyder's avatar
Shane Snyder committed
113
114
    void **pointers, int *lengths, int count, char *comp_buf,
    int *comp_buf_length);
115
static int darshan_log_write_record_hash(
116
    MPI_File log_fh, struct darshan_core_runtime *core,
117
118
119
    uint64_t *inout_off);
static int darshan_log_append_all(
    MPI_File log_fh, struct darshan_core_runtime *core, void *buf,
Shane Snyder's avatar
Shane Snyder committed
120
    int count, uint64_t *inout_off);
Shane Snyder's avatar
Shane Snyder committed
121
122
static void darshan_core_cleanup(
    struct darshan_core_runtime* core);
123

124
125
/* *********************************** */

Shane Snyder's avatar
Shane Snyder committed
126
void darshan_core_initialize(int argc, char **argv)
127
{
128
    struct darshan_core_runtime *init_core = NULL;
129
130
131
    int i;
    int internal_timing_flag = 0;
    double init_start, init_time, init_max;
132
    char *envstr;
133
134
135
    char* truncate_string = "<TRUNCATED>";
    int truncate_offset;
    int chars_left = 0;
136
137
    int ret;
    int tmpval;
138
139

    DARSHAN_MPI_CALL(PMPI_Comm_size)(MPI_COMM_WORLD, &nprocs);
140
    DARSHAN_MPI_CALL(PMPI_Comm_rank)(MPI_COMM_WORLD, &my_rank);
141
142
143
144

    if(getenv("DARSHAN_INTERNAL_TIMING"))
        internal_timing_flag = 1;

145
    if(internal_timing_flag)
146
147
148
        init_start = DARSHAN_MPI_CALL(PMPI_Wtime)();

    /* setup darshan runtime if darshan is enabled and hasn't been initialized already */
149
    if(!getenv("DARSHAN_DISABLE") && !darshan_core)
150
    {
151
        #if (__DARSHAN_MEM_ALIGNMENT < 1)
152
153
            #error Darshan must be configured with a positive value for --with-mem-align
        #endif
154
        envstr = getenv(DARSHAN_MEM_ALIGNMENT_OVERRIDE);
155
156
157
158
159
160
161
162
163
164
165
        if(envstr)
        {
            ret = sscanf(envstr, "%d", &tmpval);
            /* silently ignore if the env variable is set poorly */
            if(ret == 1 && tmpval > 0)
            {
                darshan_mem_alignment = tmpval;
            }
        }
        else
        {
166
            darshan_mem_alignment = __DARSHAN_MEM_ALIGNMENT;
167
168
169
170
171
172
173
        }

        /* avoid floating point errors on faulty input */
        if (darshan_mem_alignment < 1)
        {
            darshan_mem_alignment = 1;
        }
174
175

        /* allocate structure to track darshan_core_runtime information */
176
177
        init_core = malloc(sizeof(*init_core));
        if(init_core)
178
        {
179
            memset(init_core, 0, sizeof(*init_core));
180

181
182
183
184
            init_core->log_job.uid = getuid();
            init_core->log_job.start_time = time(NULL);
            init_core->log_job.nprocs = nprocs;
            init_core->wtime_offset = DARSHAN_MPI_CALL(PMPI_Wtime)();
185
186

            /* record exe and arguments */
187
            for(i=0; i<argc; i++)
188
            {
189
190
                chars_left = DARSHAN_EXE_LEN-strlen(init_core->exe);
                strncat(init_core->exe, argv[i], chars_left);
191
                if(i < (argc-1))
192
                {
193
194
                    chars_left = DARSHAN_EXE_LEN-strlen(init_core->exe);
                    strncat(init_core->exe, " ", chars_left);
195
196
197
198
199
200
201
202
                }
            }

            /* if we don't see any arguments, then use glibc symbol to get
             * program name at least (this happens in fortran)
             */
            if(argc == 0)
            {
203
204
205
206
                chars_left = DARSHAN_EXE_LEN-strlen(init_core->exe);
                strncat(init_core->exe, __progname_full, chars_left);
                chars_left = DARSHAN_EXE_LEN-strlen(init_core->exe);
                strncat(init_core->exe, " <unknown args>", chars_left);
207
208
209
210
211
            }

            if(chars_left == 0)
            {
                /* we ran out of room; mark that string was truncated */
212
                truncate_offset = DARSHAN_EXE_LEN - strlen(truncate_string);
213
                sprintf(&init_core->exe[truncate_offset], "%s",
214
215
                    truncate_string);
            }
216
217

            /* collect information about command line and mounted file systems */
218
            init_core->trailing_data = darshan_get_exe_and_mounts(init_core);
219

Shane Snyder's avatar
Shane Snyder committed
220
221
222
223
224
225
226
            /* bootstrap any modules with static initialization routines */
            i = 0;
            while(mod_static_init_fns[i])
            {
                (*mod_static_init_fns[i])();
                i++;
            }
227
        }
228
229
    }

230
231
232
233
234
    if(internal_timing_flag)
    {
        init_time = DARSHAN_MPI_CALL(PMPI_Wtime)() - init_start;
        DARSHAN_MPI_CALL(PMPI_Reduce)(&init_time, &init_max, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
235
        if(my_rank == 0)
236
        {
237
238
            fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
            fprintf(stderr, "darshan:init\t%d\t%f\n", nprocs, init_max);
239
240
241
        }
    }

242
243
244
245
    /* if darshan was successfully initialized, set the global pointer */
    if(init_core)
        darshan_core = init_core;

246
247
248
    return;
}

Shane Snyder's avatar
Shane Snyder committed
249
void darshan_core_shutdown()
250
{
251
    int i;
252
    char *logfile_name;
253
    struct darshan_core_runtime *final_core;
254
    int internal_timing_flag = 0;
255
256
    char *envjobid;
    char *jobid_str;
257
    int jobid;
258
    struct tm *start_tm;
259
    time_t start_time_tmp;
260
261
    int ret = 0;
    int all_ret = 0;
262
263
    int64_t first_start_time;
    int64_t last_end_time;
264
265
    int local_mod_use[DARSHAN_MAX_MODS] = {0};
    int global_mod_use_count[DARSHAN_MAX_MODS] = {0};
266
267
268
    darshan_record_id *shared_recs;
    darshan_record_id *mod_shared_recs;
    int shared_rec_cnt = 0;
269
    double start_log_time;
270
271
272
    double open1 = 0, open2 = 0;
    double job1 = 0, job2 = 0;
    double rec1 = 0, rec2 = 0;
273
274
    double mod1[DARSHAN_MAX_MODS] = {0};
    double mod2[DARSHAN_MAX_MODS] = {0};
275
    double header1 = 0, header2 = 0;
276
    double tm_end;
277
    uint64_t gz_fp = 0;
278
279
    MPI_File log_fh;
    MPI_Status status;
280
281
282
283

    if(getenv("DARSHAN_INTERNAL_TIMING"))
        internal_timing_flag = 1;

284
285
    start_log_time = DARSHAN_MPI_CALL(PMPI_Wtime)();

Shane Snyder's avatar
Shane Snyder committed
286
    /* disable darhan-core while we shutdown */
287
    DARSHAN_CORE_LOCK();
288
    if(!darshan_core)
289
    {
290
        DARSHAN_CORE_UNLOCK();
291
292
        return;
    }
293
294
    final_core = darshan_core;
    darshan_core = NULL;
Shane Snyder's avatar
Shane Snyder committed
295

296
297
298
299
300
301
302
    final_core->comp_buf = malloc(DARSHAN_COMP_BUF_SIZE);
    if(!(final_core->comp_buf))
    {
        darshan_core_cleanup(final_core);
        return;
    }

303
    /* we also need to set which modules were registered on this process and
304
305
     * call into those modules and give them a chance to perform any necessary
     * pre-shutdown steps.
Shane Snyder's avatar
Shane Snyder committed
306
307
308
309
310
311
     */
    for(i = 0; i < DARSHAN_MAX_MODS; i++)
    {
        if(final_core->mod_array[i])
        {
            local_mod_use[i] = 1;
312
            final_core->mod_array[i]->mod_funcs.begin_shutdown();
Shane Snyder's avatar
Shane Snyder committed
313
314
        }
    }
315
    DARSHAN_CORE_UNLOCK();
316
317
318
319

    logfile_name = malloc(PATH_MAX);
    if(!logfile_name)
    {
320
        darshan_core_cleanup(final_core);
321
322
323
        return;
    }

324
    /* set darshan job id/metadata and constuct log file name on rank 0 */
325
    if(my_rank == 0)
326
    {
327
        /* Use DARSHAN_JOBID_OVERRIDE for the env var for __DARSHAN_JOBID */
328
        envjobid = getenv(DARSHAN_JOBID_OVERRIDE);
329
        if(!envjobid)
330
        {
331
            envjobid = __DARSHAN_JOBID;
332
333
        }

334
        /* find a job id */
335
336
337
338
339
340
341
342
343
344
345
346
        jobid_str = getenv(envjobid);
        if(jobid_str)
        {
            /* in cobalt we can find it in env var */
            ret = sscanf(jobid_str, "%d", &jobid);
        }
        if(!jobid_str || ret != 1)
        {
            /* use pid as fall back */
            jobid = getpid();
        }

347
        final_core->log_job.jobid = (int64_t)jobid;
348

349
        /* if we are using any hints to write the log file, then record those
350
         * hints with the darshan job information
351
         */
352
        darshan_log_record_hints_and_ver(final_core);
353

354
        /* use human readable start time format in log filename */
355
        start_time_tmp = final_core->log_job.start_time;
356
        start_tm = localtime(&start_time_tmp);
357

358
359
        /* construct log file name */
        darshan_get_logfile_name(logfile_name, jobid, start_tm);
360
361
362
363
364
365
366
367
368
    }

    /* broadcast log file name */
    DARSHAN_MPI_CALL(PMPI_Bcast)(logfile_name, PATH_MAX, MPI_CHAR, 0,
        MPI_COMM_WORLD);

    if(strlen(logfile_name) == 0)
    {
        /* failed to generate log file name */
369
        free(logfile_name);
370
        darshan_core_cleanup(final_core);
371
372
373
        return;
    }

374
    final_core->log_job.end_time = time(NULL);
375

376
377
378
    /* reduce to report first start time and last end time across all ranks
     * at rank 0
     */
379
380
    DARSHAN_MPI_CALL(PMPI_Reduce)(&final_core->log_job.start_time, &first_start_time, 1, MPI_LONG_LONG, MPI_MIN, 0, MPI_COMM_WORLD);
    DARSHAN_MPI_CALL(PMPI_Reduce)(&final_core->log_job.end_time, &last_end_time, 1, MPI_LONG_LONG, MPI_MAX, 0, MPI_COMM_WORLD);
381
382
    if(my_rank == 0)
    {
383
384
        final_core->log_job.start_time = first_start_time;
        final_core->log_job.end_time = last_end_time;
385
    }
386

387
388
389
    /* reduce the number of times a module was opened globally and bcast to everyone */   
    DARSHAN_MPI_CALL(PMPI_Allreduce)(local_mod_use, global_mod_use_count, DARSHAN_MAX_MODS, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

390
    /* get a list of records which are shared across all processes */
391
    darshan_get_shared_records(final_core, &shared_recs, &shared_rec_cnt);
392

393
394
    if(internal_timing_flag)
        open1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
395
    /* collectively open the darshan log file */
396
    ret = darshan_log_open_all(logfile_name, &log_fh);
397
398
    if(internal_timing_flag)
        open2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
399
400
401
402
403
404
405
406

    /* error out if unable to open log file */
    DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
        MPI_LOR, MPI_COMM_WORLD);
    if(all_ret != 0)
    {
        if(my_rank == 0)
        {
407
408
            fprintf(stderr, "darshan library warning: unable to open log file %s\n",
                logfile_name);
409
410
        }
        free(logfile_name);
411
        darshan_core_cleanup(final_core);
412
413
414
        return;
    }

415
416
    if(internal_timing_flag)
        job1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
417
    /* rank 0 is responsible for writing the compressed darshan job information */
Shane Snyder's avatar
Shane Snyder committed
418
    if(my_rank == 0)
419
    {
420
        void *pointers[2] = {&final_core->log_job, final_core->trailing_data};
421
        int lengths[2] = {sizeof(struct darshan_job), strlen(final_core->trailing_data)};
422
        int comp_buf_sz = 0;
423

424
        /* compress the job info and the trailing mount/exe data */
Shane Snyder's avatar
Shane Snyder committed
425
        all_ret = darshan_deflate_buffer(pointers, lengths, 2,
426
427
            final_core->comp_buf, &comp_buf_sz);
        if(all_ret)
428
        {
429
            fprintf(stderr, "darshan library warning: unable to compress job data\n");
430
            unlink(logfile_name);
431
        }
432
433
434
        else
        {
            /* write the job information, preallocing space for the log header */
Shane Snyder's avatar
Shane Snyder committed
435
            gz_fp += sizeof(struct darshan_header);
436
437
            all_ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, gz_fp,
                final_core->comp_buf, comp_buf_sz, MPI_BYTE, &status);
438
439
440
441
442
            if(all_ret != MPI_SUCCESS)
            {
                fprintf(stderr, "darshan library warning: unable to write job data to log file %s\n",
                        logfile_name);
                unlink(logfile_name);
Shane Snyder's avatar
Shane Snyder committed
443
                
444
            }
445
            gz_fp += comp_buf_sz;
446
        }
447
448
    }

449
450
451
452
453
    /* error out if unable to write job information */
    DARSHAN_MPI_CALL(PMPI_Bcast)(&all_ret, 1, MPI_INT, 0, MPI_COMM_WORLD);
    if(all_ret != 0)
    {
        free(logfile_name);
454
        darshan_core_cleanup(final_core);
455
456
        return;
    }
457
458
    if(internal_timing_flag)
        job2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
459

460
461
    if(internal_timing_flag)
        rec1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
462
    /* write the record name->id hash to the log file */
Shane Snyder's avatar
Shane Snyder committed
463
    final_core->log_header.rec_map.off = gz_fp;
464
    ret = darshan_log_write_record_hash(log_fh, final_core, &gz_fp);
Shane Snyder's avatar
Shane Snyder committed
465
    final_core->log_header.rec_map.len = gz_fp - final_core->log_header.rec_map.off;
466

467
    /* error out if unable to write record hash */
468
469
470
471
472
473
    DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
        MPI_LOR, MPI_COMM_WORLD);
    if(all_ret != 0)
    {
        if(my_rank == 0)
        {
474
            fprintf(stderr, "darshan library warning: unable to write record hash to log file %s\n",
475
                logfile_name);
476
            unlink(logfile_name);
477
478
        }
        free(logfile_name);
479
        darshan_core_cleanup(final_core);
480
481
        return;
    }
Shane Snyder's avatar
Shane Snyder committed
482
483
    if(internal_timing_flag)
        rec2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
484

485
486
487
    mod_shared_recs = malloc(shared_rec_cnt * sizeof(darshan_record_id));
    assert(mod_shared_recs);

488
    /* loop over globally used darshan modules and:
489
     *      - perform shared file reductions, if possible
490
     *      - get final output buffer
491
     *      - compress (zlib) provided output buffer
Shane Snyder's avatar
Shane Snyder committed
492
     *      - append compressed buffer to log file
493
494
     *      - add module index info (file offset/length) to log header
     *      - shutdown the module
495
     */
496
    for(i = 0; i < DARSHAN_MAX_MODS; i++)
497
    {
498
        struct darshan_core_module* this_mod = final_core->mod_array[i];
499
        struct darshan_core_record_ref *ref = NULL;
500
        int mod_shared_rec_cnt = 0;
501
        void* mod_buf = NULL;
502
        int mod_buf_sz = 0;
503
        int j;
504

505
        if(global_mod_use_count[i] == 0)
506
507
        {
            if(my_rank == 0)
508
509
510
511
            {
                final_core->log_header.mod_map[i].off = 0;
                final_core->log_header.mod_map[i].len = 0;
            }
512
            continue;
513
        }
514
515
 
        if(internal_timing_flag)
516
            mod1[i] = DARSHAN_MPI_CALL(PMPI_Wtime)();
517

518
        /* set the shared file list for this module */
519
520
        memset(mod_shared_recs, 0, shared_rec_cnt * sizeof(darshan_record_id));
        for(j = 0; j < shared_rec_cnt; j++)
521
522
523
524
        {
            HASH_FIND(hlink, final_core->rec_hash, &shared_recs[j],
                sizeof(darshan_record_id), ref);
            assert(ref);
525
            if(DARSHAN_MOD_FLAG_ISSET(ref->global_mod_flags, i))
526
            {
527
                mod_shared_recs[mod_shared_rec_cnt++] = shared_recs[j];
528
            }
529
        }
530

531
532
533
534
535
        /* if module is registered locally, get the corresponding output buffer
         * 
         * NOTE: this function can be used to run collective operations across
         * modules, if there are file records shared globally.
         */
536
        if(this_mod)
537
        {
538
539
            this_mod->mod_funcs.get_output_data(MPI_COMM_WORLD, mod_shared_recs,
                mod_shared_rec_cnt, &mod_buf, &mod_buf_sz);
540
541
        }

542
        /* append this module's data to the darshan log */
Shane Snyder's avatar
Shane Snyder committed
543
544
545
546
        final_core->log_header.mod_map[i].off = gz_fp;
        ret = darshan_log_append_all(log_fh, final_core, mod_buf, mod_buf_sz, &gz_fp);
        final_core->log_header.mod_map[i].len =
            gz_fp - final_core->log_header.mod_map[i].off;
547

548
        /* error out if the log append failed */
549
550
551
        DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
            MPI_LOR, MPI_COMM_WORLD);
        if(all_ret != 0)
552
        {
553
554
555
556
557
558
559
560
            if(my_rank == 0)
            {
                fprintf(stderr,
                    "darshan library warning: unable to write %s module data to log file %s\n",
                    darshan_module_names[i], logfile_name);
                unlink(logfile_name);
            }
            free(logfile_name);
561
            darshan_core_cleanup(final_core);
562
            return;
563
564
565
        }

        /* shutdown module if registered locally */
566
        if(this_mod)
567
568
569
        {
            this_mod->mod_funcs.shutdown();
        }
570
571
        if(internal_timing_flag)
            mod2[i] = DARSHAN_MPI_CALL(PMPI_Wtime)();
572
573
    }

574
575
    if(internal_timing_flag)
        header1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
576
577
578
579
580
581
    /* write out log header, after running 2 reduction on header variables:
     *  1) reduce 'partial_flag' variable to determine which modules ran out
     *     of memory for storing I/O data
     *  2) reduce 'mod_ver' array to determine which log format version each
     *     module used for this output log
     */
582
583
    if(my_rank == 0)
    {
584
585
586
587
588
589
590
591
        DARSHAN_MPI_CALL(PMPI_Reduce)(MPI_IN_PLACE,
            &(final_core->log_header.partial_flag), 1, MPI_UINT32_T,
            MPI_BOR, 0, MPI_COMM_WORLD);
        DARSHAN_MPI_CALL(PMPI_Reduce)(MPI_IN_PLACE,
            final_core->log_header.mod_ver, DARSHAN_MAX_MODS, MPI_UINT32_T,
            MPI_MAX, 0, MPI_COMM_WORLD);

        /* rank 0 is responsible for writing the log header */
592
593
594
        /* initialize the remaining header fields */
        strcpy(final_core->log_header.version_string, DARSHAN_LOG_VERSION);
        final_core->log_header.magic_nr = DARSHAN_MAGIC_NR;
595
        final_core->log_header.comp_type = DARSHAN_ZLIB_COMP;
596

Shane Snyder's avatar
Shane Snyder committed
597
598
599
        all_ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, 0, &(final_core->log_header),
            sizeof(struct darshan_header), MPI_BYTE, &status);
        if(all_ret != MPI_SUCCESS)
600
        {
Shane Snyder's avatar
Shane Snyder committed
601
602
            fprintf(stderr, "darshan library warning: unable to write header to log file %s\n",
                    logfile_name);
603
            unlink(logfile_name);
604
        }
605
    }
606
607
608
609
610
611
612
613
614
    else
    {
        DARSHAN_MPI_CALL(PMPI_Reduce)(&(final_core->log_header.partial_flag),
            &(final_core->log_header.partial_flag), 1, MPI_UINT32_T,
            MPI_BOR, 0, MPI_COMM_WORLD);
        DARSHAN_MPI_CALL(PMPI_Reduce)(final_core->log_header.mod_ver,
            final_core->log_header.mod_ver, DARSHAN_MAX_MODS, MPI_UINT32_T,
            MPI_MAX, 0, MPI_COMM_WORLD);
    }
615

616
617
618
619
620
    /* error out if unable to write log header */
    DARSHAN_MPI_CALL(PMPI_Bcast)(&all_ret, 1, MPI_INT, 0, MPI_COMM_WORLD);
    if(all_ret != 0)
    {
        free(logfile_name);
621
        darshan_core_cleanup(final_core);
622
623
        return;
    }
624
625
    if(internal_timing_flag)
        header2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
626

627
628
629
    DARSHAN_MPI_CALL(PMPI_File_close)(&log_fh);

    /* if we got this far, there are no errors, so rename from *.darshan_partial
630
     * to *-<logwritetime>.darshan, which indicates that this log file is
631
632
     * complete and ready for analysis
     */
633
634
    if(my_rank == 0)
    {
Shane Snyder's avatar
Shane Snyder committed
635
        if(getenv("DARSHAN_LOGFILE"))
636
        {
637
#ifdef __DARSHAN_GROUP_READABLE_LOGS
Shane Snyder's avatar
Shane Snyder committed
638
            chmod(logfile_name, (S_IRUSR|S_IRGRP));
639
#else
Shane Snyder's avatar
Shane Snyder committed
640
            chmod(logfile_name, (S_IRUSR));
641
#endif
Shane Snyder's avatar
Shane Snyder committed
642
643
644
645
646
647
648
649
650
651
652
653
654
655
        }
        else
        {
            char* tmp_index;
            double end_log_time;
            char* new_logfile_name;

            new_logfile_name = malloc(PATH_MAX);
            if(new_logfile_name)
            {
                new_logfile_name[0] = '\0';
                end_log_time = DARSHAN_MPI_CALL(PMPI_Wtime)();
                strcat(new_logfile_name, logfile_name);
                tmp_index = strstr(new_logfile_name, ".darshan_partial");
656
                sprintf(tmp_index, "_%d.darshan", (int)(end_log_time-start_log_time+1));
Shane Snyder's avatar
Shane Snyder committed
657
658
                rename(logfile_name, new_logfile_name);
                /* set permissions on log file */
659
#ifdef __DARSHAN_GROUP_READABLE_LOGS
Shane Snyder's avatar
Shane Snyder committed
660
661
662
663
664
665
                chmod(new_logfile_name, (S_IRUSR|S_IRGRP));
#else
                chmod(new_logfile_name, (S_IRUSR));
#endif
                free(new_logfile_name);
            }
666
        }
667
    }
668

669
    free(logfile_name);
670
    darshan_core_cleanup(final_core);
671

672
    if(internal_timing_flag)
673
    {
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
        double open_tm, open_slowest;
        double header_tm, header_slowest;
        double job_tm, job_slowest;
        double rec_tm, rec_slowest;
        double mod_tm[DARSHAN_MAX_MODS], mod_slowest[DARSHAN_MAX_MODS];
        double all_tm, all_slowest;

        tm_end = DARSHAN_MPI_CALL(PMPI_Wtime)();

        open_tm = open2 - open1;
        header_tm = header2 - header1;
        job_tm = job2 - job1;
        rec_tm = rec2 - rec1;
        all_tm = tm_end - start_log_time;
        for(i = 0;i < DARSHAN_MAX_MODS; i++)
        {
            mod_tm[i] = mod2[i] - mod1[i];
        }

        DARSHAN_MPI_CALL(PMPI_Reduce)(&open_tm, &open_slowest, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        DARSHAN_MPI_CALL(PMPI_Reduce)(&header_tm, &header_slowest, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        DARSHAN_MPI_CALL(PMPI_Reduce)(&job_tm, &job_slowest, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        DARSHAN_MPI_CALL(PMPI_Reduce)(&rec_tm, &rec_slowest, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        DARSHAN_MPI_CALL(PMPI_Reduce)(&all_tm, &all_slowest, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        DARSHAN_MPI_CALL(PMPI_Reduce)(mod_tm, mod_slowest, DARSHAN_MAX_MODS,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

        if(my_rank == 0)
        {
708
709
710
711
712
            fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
            fprintf(stderr, "darshan:log_open\t%d\t%f\n", nprocs, open_slowest);
            fprintf(stderr, "darshan:job_write\t%d\t%f\n", nprocs, job_slowest);
            fprintf(stderr, "darshan:hash_write\t%d\t%f\n", nprocs, rec_slowest);
            fprintf(stderr, "darshan:header_write\t%d\t%f\n", nprocs, header_slowest);
713
714
715
            for(i = 0; i < DARSHAN_MAX_MODS; i++)
            {
                if(global_mod_use_count[i])
716
                    fprintf(stderr, "darshan:%s_shutdown\t%d\t%f\n", darshan_module_names[i],
Shane Snyder's avatar
Shane Snyder committed
717
                        nprocs, mod_slowest[i]);
718
            }
719
            fprintf(stderr, "darshan:core_shutdown\t%d\t%f\n", nprocs, all_slowest);
720
        }
721
722
723
724
    }
    
    return;
}
725

Shane Snyder's avatar
Shane Snyder committed
726
/* *********************************** */
727

728
/* construct the darshan log file name */
729
static void darshan_get_logfile_name(char* logfile_name, int jobid, struct tm* start_tm)
730
{
Shane Snyder's avatar
Shane Snyder committed
731
    char* user_logfile_name;
732
733
734
    char* logpath;
    char* logname_string;
    char* logpath_override = NULL;
735
#ifdef __DARSHAN_LOG_ENV
736
737
738
739
740
741
742
743
744
    char env_check[256];
    char* env_tok;
#endif
    uint64_t hlevel;
    char hname[HOST_NAME_MAX];
    uint64_t logmod;
    char cuser[L_cuserid] = {0};
    int ret;

Shane Snyder's avatar
Shane Snyder committed
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
    /* first, check if user specifies a complete logpath to use */
    user_logfile_name = getenv("DARSHAN_LOGFILE");
    if(user_logfile_name)
    {
        if(strlen(user_logfile_name) >= (PATH_MAX-1))
        {
            fprintf(stderr, "darshan library warning: user log file name too long.\n");
            logfile_name[0] = '\0';
        }
        else
        {
            strcpy(logfile_name, user_logfile_name);
        }
    }
    else
760
    {
Shane Snyder's avatar
Shane Snyder committed
761
762
        /* otherwise, generate the log path automatically */

763
764
        /* Use DARSHAN_LOG_PATH_OVERRIDE for the value or __DARSHAN_LOG_PATH */
        logpath = getenv(DARSHAN_LOG_PATH_OVERRIDE);
Shane Snyder's avatar
Shane Snyder committed
765
766
        if(!logpath)
        {
767
768
#ifdef __DARSHAN_LOG_PATH
            logpath = __DARSHAN_LOG_PATH;
769
#endif
Shane Snyder's avatar
Shane Snyder committed
770
        }
771

Shane Snyder's avatar
Shane Snyder committed
772
773
774
775
776
777
778
779
780
781
        /* get the username for this job.  In order we will try each of the
         * following until one of them succeeds:
         *
         * - cuserid()
         * - getenv("LOGNAME")
         * - snprintf(..., geteuid());
         *
         * Note that we do not use getpwuid() because it generally will not
         * work in statically compiled binaries.
         */
782
783

#ifndef DARSHAN_DISABLE_CUSERID
Shane Snyder's avatar
Shane Snyder committed
784
        cuserid(cuser);
785
786
#endif

Shane Snyder's avatar
Shane Snyder committed
787
788
        /* if cuserid() didn't work, then check the environment */
        if(strcmp(cuser, "") == 0)
789
        {
Shane Snyder's avatar
Shane Snyder committed
790
791
792
793
794
            logname_string = getenv("LOGNAME");
            if(logname_string)
            {
                strncpy(cuser, logname_string, (L_cuserid-1));
            }
795
796
        }

Shane Snyder's avatar
Shane Snyder committed
797
798
799
800
801
802
        /* if cuserid() and environment both fail, then fall back to uid */
        if(strcmp(cuser, "") == 0)
        {
            uid_t uid = geteuid();
            snprintf(cuser, sizeof(cuser), "%u", uid);
        }
803

Shane Snyder's avatar
Shane Snyder committed
804
805
806
807
        /* generate a random number to help differentiate the log */
        hlevel=DARSHAN_MPI_CALL(PMPI_Wtime)() * 1000000;
        (void)gethostname(hname, sizeof(hname));
        logmod = darshan_hash((void*)hname,strlen(hname),hlevel);
808

Shane Snyder's avatar
Shane Snyder committed
809
810
811
812
        /* see if darshan was configured using the --with-logpath-by-env
         * argument, which allows the user to specify an absolute path to
         * place logs via an env variable.
         */
813
#ifdef __DARSHAN_LOG_ENV
Shane Snyder's avatar
Shane Snyder committed
814
        /* just silently skip if the environment variable list is too big */
815
        if(strlen(__DARSHAN_LOG_ENV) < 256)
816
        {
Shane Snyder's avatar
Shane Snyder committed
817
            /* copy env variable list to a temporary buffer */
818
            strcpy(env_check, __DARSHAN_LOG_ENV);
Shane Snyder's avatar
Shane Snyder committed
819
820
821
            /* tokenize the comma-separated list */
            env_tok = strtok(env_check, ",");
            if(env_tok)
822
            {
Shane Snyder's avatar
Shane Snyder committed
823
                do
824
                {
Shane Snyder's avatar
Shane Snyder committed
825
826
827
828
829
830
831
832
833
                    /* check each env variable in order */
                    logpath_override = getenv(env_tok);
                    if(logpath_override)
                    {
                        /* stop as soon as we find a match */
                        break;
                    }
                }while((env_tok = strtok(NULL, ",")));
            }
834
835
836
        }
#endif

Shane Snyder's avatar
Shane Snyder committed
837
        if(logpath_override)
838
        {
Shane Snyder's avatar
Shane Snyder committed
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
            ret = snprintf(logfile_name, PATH_MAX,
                "%s/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
                logpath_override,
                cuser, __progname, jobid,
                (start_tm->tm_mon+1),
                start_tm->tm_mday,
                (start_tm->tm_hour*60*60 + start_tm->tm_min*60 + start_tm->tm_sec),
                logmod);
            if(ret == (PATH_MAX-1))
            {
                /* file name was too big; squish it down */
                snprintf(logfile_name, PATH_MAX,
                    "%s/id%d.darshan_partial",
                    logpath_override, jobid);
            }
854
        }
Shane Snyder's avatar
Shane Snyder committed
855
        else if(logpath)
856
        {
Shane Snyder's avatar
Shane Snyder committed
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
            ret = snprintf(logfile_name, PATH_MAX,
                "%s/%d/%d/%d/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
                logpath, (start_tm->tm_year+1900),
                (start_tm->tm_mon+1), start_tm->tm_mday,
                cuser, __progname, jobid,
                (start_tm->tm_mon+1),
                start_tm->tm_mday,
                (start_tm->tm_hour*60*60 + start_tm->tm_min*60 + start_tm->tm_sec),
                logmod);
            if(ret == (PATH_MAX-1))
            {
                /* file name was too big; squish it down */
                snprintf(logfile_name, PATH_MAX,
                    "%s/id%d.darshan_partial",
                    logpath, jobid);
            }
        }
        else
        {
            logfile_name[0] = '\0';
877
878
879
880
        }
    }

    return;
881
882
}

883
/* record any hints used to write the darshan log in the log header */
884
static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* core)
885
886
887
888
889
890
891
892
893
{
    char* hints;
    char* header_hints;
    int meta_remain = 0;
    char* m;

    /* check environment variable to see if the default MPI file hints have
     * been overridden
     */
894
    hints = getenv(DARSHAN_LOG_HINTS_OVERRIDE);
895
896
    if(!hints)
    {
897
        hints = __DARSHAN_LOG_HINTS;
898
899
900
901
902
903
904
905
906
907
    }

    if(!hints || strlen(hints) < 1)
        return;

    header_hints = strdup(hints);
    if(!header_hints)
        return;

    meta_remain = DARSHAN_JOB_METADATA_LEN -
908
        strlen(core->log_job.metadata) - 1;
909
910
    if(meta_remain >= (strlen(PACKAGE_VERSION) + 9))
    {
911
        sprintf(core->log_job.metadata, "lib_ver=%s\n", PACKAGE_VERSION);
912
913
914
915
        meta_remain -= (strlen(PACKAGE_VERSION) + 9);
    }
    if(meta_remain >= (3 + strlen(header_hints)))
    {
916
        m = core->log_job.metadata + strlen(core->log_job.metadata);
917
918
919
920
921
922
923
924
925
926
927
928
        /* We have room to store the hints in the metadata portion of
         * the job header.  We just prepend an h= to the hints list.  The
         * metadata parser will ignore = characters that appear in the value
         * portion of the metadata key/value pair.
         */
        sprintf(m, "h=%s\n", header_hints);
    }
    free(header_hints);

    return;
}

929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
/* comparison callback (qsort-style signature) for struct mnt_data entries
 *
 * Orders entries by the length of their path, longest first: returns -1
 * when a's path is longer than b's, 1 when it is shorter, and 0 when both
 * paths have the same length.  The path contents are not compared.
 */
static int mnt_data_cmp(const void* a, const void* b)
{
    const struct mnt_data *lhs = a;
    const struct mnt_data *rhs = b;
    size_t lhs_len = strlen(lhs->path);
    size_t rhs_len = strlen(rhs->path);

    if(lhs_len == rhs_len)
        return(0);

    /* longer paths sort ahead of shorter ones */
    return((lhs_len > rhs_len) ? -1 : 1);
}

/* adds an entry to table of mounted file systems
 *
 * Fills in mnt_data_array[mnt_data_count] from the given mount table entry
 * and then increments mnt_data_count.  The caller is responsible for making
 * sure the table still has a free slot.
 *
 * trailing_data: NUL-terminated string to which "\n<type>\t<dir>" for this
 *                mount is appended when it fits
 * space_left:    in/out; number of characters that may still be appended to
 *                trailing_data.  Decremented by the length of the appended
 *                text.
 * entry:         mount table entry describing the file system
 *
 * For each entry the following is recorded:
 *   - mount path and file system type name (truncated if too long)
 *   - file system type id as reported by statfs(), or 0 if statfs() failed
 *   - block size: f_bsize from statfs(), 1 MiB on Lustre, or 4096 if
 *     statfs() failed
 *   - on Lustre, the counts obtained via the LL_IOC_GETOBDCOUNT ioctl;
 *     both stay at -1 if they could not be retrieved
 */
static void add_entry(char* trailing_data, int* space_left, struct mntent *entry)
{
    int ret;
    int is_lustre;
    char tmp_mnt[256];
    struct statfs statfsbuf;

    /* strncpy() does not terminate the destination when the source is too
     * long, so terminate explicitly
     */
    strncpy(mnt_data_array[mnt_data_count].path, entry->mnt_dir,
        DARSHAN_MAX_MNT_PATH-1);
    mnt_data_array[mnt_data_count].path[DARSHAN_MAX_MNT_PATH-1] = '\0';
    strncpy(mnt_data_array[mnt_data_count].type, entry->mnt_type,
        DARSHAN_MAX_MNT_TYPE-1);
    mnt_data_array[mnt_data_count].type[DARSHAN_MAX_MNT_TYPE-1] = '\0';

    /* NOTE: we now try to detect the preferred block size for each file
     * system using statfs().  On Lustre we assume a size of 1 MiB
     * because statfs() reports 4 KiB.
     */
#ifndef LL_SUPER_MAGIC
#define LL_SUPER_MAGIC 0x0BD00BD0
#endif
    ret = statfs(entry->mnt_dir, &statfsbuf);

    /* statfsbuf is only meaningful when statfs() succeeded; never read it
     * otherwise
     */
    is_lustre = (ret == 0 && statfsbuf.f_type == LL_SUPER_MAGIC);

    if(ret == 0)
        mnt_data_array[mnt_data_count].fs_info.fs_type = statfsbuf.f_type;
    else
        mnt_data_array[mnt_data_count].fs_info.fs_type = 0;

    if(ret != 0)
        mnt_data_array[mnt_data_count].fs_info.block_size = 4096;
    else if(is_lustre)
        mnt_data_array[mnt_data_count].fs_info.block_size = 1024*1024;
    else
        mnt_data_array[mnt_data_count].fs_info.block_size = statfsbuf.f_bsize;

    /* attempt to retrieve OST and MDS counts from Lustre */
    mnt_data_array[mnt_data_count].fs_info.ost_count = -1;
    mnt_data_array[mnt_data_count].fs_info.mdt_count = -1;
    if(is_lustre)
    {
        DIR *mount_dir = opendir(entry->mnt_dir);

        if(mount_dir)
        {
            /* n_ost and n_mdt are used for both input and output to ioctl */
            int n_ost = 0;
            int n_mdt = 1;
            int mount_fd = dirfd(mount_dir);
            int ret_ost = ioctl(mount_fd, LL_IOC_GETOBDCOUNT, &n_ost);
            int ret_mdt = ioctl(mount_fd, LL_IOC_GETOBDCOUNT, &n_mdt);

            /* only trust the counts if both queries succeeded */
            if(ret_ost >= 0 && ret_mdt >= 0)
            {
                mnt_data_array[mnt_data_count].fs_info.ost_count = n_ost;
                mnt_data_array[mnt_data_count].fs_info.mdt_count = n_mdt;
            }
            closedir(mount_dir);
        }
    }

    /* store mount information for use in header of darshan log */
    ret = snprintf(tmp_mnt, sizeof(tmp_mnt), "\n%s\t%s",
        entry->mnt_type, entry->mnt_dir);

    /* skip the header text if formatting failed, the text was truncated,
     * or it does not fit in the remaining space; compare as signed ints so
     * that a negative space_left can never pass the check
     */
    if(ret >= 0 && ret < (int)sizeof(tmp_mnt) && ret <= (*space_left))
    {
        strcat(trailing_data, tmp_mnt);
        (*space_left) -= ret;
    }

    mnt_data_count++;
    return;
}

/* darshan_get_exe_and_mounts_root()
 *
 * collects command line and list of mounted file systems into a string that
 * will be stored with the job header
 */
static void darshan_get_exe_and_mounts_root(struct darshan_core_runtime *core,
    char* trailing_data, int space_left)
{
    FILE* tab;
    struct mntent *entry;
    char* exclude;
    int tmp_index = 0;
    int skip = 0;

    /* skip these fs types */
    static char* fs_exclusions[] = {
        "tmpfs",
        "proc",
        "sysfs",
        "devpts",
        "binfmt_misc",
        "fusectl",
        "debugfs",
        "securityfs",
        "nfsd",
        "none",
        "rpc_pipefs",
        "hugetlbfs",
        "cgroup",
        NULL
    };

Shane Snyder's avatar
Shane Snyder committed
1042
    /* length of exe has already been safety checked in darshan initialization */
1043
    strcat(trailing_data, core->exe);
1044
    space_left = DARSHAN_EXE_LEN - strlen(trailing_data);
1045
1046
1047
1048
1049

    /* we make two passes through mounted file systems; in the first pass we
     * grab any non-nfs mount points, then on the second pass we grab nfs
     * mount points
     */
1050
    mnt_data_count = 0;
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108

    tab = setmntent("/etc/mtab", "r");
    if(!tab)
        return;
    /* loop through list of mounted file systems */
    while(mnt_data_count<DARSHAN_MAX_MNTS && (entry = getmntent(tab)) != NULL)
    {
        /* filter out excluded fs types */
        tmp_index = 0;
        skip = 0;
        while((exclude = fs_exclusions[tmp_index]))
        {
            if(!(strcmp(exclude, entry->mnt_type)))
            {
                skip =1;
                break;
            }
            tmp_index++;
        }

        if(skip || (strcmp(entry->mnt_type, "nfs") == 0))
            continue;

        add_entry(trailing_data, &space_left, entry);
    }
    endmntent(tab);

    tab = setmntent("/etc/mtab", "r");
    if(!tab)
        return;
    /* loop through list of mounted file systems */
    while(mnt_data_count<DARSHAN_MAX_MNTS && (entry = getmntent(tab)) != NULL)
    {
        if(strcmp(entry->mnt_type, "nfs") != 0)
            continue;

        add_entry(trailing_data, &space_left, entry);
    }
    endmntent(tab);

    /* Sort mount points in order of longest path to shortest path.  This is
     * necessary so that if we try to match file paths to mount points later
     * we don't match on "/" e