darshan-core.c 67.2 KB
Newer Older
1
/*
Shane Snyder's avatar
Shane Snyder committed
2
3
4
 * Copyright (C) 2015 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
5
6
 */

7
#define _XOPEN_SOURCE 500
8
#define _GNU_SOURCE
9

10
11
12
13
14
15
#include "darshan-runtime-config.h"

#include <stdio.h>
#ifdef HAVE_MNTENT_H
#include <mntent.h>
#endif
16
17
18
19
20
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include <pthread.h>
21
#include <fcntl.h>
Shane Snyder's avatar
Shane Snyder committed
22
#include <stdarg.h>
23
24
#include <dirent.h>
#include <sys/ioctl.h>
25
26
#include <sys/types.h>
#include <sys/stat.h>
27
#include <sys/mman.h>
28
#include <sys/vfs.h>
29
#include <zlib.h>
30
#include <mpi.h>
31
#include <assert.h>
32

33
#include "uthash.h"
Shane Snyder's avatar
Shane Snyder committed
34
#include "darshan.h"
35
#include "darshan-core.h"
Shane Snyder's avatar
Shane Snyder committed
36
#include "darshan-dynamic.h"
37

38
#ifdef DARSHAN_LUSTRE
39
#include <lustre/lustre_user.h>
40
#endif
41

42
extern char* __progname;
43
extern char* __progname_full;
44

45
/* internal variable declarations */
46
static struct darshan_core_runtime *darshan_core = NULL;
47
static pthread_mutex_t darshan_core_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
48
static int my_rank = -1;
49
static int nprocs = -1;
50
static int darshan_mem_alignment = 1;
51
static long darshan_mod_mem_quota = DARSHAN_MOD_MEM_MAX;
52

53
/* paths prefixed with the following directories are not tracked by darshan */
54
char* darshan_path_exclusions[] = {
    /* system configuration, device and pseudo file systems */
    "/etc/",
    "/dev/",
    /* system software and libraries */
    "/usr/",
    "/bin/",
    "/boot/",
    "/lib/",
    "/opt/",
    "/sbin/",
    /* kernel-exported pseudo file systems */
    "/sys/",
    "/proc/",
    "/var/",
    NULL /* the list is NULL-terminated */
};
/* paths prefixed with the following directories are tracked by darshan even if
 * they share a root with a path listed in darshan_path_exclusions
 */
char* darshan_path_inclusions[] = {
    "/var/opt/cray/dws/mounts/",
    NULL /* the list is NULL-terminated */
};

76
77
78
/* allow users to override the path exclusions */
char** user_darshan_path_exclusions = NULL;

79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#ifdef DARSHAN_BGQ
extern void bgq_runtime_initialize();
#endif

/* array of init functions for modules which need to be statically
 * initialized by darshan at startup time
 *
 * darshan_core_initialize() walks this array from index 0 and calls each
 * entry until it reaches the terminating NULL, so NULL must stay last.
 * The BG/Q entry is only compiled in when DARSHAN_BGQ is defined.
 */
void (*mod_static_init_fns[])(void) =
{
#ifdef DARSHAN_BGQ
    &bgq_runtime_initialize,
#endif
    NULL
};

94
95
96
97
98
99
100
#ifdef DARSHAN_LUSTRE
/* XXX need to use extern to get Lustre module's instrumentation function
 * since modules have no way of providing this to darshan-core
 */
extern void darshan_instrument_lustre_file(const char *filepath, int fd);
#endif

Shane Snyder's avatar
Shane Snyder committed
101
102
103
/* acquire/release darshan_core_mutex (a recursive pthread mutex); in this
 * file the pair brackets reads and writes of the global darshan_core pointer
 */
#define DARSHAN_CORE_LOCK() pthread_mutex_lock(&darshan_core_mutex)
#define DARSHAN_CORE_UNLOCK() pthread_mutex_unlock(&darshan_core_mutex)

104
105
106
107
108
109
110
111
/* FS mount information */
#define DARSHAN_MAX_MNTS 64
#define DARSHAN_MAX_MNT_PATH 256
#define DARSHAN_MAX_MNT_TYPE 32
struct mnt_data
{
    char path[DARSHAN_MAX_MNT_PATH];
    char type[DARSHAN_MAX_MNT_TYPE];
112
    struct darshan_fs_info fs_info;
113
114
115
116
};
static struct mnt_data mnt_data_array[DARSHAN_MAX_MNTS];
static int mnt_data_count = 0;

117
/* prototypes for internal helper functions */
118
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
119
120
static void *darshan_init_mmap_log(
    struct darshan_core_runtime* core, int jobid);
121
#endif
122
static void darshan_log_record_hints_and_ver(
123
    struct darshan_core_runtime* core);
124
125
static void darshan_get_exe_and_mounts(
    struct darshan_core_runtime *core, int argc, char **argv);
126
127
static void darshan_fs_info_from_path(
    const char *path, struct darshan_fs_info *fs_info);
128
static int darshan_add_name_record_ref(
129
    struct darshan_core_runtime *core, darshan_record_id rec_id,
130
    const char *name, darshan_module_id mod_id);
131
132
static void darshan_get_user_name(
    char *user);
133
134
static void darshan_get_logfile_name(
    char* logfile_name, int jobid, struct tm* start_tm);
135
static void darshan_get_shared_records(
136
137
    struct darshan_core_runtime *core, darshan_record_id **shared_recs,
    int *shared_rec_cnt);
138
static int darshan_log_open_all(
139
    char *logfile_name, MPI_File *log_fh);
140
static int darshan_deflate_buffer(
Shane Snyder's avatar
Shane Snyder committed
141
142
    void **pointers, int *lengths, int count, char *comp_buf,
    int *comp_buf_length);
143
static int darshan_log_write_name_record_hash(
144
    MPI_File log_fh, struct darshan_core_runtime *core,
145
146
147
    uint64_t *inout_off);
static int darshan_log_append_all(
    MPI_File log_fh, struct darshan_core_runtime *core, void *buf,
Shane Snyder's avatar
Shane Snyder committed
148
    int count, uint64_t *inout_off);
Shane Snyder's avatar
Shane Snyder committed
149
150
static void darshan_core_cleanup(
    struct darshan_core_runtime* core);
151

152
153
/* *********************************** */

Shane Snyder's avatar
Shane Snyder committed
154
void darshan_core_initialize(int argc, char **argv)
155
{
156
    struct darshan_core_runtime *init_core = NULL;
157
158
    int internal_timing_flag = 0;
    double init_start, init_time, init_max;
159
    char *envstr;
160
161
    char *jobid_str;
    int jobid;
162
    int ret;
163
    int i;
164
165
    int tmpval;
    double tmpfloat;
166

167
168
    PMPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    PMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
169
170
171
172

    if(getenv("DARSHAN_INTERNAL_TIMING"))
        internal_timing_flag = 1;

173
    if(internal_timing_flag)
174
        init_start = PMPI_Wtime();
175
176

    /* setup darshan runtime if darshan is enabled and hasn't been initialized already */
177
    if(!getenv("DARSHAN_DISABLE") && !darshan_core)
178
    {
179
        #if (__DARSHAN_MEM_ALIGNMENT < 1)
180
181
            #error Darshan must be configured with a positive value for --with-mem-align
        #endif
182
        envstr = getenv(DARSHAN_MEM_ALIGNMENT_OVERRIDE);
183
184
185
186
187
188
189
190
191
192
193
        if(envstr)
        {
            ret = sscanf(envstr, "%d", &tmpval);
            /* silently ignore if the env variable is set poorly */
            if(ret == 1 && tmpval > 0)
            {
                darshan_mem_alignment = tmpval;
            }
        }
        else
        {
194
            darshan_mem_alignment = __DARSHAN_MEM_ALIGNMENT;
195
196
197
        }

        /* avoid floating point errors on faulty input */
198
        if(darshan_mem_alignment < 1)
199
200
201
        {
            darshan_mem_alignment = 1;
        }
202

203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
        /* Use DARSHAN_JOBID_OVERRIDE for the env var for __DARSHAN_JOBID */
        envstr = getenv(DARSHAN_JOBID_OVERRIDE);
        if(!envstr)
        {
            envstr = __DARSHAN_JOBID;
        }

        /* find a job id */
        jobid_str = getenv(envstr);
        if(jobid_str)
        {
            /* in cobalt we can find it in env var */
            ret = sscanf(jobid_str, "%d", &jobid);
        }
        if(!jobid_str || ret != 1)
        {
            /* use pid as fall back */
            jobid = getpid();
        }

223
224
225
226
        /* set the memory quota for darshan modules' records */
        envstr = getenv(DARSHAN_MOD_MEM_OVERRIDE);
        if(envstr)
        {
227
            ret = sscanf(envstr, "%lf", &tmpfloat);
228
            /* silently ignore if the env variable is set poorly */
229
            if(ret == 1 && tmpfloat > 0)
230
            {
231
                darshan_mod_mem_quota = tmpfloat * 1024 * 1024; /* convert from MiB */
232
233
234
            }
        }

235
236
237
        /* allocate structure to track darshan core runtime information */
        init_core = malloc(sizeof(*init_core));
        if(init_core)
238
        {
239
            memset(init_core, 0, sizeof(*init_core));
240
            init_core->wtime_offset = PMPI_Wtime();
241

242
243
        /* TODO: do we alloc new memory as we go or just do everything up front? */

244
245
246
247
248
#ifndef __DARSHAN_ENABLE_MMAP_LOGS
            /* just allocate memory for each log file region */
            init_core->log_hdr_p = malloc(sizeof(struct darshan_header));
            init_core->log_job_p = malloc(sizeof(struct darshan_job));
            init_core->log_exemnt_p = malloc(DARSHAN_EXE_LEN+1);
249
            init_core->log_name_p = malloc(DARSHAN_NAME_RECORD_BUF_SIZE);
250
            init_core->log_mod_p = malloc(darshan_mod_mem_quota);
251
252

            if(!(init_core->log_hdr_p) || !(init_core->log_job_p) ||
253
               !(init_core->log_exemnt_p) || !(init_core->log_name_p) ||
254
255
256
257
258
259
260
261
262
               !(init_core->log_mod_p))
            {
                free(init_core);
                return;
            }
            /* if allocation succeeds, zero fill memory regions */
            memset(init_core->log_hdr_p, 0, sizeof(struct darshan_header));
            memset(init_core->log_job_p, 0, sizeof(struct darshan_job));
            memset(init_core->log_exemnt_p, 0, DARSHAN_EXE_LEN+1);
263
            memset(init_core->log_name_p, 0, DARSHAN_NAME_RECORD_BUF_SIZE);
264
            memset(init_core->log_mod_p, 0, darshan_mod_mem_quota);
265
266
267
268
#else
            /* if mmap logs are enabled, we need to initialize the mmap region
             * before setting the corresponding log file region pointers
             */
269
270
            void *mmap_p = darshan_init_mmap_log(init_core, jobid);
            if(!mmap_p)
271
            {
272
273
                free(init_core);
                return;
274
275
            }

276
            /* set the memory pointers for each log file region */
277
            init_core->log_hdr_p = (struct darshan_header *)mmap_p;
278
            init_core->log_job_p = (struct darshan_job *)
279
                ((char *)init_core->log_hdr_p + sizeof(struct darshan_header));
280
            init_core->log_exemnt_p = (char *)
281
                ((char *)init_core->log_job_p + sizeof(struct darshan_job));
282
            init_core->log_name_p = (void *)
283
284
                ((char *)init_core->log_exemnt_p + DARSHAN_EXE_LEN + 1);
            init_core->log_mod_p = (void *)
285
                ((char *)init_core->log_name_p + DARSHAN_NAME_RECORD_BUF_SIZE);
286

287
            /* set header fields needed for the mmap log mechanism */
288
            init_core->log_hdr_p->comp_type = DARSHAN_NO_COMP;
289
            init_core->log_hdr_p->name_map.off =
290
                ((char *)init_core->log_name_p - (char *)init_core->log_hdr_p);
291
292
293
294
295
#endif

            /* set known header fields for the log file */
            strcpy(init_core->log_hdr_p->version_string, DARSHAN_LOG_VERSION);
            init_core->log_hdr_p->magic_nr = DARSHAN_MAGIC_NR;
296

297
298
299
300
301
            /* set known job-level metadata fields for the log file */
            init_core->log_job_p->uid = getuid();
            init_core->log_job_p->start_time = time(NULL);
            init_core->log_job_p->nprocs = nprocs;
            init_core->log_job_p->jobid = (int64_t)jobid;
302
303
304
305
306
307

            /* if we are using any hints to write the log file, then record those
             * hints with the darshan job information
             */
            darshan_log_record_hints_and_ver(init_core);

308
            /* collect information about command line and mounted file systems */
309
            darshan_get_exe_and_mounts(init_core, argc, argv);
310

311
312
313
314
            /* if darshan was successfully initialized, set the global pointer
             * and bootstrap any modules with static initialization routines
             */
            DARSHAN_CORE_LOCK();
315
            darshan_core = init_core;
316
317
318
319
320
321
322
323
            DARSHAN_CORE_UNLOCK();

            i = 0;
            while(mod_static_init_fns[i])
            {
                (*mod_static_init_fns[i])();
                i++;
            }
324
        }
325
326
    }

327
328
    if(internal_timing_flag)
    {
329
330
        init_time = PMPI_Wtime() - init_start;
        PMPI_Reduce(&init_time, &init_max, 1,
331
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
332
        if(my_rank == 0)
333
        {
334
335
            darshan_core_fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
            darshan_core_fprintf(stderr, "darshan:init\t%d\t%f\n", nprocs, init_max);
336
337
338
339
340
341
        }
    }

    return;
}

Shane Snyder's avatar
Shane Snyder committed
342
void darshan_core_shutdown()
343
{
344
    struct darshan_core_runtime *final_core;
345
    int internal_timing_flag = 0;
346
347
    struct tm *start_tm;
    time_t start_time_tmp;
348
    int64_t first_start_time;
349
    int64_t last_end_time;
350
    double start_log_time;
351
    double tm_end;
352
353
354
    double open1 = 0, open2 = 0;
    double job1 = 0, job2 = 0;
    double rec1 = 0, rec2 = 0;
355
356
    double mod1[DARSHAN_MAX_MODS] = {0};
    double mod2[DARSHAN_MAX_MODS] = {0};
357
    double header1 = 0, header2 = 0;
358
359
360
361
362
363
364
365
366
367
368
369
    char *logfile_name;
    int local_mod_use[DARSHAN_MAX_MODS] = {0};
    int global_mod_use_count[DARSHAN_MAX_MODS] = {0};
    darshan_record_id *shared_recs;
    darshan_record_id *mod_shared_recs;
    int shared_rec_cnt = 0;
    int ret = 0;
    int all_ret = 0;
    int i;
    uint64_t gz_fp = 0;
    MPI_File log_fh;
    MPI_Status status;
370
371
372
373

    if(getenv("DARSHAN_INTERNAL_TIMING"))
        internal_timing_flag = 1;

374
    /* synchronize before getting start time */
375
376
    PMPI_Barrier(MPI_COMM_WORLD);
    start_log_time = PMPI_Wtime();
377

Shane Snyder's avatar
Shane Snyder committed
378
    /* disable darhan-core while we shutdown */
379
    DARSHAN_CORE_LOCK();
380
    if(!darshan_core)
381
    {
382
        DARSHAN_CORE_UNLOCK();
383
384
        return;
    }
385
386
    final_core = darshan_core;
    darshan_core = NULL;
387
388
    DARSHAN_CORE_UNLOCK();

389
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
390
391
392
393
394
    /* remove the temporary mmap log files */
    /* NOTE: this unlink is not immediate as it must wait for the mapping
     * to no longer be referenced, which in our case happens when the
     * executable exits. If the application terminates mid-shutdown, then
     * there will be no mmap files and no final log file.
395
     */
396
    unlink(final_core->mmap_log_name);
397
#endif
Shane Snyder's avatar
Shane Snyder committed
398

399
400
401
    final_core->log_job_p->end_time = time(NULL);

    /* reduce to report first start and last end time across all ranks at rank 0 */
402
    PMPI_Reduce(&final_core->log_job_p->start_time, &first_start_time,
403
        1, MPI_INT64_T, MPI_MIN, 0, MPI_COMM_WORLD);
404
    PMPI_Reduce(&final_core->log_job_p->end_time, &last_end_time,
405
406
        1, MPI_INT64_T, MPI_MAX, 0, MPI_COMM_WORLD);
    if(my_rank == 0)
Shane Snyder's avatar
Shane Snyder committed
407
    {
408
409
        final_core->log_job_p->start_time = first_start_time;
        final_core->log_job_p->end_time = last_end_time;
Shane Snyder's avatar
Shane Snyder committed
410
    }
411

412
    final_core->comp_buf = malloc(darshan_mod_mem_quota);
413
414
415
416
417
    if(!(final_core->comp_buf))
    {
        darshan_core_cleanup(final_core);
        return;
    }
418

419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
    logfile_name = malloc(PATH_MAX);
    if(!logfile_name)
    {
        darshan_core_cleanup(final_core);
        return;
    }

    /* set the log file name on rank 0 */
    if(my_rank == 0)
    {
        /* use human readable start time format in log filename */
        start_time_tmp = final_core->log_job_p->start_time;
        start_tm = localtime(&start_time_tmp);

        darshan_get_logfile_name(logfile_name, final_core->log_job_p->jobid, start_tm);
    }

    /* broadcast log file name */
437
    PMPI_Bcast(logfile_name, PATH_MAX, MPI_CHAR, 0,
438
439
440
441
442
        MPI_COMM_WORLD);

    if(strlen(logfile_name) == 0)
    {
        /* failed to generate log file name */
443
        darshan_core_fprintf(stderr, "darshan library warning: unable to determine log file path\n");
444
445
446
447
        free(logfile_name);
        darshan_core_cleanup(final_core);
        return;
    }
448

449
450
451
452
453
    /* set which modules were registered locally, and call into
     * them to disable further instrumentation and to perform any
     * other pre-shutdown steps
     */
    for(i = 0; i < DARSHAN_MAX_MODS; i++)
454
    {
455
456
        if(final_core->mod_array[i])
            local_mod_use[i] = 1;
457
458
459
    }

    /* reduce the number of times a module was opened globally and bcast to everyone */
460
    PMPI_Allreduce(local_mod_use, global_mod_use_count,
461
462
463
464
465
466
        DARSHAN_MAX_MODS, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    /* get a list of records which are shared across all processes */
    darshan_get_shared_records(final_core, &shared_recs, &shared_rec_cnt);

    if(internal_timing_flag)
467
        open1 = PMPI_Wtime();
468
469
470
    /* collectively open the darshan log file */
    ret = darshan_log_open_all(logfile_name, &log_fh);
    if(internal_timing_flag)
471
        open2 = PMPI_Wtime();
472
473

    /* error out if unable to open log file */
474
    PMPI_Allreduce(&ret, &all_ret, 1, MPI_INT,
475
476
477
478
479
        MPI_LOR, MPI_COMM_WORLD);
    if(all_ret != 0)
    {
        if(my_rank == 0)
        {
480
            darshan_core_fprintf(stderr, "darshan library warning: unable to create log file %s\n",
481
482
483
484
485
486
487
488
                logfile_name);
        }
        free(logfile_name);
        darshan_core_cleanup(final_core);
        return;
    }

    if(internal_timing_flag)
489
        job1 = PMPI_Wtime();
490
491
492
    /* rank 0 is responsible for writing the compressed darshan job information */
    if(my_rank == 0)
    {
493
        void *pointers[2] = {final_core->log_job_p, final_core->log_exemnt_p};
494
495
496
497
498
499
500
501
        int lengths[2] = {sizeof(struct darshan_job), strlen(final_core->log_exemnt_p)};
        int comp_buf_sz = 0;

        /* compress the job info and the trailing mount/exe data */
        all_ret = darshan_deflate_buffer(pointers, lengths, 2,
            final_core->comp_buf, &comp_buf_sz);
        if(all_ret)
        {
502
            darshan_core_fprintf(stderr, "darshan library warning: unable to compress job data\n");
503
504
505
506
507
508
            unlink(logfile_name);
        }
        else
        {
            /* write the job information, preallocing space for the log header */
            gz_fp += sizeof(struct darshan_header);
509
            all_ret = PMPI_File_write_at(log_fh, gz_fp,
510
511
512
                final_core->comp_buf, comp_buf_sz, MPI_BYTE, &status);
            if(all_ret != MPI_SUCCESS)
            {
513
                darshan_core_fprintf(stderr,
514
515
516
517
518
519
520
521
522
523
                        "darshan library warning: unable to write job data to log file %s\n",
                        logfile_name);
                unlink(logfile_name);

            }
            gz_fp += comp_buf_sz;
        }
    }

    /* error out if unable to write job information */
524
    PMPI_Bcast(&all_ret, 1, MPI_INT, 0, MPI_COMM_WORLD);
525
526
527
528
529
530
531
    if(all_ret != 0)
    {
        free(logfile_name);
        darshan_core_cleanup(final_core);
        return;
    }
    if(internal_timing_flag)
532
        job2 = PMPI_Wtime();
533
534

    if(internal_timing_flag)
535
        rec1 = PMPI_Wtime();
536
    /* write the record name->id hash to the log file */
537
    final_core->log_hdr_p->name_map.off = gz_fp;
538
    ret = darshan_log_write_name_record_hash(log_fh, final_core, &gz_fp);
539
    final_core->log_hdr_p->name_map.len = gz_fp - final_core->log_hdr_p->name_map.off;
540

541
    /* error out if unable to write the name record hash */
542
    PMPI_Allreduce(&ret, &all_ret, 1, MPI_INT,
543
544
545
546
547
        MPI_LOR, MPI_COMM_WORLD);
    if(all_ret != 0)
    {
        if(my_rank == 0)
        {
548
            darshan_core_fprintf(stderr,
549
550
551
552
553
554
555
556
557
                "darshan library warning: unable to write record hash to log file %s\n",
                logfile_name);
            unlink(logfile_name);
        }
        free(logfile_name);
        darshan_core_cleanup(final_core);
        return;
    }
    if(internal_timing_flag)
558
        rec2 = PMPI_Wtime();
559
560
561
562
563
564
565
566

    mod_shared_recs = malloc(shared_rec_cnt * sizeof(darshan_record_id));
    assert(mod_shared_recs);

    /* loop over globally used darshan modules and:
     *      - get final output buffer
     *      - compress (zlib) provided output buffer
     *      - append compressed buffer to log file
567
     *      - add module map info (file offset/length) to log header
568
569
570
571
572
     *      - shutdown the module
     */
    for(i = 0; i < DARSHAN_MAX_MODS; i++)
    {
        struct darshan_core_module* this_mod = final_core->mod_array[i];
573
        struct darshan_core_name_record_ref *ref = NULL;
574
575
576
577
578
579
580
581
582
        int mod_shared_rec_cnt = 0;
        void* mod_buf = NULL;
        int mod_buf_sz = 0;
        int j;

        if(global_mod_use_count[i] == 0)
        {
            if(my_rank == 0)
            {
583
584
                final_core->log_hdr_p->mod_map[i].off = 0;
                final_core->log_hdr_p->mod_map[i].len = 0;
585
586
587
588
589
            }
            continue;
        }

        if(internal_timing_flag)
590
            mod1[i] = PMPI_Wtime();
591

592
        /* set the shared record list for this module */
593
594
        for(j = 0; j < shared_rec_cnt; j++)
        {
595
            HASH_FIND(hlink, final_core->name_hash, &shared_recs[j],
596
597
598
599
600
601
602
603
604
605
606
                sizeof(darshan_record_id), ref);
            assert(ref);
            if(DARSHAN_MOD_FLAG_ISSET(ref->global_mod_flags, i))
            {
                mod_shared_recs[mod_shared_rec_cnt++] = shared_recs[j];
            }
        }

        /* if module is registered locally, get the corresponding output buffer
         * 
         * NOTE: this function can be used to run collective operations across
607
         * modules, if there are records shared globally.
608
609
610
         */
        if(this_mod)
        {
611
612
            mod_buf = final_core->mod_array[i]->rec_buf_start;
            mod_buf_sz = final_core->mod_array[i]->rec_buf_p - mod_buf;
613
            this_mod->mod_shutdown_func(MPI_COMM_WORLD, mod_shared_recs,
614
615
616
617
                mod_shared_rec_cnt, &mod_buf, &mod_buf_sz);
        }

        /* append this module's data to the darshan log */
618
        final_core->log_hdr_p->mod_map[i].off = gz_fp;
619
        ret = darshan_log_append_all(log_fh, final_core, mod_buf, mod_buf_sz, &gz_fp);
620
621
        final_core->log_hdr_p->mod_map[i].len =
            gz_fp - final_core->log_hdr_p->mod_map[i].off;
622

623
624
625
626
627
628
        /* XXX: DXT manages its own module memory buffers, so we need to
         * explicitly free them
         */
        if(i == DXT_POSIX_MOD || i == DXT_MPIIO_MOD)
            free(mod_buf);

629
        /* error out if the log append failed */
630
        PMPI_Allreduce(&ret, &all_ret, 1, MPI_INT,
631
632
633
634
635
            MPI_LOR, MPI_COMM_WORLD);
        if(all_ret != 0)
        {
            if(my_rank == 0)
            {
636
                darshan_core_fprintf(stderr,
637
638
639
640
641
642
643
644
645
646
                    "darshan library warning: unable to write %s module data to log file %s\n",
                    darshan_module_names[i], logfile_name);
                unlink(logfile_name);
            }
            free(logfile_name);
            darshan_core_cleanup(final_core);
            return;
        }

        if(internal_timing_flag)
647
            mod2[i] = PMPI_Wtime();
648
649
650
    }

    if(internal_timing_flag)
651
        header1 = PMPI_Wtime();
652
    /* write out log header, after running 2 reductions on header variables:
653
     *  1) reduce 'partial_flag' variable to determine which modules ran out
654
     *     of memory for storing data
655
656
657
658
659
660
     *  2) reduce 'mod_ver' array to determine which log format version each
     *     module used for this output log
     */
    if(my_rank == 0)
    {
        /* rank 0 is responsible for writing the log header */
661
662
        final_core->log_hdr_p->comp_type = DARSHAN_ZLIB_COMP;

663
        PMPI_Reduce(
664
665
            MPI_IN_PLACE, &(final_core->log_hdr_p->partial_flag),
            1, MPI_UINT32_T, MPI_BOR, 0, MPI_COMM_WORLD);
666
        PMPI_Reduce(
667
668
            MPI_IN_PLACE, &(final_core->log_hdr_p->mod_ver),
            DARSHAN_MAX_MODS, MPI_UINT32_T, MPI_MAX, 0, MPI_COMM_WORLD);
669

670
        all_ret = PMPI_File_write_at(log_fh, 0, final_core->log_hdr_p,
671
672
673
            sizeof(struct darshan_header), MPI_BYTE, &status);
        if(all_ret != MPI_SUCCESS)
        {
674
            darshan_core_fprintf(stderr, "darshan library warning: unable to write header to log file %s\n",
675
676
677
678
                    logfile_name);
            unlink(logfile_name);
        }
    }
679
680
    else
    {
681
        PMPI_Reduce(
682
683
            &(final_core->log_hdr_p->partial_flag), &(final_core->log_hdr_p->partial_flag),
            1, MPI_UINT32_T, MPI_BOR, 0, MPI_COMM_WORLD);
684
        PMPI_Reduce(
685
686
687
            &(final_core->log_hdr_p->mod_ver), &(final_core->log_hdr_p->mod_ver),
            DARSHAN_MAX_MODS, MPI_UINT32_T, MPI_MAX, 0, MPI_COMM_WORLD);
    }
688
689

    /* error out if unable to write log header */
690
    PMPI_Bcast(&all_ret, 1, MPI_INT, 0, MPI_COMM_WORLD);
691
692
693
694
695
696
697
    if(all_ret != 0)
    {
        free(logfile_name);
        darshan_core_cleanup(final_core);
        return;
    }
    if(internal_timing_flag)
698
        header2 = PMPI_Wtime();
699

700
    PMPI_File_close(&log_fh);
701
702
703
704
705
706
707

    /* if we got this far, there are no errors, so rename from *.darshan_partial
     * to *-<logwritetime>.darshan, which indicates that this log file is
     * complete and ready for analysis
     */
    if(my_rank == 0)
    {
708
        mode_t chmod_mode = S_IRUSR;
709
#ifdef __DARSHAN_GROUP_READABLE_LOGS
710
        chmod_mode |= S_IRGRP;
711
#endif
712
713
714
715

        if(getenv("DARSHAN_LOGFILE"))
        {
            chmod(logfile_name, chmod_mode);
716
717
718
719
720
721
722
723
724
725
726
        }
        else
        {
            char* tmp_index;
            double end_log_time;
            char* new_logfile_name;

            new_logfile_name = malloc(PATH_MAX);
            if(new_logfile_name)
            {
                new_logfile_name[0] = '\0';
727
                end_log_time = PMPI_Wtime();
728
729
730
731
732
                strcat(new_logfile_name, logfile_name);
                tmp_index = strstr(new_logfile_name, ".darshan_partial");
                sprintf(tmp_index, "_%d.darshan", (int)(end_log_time-start_log_time+1));
                rename(logfile_name, new_logfile_name);
                /* set permissions on log file */
733
                chmod(new_logfile_name, chmod_mode);
734
735
736
737
738
739
                free(new_logfile_name);
            }
        }
    }

    free(logfile_name);
740
741
    free(shared_recs);
    free(mod_shared_recs);
742
    darshan_core_cleanup(final_core);
743

744
    if(internal_timing_flag)
745
    {
746
747
748
749
750
        double open_tm, open_slowest;
        double header_tm, header_slowest;
        double job_tm, job_slowest;
        double rec_tm, rec_slowest;
        double mod_tm[DARSHAN_MAX_MODS], mod_slowest[DARSHAN_MAX_MODS];
751
752
        double all_tm, all_slowest;

753
        tm_end = PMPI_Wtime();
754

755
756
757
758
        open_tm = open2 - open1;
        header_tm = header2 - header1;
        job_tm = job2 - job1;
        rec_tm = rec2 - rec1;
759
        all_tm = tm_end - start_log_time;
760
761
762
763
        for(i = 0;i < DARSHAN_MAX_MODS; i++)
        {
            mod_tm[i] = mod2[i] - mod1[i];
        }
764

765
        PMPI_Reduce(&open_tm, &open_slowest, 1,
766
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
767
        PMPI_Reduce(&header_tm, &header_slowest, 1,
768
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
769
        PMPI_Reduce(&job_tm, &job_slowest, 1,
770
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
771
        PMPI_Reduce(&rec_tm, &rec_slowest, 1,
772
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
773
        PMPI_Reduce(&all_tm, &all_slowest, 1,
774
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
775
        PMPI_Reduce(mod_tm, mod_slowest, DARSHAN_MAX_MODS,
776
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
777
778
779

        if(my_rank == 0)
        {
780
781
782
783
784
            darshan_core_fprintf(stderr, "#darshan:<op>\t<nprocs>\t<time>\n");
            darshan_core_fprintf(stderr, "darshan:log_open\t%d\t%f\n", nprocs, open_slowest);
            darshan_core_fprintf(stderr, "darshan:job_write\t%d\t%f\n", nprocs, job_slowest);
            darshan_core_fprintf(stderr, "darshan:hash_write\t%d\t%f\n", nprocs, rec_slowest);
            darshan_core_fprintf(stderr, "darshan:header_write\t%d\t%f\n", nprocs, header_slowest);
785
786
787
            for(i = 0; i < DARSHAN_MAX_MODS; i++)
            {
                if(global_mod_use_count[i])
788
                    darshan_core_fprintf(stderr, "darshan:%s_shutdown\t%d\t%f\n", darshan_module_names[i],
789
790
                        nprocs, mod_slowest[i]);
            }
791
            darshan_core_fprintf(stderr, "darshan:core_shutdown\t%d\t%f\n", nprocs, all_slowest);
792
        }
793
    }
794

795
796
    return;
}
797

Shane Snyder's avatar
Shane Snyder committed
798
/* *********************************** */
799

800
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
801
802
803
804
805
806
807
static void *darshan_init_mmap_log(struct darshan_core_runtime* core, int jobid)
{
    int ret;
    int mmap_fd;
    int mmap_size;
    int sys_page_size;
    char cuser[L_cuserid] = {0};
808
809
810
    uint64_t hlevel;
    char hname[HOST_NAME_MAX];
    uint64_t logmod;
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
    char *envstr;
    char *mmap_log_path;
    void *mmap_p;

    sys_page_size = sysconf(_SC_PAGESIZE);
    assert(sys_page_size > 0);

    mmap_size = sizeof(struct darshan_header) + DARSHAN_JOB_RECORD_SIZE +
        + DARSHAN_NAME_RECORD_BUF_SIZE + darshan_mod_mem_quota;
    if(mmap_size % sys_page_size)
        mmap_size = ((mmap_size / sys_page_size) + 1) * sys_page_size;

    envstr = getenv(DARSHAN_MMAP_LOG_PATH_OVERRIDE);
    if(envstr)
        mmap_log_path = envstr;
    else
        mmap_log_path = DARSHAN_DEF_MMAP_LOG_PATH;

    darshan_get_user_name(cuser);

831
832
833
834
835
836
837
    /* generate a random number to help differentiate the temporary log */
    /* NOTE: job id is not sufficient for constructing a unique log file name,
     * since a job could be composed of multiple application runs, so we also
     * add a random number component to the log name
     */
    if(my_rank == 0)
    {
838
        hlevel=PMPI_Wtime() * 1000000;
839
840
841
        (void)gethostname(hname, sizeof(hname));
        logmod = darshan_hash((void*)hname,strlen(hname),hlevel);
    }
842
    PMPI_Bcast(&logmod, 1, MPI_UINT64_T, 0, MPI_COMM_WORLD);
843

844
845
846
847
    /* construct a unique temporary log file name for this process
     * to write mmap log data to
     */
    snprintf(core->mmap_log_name, PATH_MAX,
848
849
        "/%s/%s_%s_id%d_mmap-log-%" PRIu64 "-%d.darshan",
        mmap_log_path, cuser, __progname, jobid, logmod, my_rank);
850
851
852
853
854

    /* create the temporary mmapped darshan log */
    mmap_fd = open(core->mmap_log_name, O_CREAT|O_RDWR|O_EXCL , 0644);
    if(mmap_fd < 0)
    {
855
        darshan_core_fprintf(stderr, "darshan library warning: "
856
857
858
859
860
861
862
863
864
            "unable to create darshan log file %s\n", core->mmap_log_name);
        return(NULL);
    }

    /* TODO: ftruncate or just zero fill? */
    /* allocate the necessary space in the log file */
    ret = ftruncate(mmap_fd, mmap_size);
    if(ret < 0)
    {
865
        darshan_core_fprintf(stderr, "darshan library warning: "
866
867
868
869
870
871
872
873
874
875
876
877
            "unable to allocate darshan log file %s\n", core->mmap_log_name);
        close(mmap_fd);
        unlink(core->mmap_log_name);
        return(NULL);
    }

    /* create the memory map for darshan's data structures so they are
     * persisted to file as the application executes
     */
    mmap_p = mmap(NULL, mmap_size, PROT_WRITE, MAP_SHARED, mmap_fd, 0);
    if(mmap_p == MAP_FAILED)
    {
878
        darshan_core_fprintf(stderr, "darshan library warning: "
879
880
881
882
883
884
885
886
887
888
889
            "unable to mmap darshan log file %s\n", core->mmap_log_name);
        close(mmap_fd);
        unlink(core->mmap_log_name);
        return(NULL);
    }

    /* close darshan log file (this does *not* unmap the log file) */
    close(mmap_fd);

    return(mmap_p);
}
890
#endif
891

892
/* record any hints used to write the darshan log in the job data */
893
static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* core)
894
895
{
    char* hints;
896
    char* job_hints;
897
898
899
900
901
902
    int meta_remain = 0;
    char* m;

    /* check environment variable to see if the default MPI file hints have
     * been overridden
     */
903
    hints = getenv(DARSHAN_LOG_HINTS_OVERRIDE);
904
905
    if(!hints)
    {
906
        hints = __DARSHAN_LOG_HINTS;
907
908
909
910
911
    }

    if(!hints || strlen(hints) < 1)
        return;

912
913
    job_hints = strdup(hints);
    if(!job_hints)
914
915
916
        return;

    meta_remain = DARSHAN_JOB_METADATA_LEN -
917
        strlen(core->log_job_p->metadata) - 1;
918
919
    if(meta_remain >= (strlen(PACKAGE_VERSION) + 9))
    {
920
        sprintf(core->log_job_p->metadata, "lib_ver=%s\n", PACKAGE_VERSION);
921
922
        meta_remain -= (strlen(PACKAGE_VERSION) + 9);
    }
923
    if(meta_remain >= (3 + strlen(job_hints)))
924
    {
925
        m = core->log_job_p->metadata + strlen(core->log_job_p->metadata);
926
        /* We have room to store the hints in the metadata portion of
927
         * the job structure.  We just prepend an h= to the hints list.  The
928
929
930
         * metadata parser will ignore = characters that appear in the value
         * portion of the metadata key/value pair.
         */
931
        sprintf(m, "h=%s\n", job_hints);
932
    }
933
    free(job_hints);
934
935
936
937

    return;
}

938
939
940
941
942
943
944
945
946
947
948
949
950
951
/* comparison callback for struct mnt_data entries: an entry with a longer
 * path orders before one with a shorter path, and entries whose paths have
 * the same length compare as equal
 */
static int mnt_data_cmp(const void* a, const void* b)
{
    const struct mnt_data *lhs = a;
    const struct mnt_data *rhs = b;
    size_t lhs_len = strlen(lhs->path);
    size_t rhs_len = strlen(rhs->path);

    if(lhs_len == rhs_len)
        return(0);

    /* longer path first */
    return((lhs_len > rhs_len) ? -1 : 1);
}

/* adds an entry to table of mounted file systems */
952
static void add_entry(char* buf, int* space_left, struct mntent* entry)
953
{
954
    int i;
955
956
957
958
    int ret;
    char tmp_mnt[256];
    struct statfs statfsbuf;

959
960
961
962
963
964
965
966
967
968
    /* avoid adding the same mount points multiple times -- to limit
     * storage space and potential statfs, ioctl, etc calls
     */
    for(i = 0; i < mnt_data_count; i++)
    {
        if((strncmp(mnt_data_array[i].path, entry->mnt_dir, DARSHAN_MAX_MNT_PATH) == 0) &&
           (strncmp(mnt_data_array[i].type, entry->mnt_type, DARSHAN_MAX_MNT_PATH) == 0))
            return;
    }

969
970
971
972
973
974
975
976
977
978
979
980
    strncpy(mnt_data_array[mnt_data_count].path, entry->mnt_dir,
        DARSHAN_MAX_MNT_PATH-1);
    strncpy(mnt_data_array[mnt_data_count].type, entry->mnt_type,
        DARSHAN_MAX_MNT_TYPE-1);
    /* NOTE: we now try to detect the preferred block size for each file
     * system using statfs().  On Lustre we assume a size of 1 MiB
     * because statfs() reports 4 KiB.
     */
#ifndef LL_SUPER_MAGIC
#define LL_SUPER_MAGIC 0x0BD00BD0
#endif
    ret = statfs(entry->mnt_dir, &statfsbuf);
981
    mnt_data_array[mnt_data_count].fs_info.fs_type = statfsbuf.f_type;
982
    if(ret == 0 && statfsbuf.f_type != LL_SUPER_MAGIC)
983
        mnt_data_array[mnt_data_count].fs_info.block_size = statfsbuf.f_bsize;
984
    else if(ret == 0 && statfsbuf.f_type == LL_SUPER_MAGIC)
985
        mnt_data_array[mnt_data_count].fs_info.block_size = 1024*1024;
986
    else
987
        mnt_data_array[mnt_data_count].fs_info.block_size = 4096;
988

989
#ifdef DARSHAN_LUSTRE
990
991
992
993
994
995
996
997
998
999
1000
    /* attempt to retrieve OST and MDS counts from Lustre */
    mnt_data_array[mnt_data_count].fs_info.ost_count = -1;
    mnt_data_array[mnt_data_count].fs_info.mdt_count = -1;
    if ( statfsbuf.f_type == LL_SUPER_MAGIC )
    {
        int n_ost, n_mdt;
        int ret_ost, ret_mdt;
        DIR *mount_dir;

        mount_dir = opendir( entry->mnt_dir );
        if ( mount_dir  ) 
For faster browsing, not all history is shown. View entire blame