darshan-lustre.c 17.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Copyright (C) 2015 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
 */

#define _XOPEN_SOURCE 500
#define _GNU_SOURCE

#include "darshan-runtime-config.h"
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <time.h>
#include <stdlib.h>
#include <assert.h>
#include <pthread.h>
19
20
#include <limits.h>
#include <sys/xattr.h>
21

22
#include <lustre/lustreapi.h>
23
24
25

#include "darshan.h"
#include "darshan-dynamic.h"
26
#include "darshan-lustre.h"
27

28
29
30
static void lustre_runtime_initialize(
    void);
static void lustre_subtract_shared_rec_size(
31
    void *rec_ref_p, void *user_ptr);
32
static void lustre_set_rec_ref_pointers(
33
    void *rec_ref_p, void *user_ptr);
34
35
36
37
38
static int lustre_record_compare(
    const void* a_p, const void* b_p);
int sort_lustre_records(
    void);

39
40
41
42
43
#ifdef HAVE_MPI
static void lustre_mpi_redux(
    void *lustre_buf, MPI_Comm mod_comm,
    darshan_record_id *shared_recs, int shared_rec_count);
#endif
44
static void lustre_shutdown(
45
    void **lustre_buf, int *lustre_buf_sz);
46

47
struct lustre_runtime *lustre_runtime = NULL;
48
49
50
51
52
53
static pthread_mutex_t lustre_runtime_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
static int my_rank = -1;

#define LUSTRE_LOCK() pthread_mutex_lock(&lustre_runtime_mutex)
#define LUSTRE_UNLOCK() pthread_mutex_unlock(&lustre_runtime_mutex)

54
void darshan_instrument_lustre_file(const char* filepath, int fd)
55
{
56
    struct lustre_record_ref *rec_ref;
57
    struct darshan_lustre_record *rec;
58
    struct darshan_fs_info fs_info;
59
    darshan_record_id rec_id;
60
    int i;
61
62
63
64
65
66
    void *lustre_xattr_val;
    size_t lustre_xattr_size = XATTR_SIZE_MAX;
    struct llapi_layout *lustre_layout;
    uint64_t stripe_size;
    uint64_t stripe_count;
    uint64_t tmp_ost;
67
    size_t rec_size;
68
    int ret;
69
70

    LUSTRE_LOCK();
71

72
73
    /* try to init module if not already */
    if(!lustre_runtime) lustre_runtime_initialize();
74
75

    /* if we aren't initialized, just back out */
76
77
78
79
80
    if(!lustre_runtime)
    {
        LUSTRE_UNLOCK();
        return;
    }
81

82
    /* search the hash table for this file record, and initialize if not found */
83
    rec_id = darshan_core_gen_record_id(filepath);
84
85
86
    rec_ref = darshan_lookup_record_ref(lustre_runtime->record_id_hash,
        &rec_id, sizeof(darshan_record_id));
    if(!rec_ref)
87
    {
88
        if ( (lustre_xattr_val = calloc(1, lustre_xattr_size)) == NULL )
89
90
91
92
93
        {
            LUSTRE_UNLOCK();
            return;
        }

94
95
96
97
98
99
100
101
102
        /* -1 means fgetxattr failed, likely because file isn't on Lustre, but maybe because
         * the Lustre version doesn't support this method of obtaining striping info
         */
        if ( (lustre_xattr_size = fgetxattr( fd, "lustre.lov", lustre_xattr_val, lustre_xattr_size)) == -1 )
        {
            free(lustre_xattr_val);
            LUSTRE_UNLOCK();
            return;
        }
103

104
105
        /* get corresponding Lustre file layout, then extract stripe params */
        if ( (lustre_layout = llapi_layout_get_by_xattr(lustre_xattr_val, lustre_xattr_size, 0)) == NULL)
106
        {
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
            free(lustre_xattr_val);
            LUSTRE_UNLOCK();
            return;
        }
        if (llapi_layout_stripe_size_get(lustre_layout, &stripe_size) == -1)
        {
            llapi_layout_free(lustre_layout);
            free(lustre_xattr_val);
            LUSTRE_UNLOCK();
            return;
        }
        if (llapi_layout_stripe_count_get(lustre_layout, &stripe_count) == -1)
        {
            llapi_layout_free(lustre_layout);
            free(lustre_xattr_val);
122
123
124
125
126
            LUSTRE_UNLOCK();
            return;
        }

        /* allocate and add a new record reference */
127
128
129
        rec_ref = malloc(sizeof(*rec_ref));
        if(!rec_ref)
        {
130
131
            llapi_layout_free(lustre_layout);
            free(lustre_xattr_val);
132
133
134
135
136
137
138
139
140
            LUSTRE_UNLOCK();
            return;
        }
    
        ret = darshan_add_record_ref(&(lustre_runtime->record_id_hash),
            &rec_id, sizeof(darshan_record_id), rec_ref);
        if(ret == 0)
        {
            free(rec_ref);
141
142
            llapi_layout_free(lustre_layout);
            free(lustre_xattr_val);
143
144
145
            LUSTRE_UNLOCK();
            return;
        }
146

147
        rec_size = LUSTRE_RECORD_SIZE( stripe_count );
148

149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
        /* register a Lustre file record with Darshan */
        fs_info.fs_type = -1;
        rec = darshan_core_register_record(
                rec_id,
                filepath,
                DARSHAN_LUSTRE_MOD,
                rec_size,
                &fs_info);

        /* if NULL, darshan has no more memory for instrumenting */
        if(rec == NULL)
        {
            darshan_delete_record_ref(&(lustre_runtime->record_id_hash),
                &rec_id, sizeof(darshan_record_id));
            free(rec_ref);
164
165
            llapi_layout_free(lustre_layout);
            free(lustre_xattr_val);
166
167
168
            LUSTRE_UNLOCK();
            return;
        }
169
170
171
172
173
174
175
176
177
178
179
180
181
182

        /* implicit assumption here that none of these counters will change
         * after the first time a file is opened.  This may not always be
         * true in the future */
        if ( fs_info.fs_type != -1 ) 
        {
            rec->counters[LUSTRE_OSTS] = fs_info.ost_count;
            rec->counters[LUSTRE_MDTS] = fs_info.mdt_count;
        }
        else
        {
            rec->counters[LUSTRE_OSTS] = -1;
            rec->counters[LUSTRE_MDTS] = -1;
        }
183

184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
        rec->counters[LUSTRE_STRIPE_SIZE] = stripe_size;
        rec->counters[LUSTRE_STRIPE_WIDTH] = stripe_count;
        rec->counters[LUSTRE_STRIPE_OFFSET] = -1; // no longer captured
        for ( i = 0; i < stripe_count; i++ )
        {
            if (llapi_layout_ost_index_get(lustre_layout, i, &tmp_ost) == -1)
            {
                darshan_delete_record_ref(&(lustre_runtime->record_id_hash),
                    &rec_id, sizeof(darshan_record_id));
                free(rec_ref);
                llapi_layout_free(lustre_layout);
                free(lustre_xattr_val);
                LUSTRE_UNLOCK();
                return;
            }
            rec->ost_ids[i] = (int64_t)tmp_ost;
        }
        free(lustre_xattr_val);
        llapi_layout_free(lustre_layout);
203

204
205
206
207
        rec->base_rec.id = rec_id;
        rec->base_rec.rank = my_rank;
        rec_ref->record = rec;
        rec_ref->record_size = rec_size;
208
        lustre_runtime->record_count++;
209
    }
210
211

    LUSTRE_UNLOCK();
212
213
214
    return;
}

215
216
static void lustre_runtime_initialize()
{
Philip Carns's avatar
Philip Carns committed
217
    size_t lustre_buf_size;
218
219
220
221
222
223
224
    darshan_module_funcs mod_funcs = {
#ifdef HAVE_MPI
        .mod_redux_func = &lustre_mpi_redux,
#endif
        .mod_shutdown_func = &lustre_shutdown
        };

225

226
227
228
229
    /* try and store a default number of records for this module, assuming
     * each file uses 64 OSTs
     */
    lustre_buf_size = DARSHAN_DEF_MOD_REC_COUNT * LUSTRE_RECORD_SIZE(64);
230
231
232
233

    /* register the lustre module with darshan-core */
    darshan_core_register_module(
        DARSHAN_LUSTRE_MOD,
234
        mod_funcs,
235
        &lustre_buf_size,
236
237
238
239
240
        &my_rank,
        NULL);

    lustre_runtime = malloc(sizeof(*lustre_runtime));
    if(!lustre_runtime)
241
    {
242
        darshan_core_unregister_module(DARSHAN_LUSTRE_MOD);
243
244
        return;
    }
245
    memset(lustre_runtime, 0, sizeof(*lustre_runtime));
246

247
248
249
    return;
}

250
251
252
253
/**************************************************************************
 * Functions exported by Lustre module for coordinating with darshan-core *
 **************************************************************************/

254
255
256
#ifdef HAVE_MPI
/*
 * Reduction hook registered with darshan-core.  Each globally shared record
 * that this rank also tracks is flagged by setting its rank to -1.  Only
 * shared_recs and shared_rec_count are used; the buffer and communicator
 * arguments are not touched.
 */
static void lustre_mpi_redux(
    void *posix_buf,
    MPI_Comm mod_comm,
    darshan_record_id *shared_recs,
    int shared_rec_count)
{
    struct lustre_record_ref *shared_ref;
    int reduce_shared;
    int ndx;

    LUSTRE_LOCK();
    assert(lustre_runtime);

    /* a shared file reduction only happens when there are globally shared
     * files and the DARSHAN_DISABLE_SHARED_REDUCTION environment variable
     * is not set
     */
    reduce_shared = (shared_rec_count != 0) &&
        (getenv("DARSHAN_DISABLE_SHARED_REDUCTION") == NULL);

    /* necessary initialization of shared records */
    for(ndx = 0; reduce_shared && ndx < shared_rec_count; ndx++)
    {
        shared_ref = darshan_lookup_record_ref(lustre_runtime->record_id_hash,
            &shared_recs[ndx], sizeof(darshan_record_id));

        /* As in other modules, it should not be possible to lose a
         * record after we have already performed a collective to
         * identify that it is shared with other ranks.  We print an
         * error msg and continue rather than asserting in this case,
         * though, see #243.
         */
        if(!shared_ref)
        {
            darshan_core_fprintf(stderr, "WARNING: unexpected condition in Darshan, possibly triggered by memory corruption.  Darshan log may be incorrect.\n");
            continue;
        }

        shared_ref->record->base_rec.rank = -1;
    }

    LUSTRE_UNLOCK();
    return;
}
#endif
295

296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
/*
 * Shutdown hook registered with darshan-core.
 *
 * lustre_buf:    in  - address of the module's record buffer
 * lustre_buf_sz: in  - size in bytes of that buffer
 *                out - size after shared records were dropped on non-root
 *                      ranks
 *
 * Sorts the records, trims shared records on non-root ranks, then releases
 * all module state and resets lustre_runtime to NULL.
 */
static void lustre_shutdown(
    void **lustre_buf,
    int *lustre_buf_sz)
{
    int sort_failed;

    LUSTRE_LOCK();
    assert(lustre_runtime);

    lustre_runtime->record_buffer = *lustre_buf;
    lustre_runtime->record_buffer_size = *lustre_buf_sz;

    /* sort the array of files descending by rank so that we get all of the
     * shared files (marked by rank -1) in a contiguous portion at end
     * of the array
     */
    sort_failed = sort_lustre_records();

    if (sort_failed)
    {
        /* sort_lustre_records() only fails when it cannot allocate memory.
         * Without the sort the shared records are not known to sit at the
         * end of the buffer, so shrinking the buffer would cut off
         * unrelated data; keep the whole buffer instead.
         */
        darshan_core_fprintf(stderr, "WARNING: Darshan could not sort Lustre records (out of memory); shared records will not be dropped on this rank.\n");
    }
    else if (my_rank != 0)
    {
        /* simply drop all shared records from the end of the record array on
         * non-root ranks simply by recalculating the size of the buffer
         */
        darshan_iter_record_refs(lustre_runtime->record_id_hash,
            &lustre_subtract_shared_rec_size, NULL);
    }

    /* modify output buffer size to account for any shared records that were removed */
    *lustre_buf_sz = lustre_runtime->record_buffer_size;

    /* cleanup data structures */
    darshan_clear_record_refs(&(lustre_runtime->record_id_hash), 1);
    free(lustre_runtime);
    lustre_runtime = NULL;

    LUSTRE_UNLOCK();
    return;
}

333
static void lustre_subtract_shared_rec_size(void *rec_ref_p, void *user_ptr)
334
{
335
    struct lustre_record_ref *l_rec_ref = (struct lustre_record_ref *)rec_ref_p;
336

337
338
339
340
    if(l_rec_ref->record->base_rec.rank == -1)
        lustre_runtime->record_buffer_size -=
            LUSTRE_RECORD_SIZE( l_rec_ref->record->counters[LUSTRE_STRIPE_WIDTH] );
}
341

342
static void lustre_set_rec_ref_pointers(void *rec_ref_p, void *user_ptr)
343
344
345
{
    lustre_runtime->record_ref_array[lustre_runtime->record_ref_array_ndx] = rec_ref_p;
    lustre_runtime->record_ref_array_ndx++;
346
347
348
    return;
}

349
350
351
/* qsort comparator over an array of record reference pointers: orders by
 * descending rank first, then by ascending darshan record id
 */
static int lustre_record_compare(const void* a_p, const void* b_p)
{
    const struct lustre_record_ref *lhs = *((struct lustre_record_ref **)a_p);
    const struct lustre_record_ref *rhs = *((struct lustre_record_ref **)b_p);

    /* primary key: higher ranks come first */
    if (lhs->record->base_rec.rank != rhs->record->base_rec.rank)
        return (lhs->record->base_rec.rank < rhs->record->base_rec.rank) ? 1 : -1;

    /* Equal ranks MUST be ordered by a secondary key so that the result of
     * qsort is fully deterministic and consistent across all MPI ranks.
     * Without it, the order can be affected by rank-specific variations
     * (e.g., the order in which files are first opened).
     * Secondary key: ascending darshan record id.
     */
    if (lhs->record->base_rec.id != rhs->record->base_rec.id)
        return (lhs->record->base_rec.id > rhs->record->base_rec.id) ? 1 : -1;

    return 0;
}

375
/*
376
 * Sort the record_references and records by MPI rank to facilitate shared redux.
377
378
379
380
381
382
383
384
 * This requires craftiness and additional heap utilization because the records
 * (but not record_runtimes) have variable size.  Currently has to temporarily
 * duplicate the entire record_buffer; there is room for more memory-efficient
 * optimization if this becomes a scalability issue.
 */
int sort_lustre_records()
{
    int i;
385
    struct lustre_record_ref *rec_ref;
386
387
388
389
    char  *new_buf, *p;

    /* Create a new buffer to store an entire replica of record_buffer.  Since
     * we know the exact size of record_buffer's useful data at this point, we
390
391
     * can allocate the exact amount we need */
    new_buf = malloc(lustre_runtime->record_buffer_size);
392
393
394
395
    p = new_buf;
    if ( !new_buf )
        return 1;

396
397
398
399
400
401
402
403
404
405
406
    /* allocate array of record reference pointers that we want to sort */
    lustre_runtime->record_ref_array = malloc(lustre_runtime->record_count *
        sizeof(*(lustre_runtime->record_ref_array)));
    if( !lustre_runtime->record_ref_array )
    {
        free(new_buf);
        return 1;
    }

    /* build the array of record reference pointers we want to sort */
    darshan_iter_record_refs(lustre_runtime->record_id_hash,
407
        &lustre_set_rec_ref_pointers, NULL);
408

409
410
    /* qsort breaks the hash table, so delete it now to free its memory buffers
     * and prevent later confusion */
411
    darshan_clear_record_refs(&(lustre_runtime->record_id_hash), 0);
412
413
414

    /* sort the runtime records, which is has fixed-length elements */
    qsort(
415
        lustre_runtime->record_ref_array,
416
        lustre_runtime->record_count,
Shane Snyder's avatar
Shane Snyder committed
417
        sizeof(struct lustre_record_ref *),
418
419
420
        lustre_record_compare
    );

421
422
423
    /* rebuild the hash with the qsorted runtime records, and
     * create reordered record buffer
     */
424
425
    for ( i = 0; i < lustre_runtime->record_count; i++ )
    {
426
        rec_ref = lustre_runtime->record_ref_array[i];
427

428
429
430
431
432
433
        /* add this record reference back to the hash table */
        darshan_add_record_ref(&(lustre_runtime->record_id_hash),
            &(rec_ref->record->base_rec.id), sizeof(darshan_record_id), rec_ref);

        memcpy( p, rec_ref->record, rec_ref->record_size );
        /* fix record pointers within each record reference too - pre-emptively
434
435
         * point them at where they will live in record_buffer after we memcpy
         * below */
436
437
438
        rec_ref->record = (struct darshan_lustre_record *)
            ((char*)(lustre_runtime->record_buffer) + (p - new_buf));
        p += rec_ref->record_size;
439
    }
440
441

    /* copy sorted records back over to Lustre's record buffer */
442
443
444
    memcpy( 
        lustre_runtime->record_buffer, 
        new_buf, 
445
        lustre_runtime->record_buffer_size );
446
447

    free(new_buf);
448
    free(lustre_runtime->record_ref_array);
449
450
451
    return 0;
}

452
453
454
455
#if 0
static void lustre_record_reduction_op(
    void* infile_v, void* inoutfile_v, int *len, MPI_Datatype *datatype);

/* this is just boilerplate reduction code that isn't currently used
 *
 * For each of the *len record pairs, the output record is rebuilt with the
 * input record's id, rank -1, and the counters of whichever side is kept:
 * the input record on rank 0, the in/out record on every other rank.
 */
static void lustre_record_reduction_op(void* infile_v, void* inoutfile_v,
    int *len, MPI_Datatype *datatype)
{
    const struct darshan_lustre_record *src = infile_v;
    struct darshan_lustre_record *dst = inoutfile_v;
    int rec_ndx;

    assert(lustre_runtime);

    for( rec_ndx = 0; rec_ndx < *len; rec_ndx++, src++, dst++ )
    {
        struct darshan_lustre_record merged;
        /* preserve only rank 0's value */
        const struct darshan_lustre_record *kept = ( my_rank == 0 ) ? src : dst;
        int ctr;

        memset(&merged, 0, sizeof(struct darshan_lustre_record));
        merged.base_rec.id = src->base_rec.id;
        merged.base_rec.rank = -1;

        for( ctr = LUSTRE_OSTS; ctr < LUSTRE_NUM_INDICES; ctr++ )
            merged.counters[ctr] = kept->counters[ctr];

        *dst = merged;
    }

    return;
}

495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
/*
 *  Dump the memory structure of our records and runtime records
 */
void print_lustre_runtime( void )
{
    int i, j;
    struct darshan_lustre_record *rec;

    /* print what we just loaded */
    for ( i = 0; i < lustre_runtime->record_count; i++ )
    {
        rec = (lustre_runtime->record_runtime_array[i]).record;
        printf( "File %2d\n", i );
        for ( j = 0; j < LUSTRE_NUM_INDICES; j++ )
        {
Glenn K. Lockwood's avatar
Glenn K. Lockwood committed
510
            printf( "  Counter %-2d: %10ld, addr %ld\n", 
511
512
513
514
515
516
                j, 
                rec->counters[j],
                (char*)(&(rec->counters[j])) - (char*)(lustre_runtime->record_buffer) );
        }
        for ( j = 0; j < rec->counters[LUSTRE_STRIPE_WIDTH]; j++ )
        {
Glenn K. Lockwood's avatar
Glenn K. Lockwood committed
517
518
            if ( j > 0 && j % 2 == 0 ) printf("\n");
            printf( "  Stripe  %-2d: %10ld, addr %-9d", 
519
520
521
522
                j, 
                rec->ost_ids[j],
                (char*)(&(rec->ost_ids[j])) - (char*)(lustre_runtime->record_buffer) );
        }
Glenn K. Lockwood's avatar
Glenn K. Lockwood committed
523
        printf( "\n" );
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
    }
    return;
}

/*
 *  Dump the order in which records appear in memory
 */
void print_array( void )
{
    int i;
    struct lustre_record_runtime *rec_rt;
    printf("*** DUMPING RECORD LIST BY ARRAY SEQUENCE\n");
    for ( i = 0; i < lustre_runtime->record_count; i++ )
    {
        rec_rt = &(lustre_runtime->record_runtime_array[i]);
        printf( "*** record %d rank %d osts %d\n", 
            rec_rt->record->rec_id, 
            rec_rt->record->rank,
            rec_rt->record->counters[LUSTRE_STRIPE_WIDTH]);
    }
}
void print_hash( void )
{
    struct lustre_record_runtime *rec_rt, *tmp_rec_rt;
    printf("*** DUMPING RECORD LIST BY HASH SEQUENCE\n");
549
    HASH_ITER( hlink, lustre_runtime->record_runtim_hash, rec_rt, tmp_rec_rt )
550
551
552
553
554
555
556
557
    {
        printf( "*** record %d rank %d osts %d\n", 
            rec_rt->record->rec_id, 
            rec_rt->record->rank,
            rec_rt->record->counters[LUSTRE_STRIPE_WIDTH]);
    }
    return;
}
558
#endif
559
560
561



562
563
564
565
566
567
568
569
/*
 * Local variables:
 *  c-indent-level: 4
 *  c-basic-offset: 4
 * End:
 *
 * vim: ts=8 sts=4 sw=4 expandtab
 */