darshan-posix.c 15.9 KB
Newer Older
1 2 3 4 5
/*
 *  (C) 2009 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

6 7
#define _GNU_SOURCE

8
#include "darshan-runtime-config.h"
9

10 11 12 13 14 15 16 17 18 19 20 21 22 23
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdarg.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <search.h>
#include <assert.h>
24
#include <libgen.h>
25
#include <aio.h>
26
#include <pthread.h>
27 28

#include "darshan.h"
29
#include "uthash.h"
30

31
/* Fallback definitions for platforms that lack the 64-bit variants.
 * NOTE(review): HAVE_OFF64_T / HAVE_AIOCB64 are presumably provided by
 * darshan-runtime-config.h -- confirm.
 */
#if !defined(HAVE_OFF64_T)
typedef int64_t off64_t;
#endif

#if !defined(HAVE_AIOCB64)
#define aiocb64 aiocb
#endif
37

38 39
/* TODO: decide where the following wrapper-declaration macros should live */

40 41 42 43 44
/* Declare the "__real_"-prefixed symbol that a wrapper forwards to.
 * The expansion already ends in a semicolon.
 */
#define DARSHAN_FORWARD_DECL(fn_name, ret_type, arg_list) \
  extern ret_type __real_ ## fn_name arg_list;

/* Spell the name of the wrapper itself, e.g. open -> __wrap_open. */
#define DARSHAN_DECL(fn_name) __wrap_ ## fn_name

/* Expands to nothing in this file.
 * NOTE(review): presumably a hook for builds that resolve the real symbol
 * at run time -- confirm.
 */
#define MAP_OR_FAIL(fn_name)

47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
/* TODO: decide where these file record structs should be defined (some of them are also needed by darshan-util) */
/* TODO: decide whether identifiers should use the DARSHAN_* or the CP_* prefix */

/* module name passed to darshan_core_register_module() */
#define POSIX_MOD_NAME "POSIX"

/* Integer counters kept for each POSIX file record.
 *
 * Every enumerator is an index into darshan_posix_file.counters[].  Values
 * are implicit and consecutive, so the order below fixes the layout of that
 * array, and CP_NUM_INDICES (the array length) must stay last.
 */
enum darshan_posix_indices
{
    /* --- operation counts --- */
    CP_POSIX_READS = 0,           /* count of posix reads */
    CP_POSIX_WRITES,              /* count of posix writes */
    CP_POSIX_OPENS,               /* count of posix opens */
    CP_POSIX_SEEKS,               /* count of posix seeks */
    CP_POSIX_STATS,               /* count of posix stat/lstat/fstats */
    CP_POSIX_MMAPS,               /* count of posix mmaps */
    /* NOTE(review): the next six carry no description in the original;
     * the names suggest stream (f*) and sync call counts -- confirm.
     * CP_POSIX_FOPENS is incremented by POSIX_RECORD_OPEN for stream opens.
     */
    CP_POSIX_FREADS,
    CP_POSIX_FWRITES,
    CP_POSIX_FOPENS,
    CP_POSIX_FSEEKS,
    CP_POSIX_FSYNCS,
    CP_POSIX_FDSYNCS,

    /* --- per-file totals and access pattern --- */
    CP_MODE,                      /* mode of file */
    CP_BYTES_READ,                /* total bytes read */
    CP_BYTES_WRITTEN,             /* total bytes written */
    CP_MAX_BYTE_READ,             /* highest offset byte read */
    CP_MAX_BYTE_WRITTEN,          /* highest offset byte written */
    CP_CONSEC_READS,              /* count of consecutive reads */
    CP_CONSEC_WRITES,             /* count of consecutive writes */
    CP_SEQ_READS,                 /* count of sequential reads */
    CP_SEQ_WRITES,                /* count of sequential writes */
    CP_RW_SWITCHES,               /* number of times switched between read and write */
    CP_MEM_NOT_ALIGNED,           /* count of accesses not mem aligned */
    CP_MEM_ALIGNMENT,             /* mem alignment in bytes */
    CP_FILE_NOT_ALIGNED,          /* count of accesses not file aligned */
    CP_FILE_ALIGNMENT,            /* file alignment in bytes */
    CP_MAX_READ_TIME_SIZE,        /* NOTE(review): undocumented in the original */
    CP_MAX_WRITE_TIME_SIZE,       /* NOTE(review): undocumented in the original */

    /* --- read size histogram: count of posix read size ranges --- */
    CP_SIZE_READ_0_100,
    CP_SIZE_READ_100_1K,
    CP_SIZE_READ_1K_10K,
    CP_SIZE_READ_10K_100K,
    CP_SIZE_READ_100K_1M,
    CP_SIZE_READ_1M_4M,
    CP_SIZE_READ_4M_10M,
    CP_SIZE_READ_10M_100M,
    CP_SIZE_READ_100M_1G,
    CP_SIZE_READ_1G_PLUS,

    /* --- write size histogram: count of posix write size ranges --- */
    CP_SIZE_WRITE_0_100,
    CP_SIZE_WRITE_100_1K,
    CP_SIZE_WRITE_1K_10K,
    CP_SIZE_WRITE_10K_100K,
    CP_SIZE_WRITE_100K_1M,
    CP_SIZE_WRITE_1M_4M,
    CP_SIZE_WRITE_4M_10M,
    CP_SIZE_WRITE_10M_100M,
    CP_SIZE_WRITE_100M_1G,
    CP_SIZE_WRITE_1G_PLUS,

    /* --- the four most frequently appearing strides --- */
    CP_STRIDE1_STRIDE,
    CP_STRIDE2_STRIDE,
    CP_STRIDE3_STRIDE,
    CP_STRIDE4_STRIDE,
    /* count of each of the most frequent strides */
    CP_STRIDE1_COUNT,
    CP_STRIDE2_COUNT,
    CP_STRIDE3_COUNT,
    CP_STRIDE4_COUNT,

    /* --- the four most frequently appearing access sizes --- */
    CP_ACCESS1_ACCESS,
    CP_ACCESS2_ACCESS,
    CP_ACCESS3_ACCESS,
    CP_ACCESS4_ACCESS,
    /* count of each of the most frequent access sizes */
    CP_ACCESS1_COUNT,
    CP_ACCESS2_COUNT,
    CP_ACCESS3_COUNT,
    CP_ACCESS4_COUNT,

    /* --- miscellaneous --- */
    CP_DEVICE,                    /* device id reported by stat */
    CP_SIZE_AT_OPEN,              /* NOTE(review): undocumented in the original */
    CP_FASTEST_RANK,              /* NOTE(review): undocumented in the original */
    CP_FASTEST_RANK_BYTES,
    CP_SLOWEST_RANK,
    CP_SLOWEST_RANK_BYTES,

    CP_NUM_INDICES,               /* number of counters; keep last */
};
130

131
/* Floating point statistics kept for each POSIX file record.
 *
 * Every enumerator is an index into darshan_posix_file.fcounters[]; values
 * are implicit and consecutive, and CP_F_NUM_INDICES (the array length)
 * must stay last.
 */
enum darshan_f_posix_indices
{
    /* --- timestamps ---
     * NOTE: adjust cp_normalize_timestamps() function if any TIMESTAMPS are
     * added or modified in this list
     */
    CP_F_OPEN_TIMESTAMP = 0,    /* timestamp of first open */
    CP_F_READ_START_TIMESTAMP,  /* timestamp of first read */
    CP_F_WRITE_START_TIMESTAMP, /* timestamp of first write */
    CP_F_CLOSE_TIMESTAMP,       /* timestamp of last close */
    CP_F_READ_END_TIMESTAMP,    /* timestamp of last read */
    CP_F_WRITE_END_TIMESTAMP,   /* timestamp of last write */

    /* --- cumulative times --- */
    CP_F_POSIX_READ_TIME,       /* cumulative posix read time */
    CP_F_POSIX_WRITE_TIME,      /* cumulative posix write time */
    CP_F_POSIX_META_TIME,       /* cumulative posix meta time */
    CP_F_MAX_READ_TIME,         /* NOTE(review): undocumented in the original */
    CP_F_MAX_WRITE_TIME,        /* NOTE(review): undocumented in the original */

    /* --- per-rank summary ---
     * Total I/O and meta time consumed by fastest and slowest ranks,
     * reported in either MPI or POSIX time depending on how the file
     * was accessed.
     */
    CP_F_FASTEST_RANK_TIME,
    CP_F_SLOWEST_RANK_TIME,
    CP_F_VARIANCE_RANK_TIME,
    CP_F_VARIANCE_RANK_BYTES,

    CP_F_NUM_INDICES,           /* number of floating point counters; keep last */
};

struct darshan_posix_file
{
162
    darshan_file_id f_id;
163
    int64_t rank;
164 165 166 167
    int64_t counters[CP_NUM_INDICES];
    double fcounters[CP_F_NUM_INDICES];
};

168
struct posix_runtime_file
169
{
170
    struct darshan_posix_file* file_record;
171
    UT_hash_handle hlink;
172
};
173

174
struct posix_runtime_file_ref
175
{
176 177 178 179 180 181 182
    struct posix_runtime_file* file;
    int fd;
    UT_hash_handle hlink;
};

/* All state owned by the POSIX module. */
struct posix_runtime
{
    /* parallel arrays: file_runtime_array[i].file_record points at
     * file_record_array[i] once slot i has been handed out
     */
    struct posix_runtime_file *file_runtime_array;
    struct darshan_posix_file *file_record_array;
    int file_array_size;                      /* number of slots in each array */
    int file_array_ndx;                       /* next unused slot */

    struct posix_runtime_file *file_hash;     /* uthash head: file id -> runtime file */
    struct posix_runtime_file_ref *fd_hash;   /* uthash head: fd -> file reference */
};

191
static struct posix_runtime *posix_runtime = NULL;
192
static pthread_mutex_t posix_runtime_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
193
static int my_rank = -1;
194
static int darshan_mem_alignment = 1;
195

196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
/* Path prefixes that are never traced: POSIX_RECORD_OPEN ignores any path
 * that starts with one of these.  The list is terminated by NULL.
 */
static char* exclusions[] =
{
    "/etc/",
    "/dev/",
    "/usr/",
    "/bin/",
    "/boot/",
    "/lib/",
    "/opt/",
    "/sbin/",
    "/sys/",
    "/proc/",
    NULL    /* terminator */
};

211
/* "__real_" counterparts of the functions wrapped in this file */
DARSHAN_FORWARD_DECL(open, int, (const char *pathname, int oflag, ...));
DARSHAN_FORWARD_DECL(close, int, (int fildes));
213

214 215
static void posix_runtime_initialize(void);

216 217 218 219
static struct posix_runtime_file* posix_file_by_name(const char *name);
static struct posix_runtime_file* posix_file_by_name_setfd(const char* name, int fd);
static void posix_file_close_fd(int fd);

220 221
static void posix_get_output_data(MPI_Comm comm, void **buffer, int *size);
static void posix_shutdown(void);
222

223 224
/* Acquire / release posix_runtime_mutex; each expands to the result of the
 * underlying pthread call.
 */
#define POSIX_LOCK()   (pthread_mutex_lock(&posix_runtime_mutex))
#define POSIX_UNLOCK() (pthread_mutex_unlock(&posix_runtime_mutex))
225

226
/* Counter access helpers.
 *
 * In every macro __file is a pointer whose file_record member points at a
 * record with counters[] and fcounters[] arrays, and __counter is an index
 * into the matching array (integer macros use counters[], the _F_ variants
 * use fcounters[]).
 *
 * All arguments are parenthesized in the expansions so that expressions
 * such as "a & b" or "t0 + dt" can be passed safely.  Arguments may still
 * be evaluated more than once (POSIX_MAX evaluates __value twice,
 * POSIX_F_INC_NO_OVERLAP evaluates its time arguments several times), so
 * do not pass expressions with side effects.
 */

/* counters[__counter] = __value */
#define POSIX_SET(__file, __counter, __value) do {\
    (__file)->file_record->counters[(__counter)] = (__value); \
} while(0)

/* fcounters[__counter] = __value */
#define POSIX_F_SET(__file, __counter, __value) do {\
    (__file)->file_record->fcounters[(__counter)] = (__value); \
} while(0)

/* counters[__counter] += __value */
#define POSIX_INC(__file, __counter, __value) do {\
    (__file)->file_record->counters[(__counter)] += (__value); \
} while(0)

/* fcounters[__counter] += __value */
#define POSIX_F_INC(__file, __counter, __value) do {\
    (__file)->file_record->fcounters[(__counter)] += (__value); \
} while(0)

/* Add the interval [__tm1, __tm2] to fcounters[__counter], counting only the
 * part that lies after __last, then advance __last to __tm2 if __tm2 is
 * later.  __last must be a modifiable lvalue.
 */
#define POSIX_F_INC_NO_OVERLAP(__file, __tm1, __tm2, __last, __counter) do { \
    if((__tm1) > (__last)) \
        POSIX_F_INC(__file, __counter, ((__tm2) - (__tm1))); \
    else \
        POSIX_F_INC(__file, __counter, ((__tm2) - (__last))); \
    if((__tm2) > (__last)) \
        (__last) = (__tm2); \
} while(0)

/* current value of counters[__counter] */
#define POSIX_VALUE(__file, __counter) \
    ((__file)->file_record->counters[(__counter)])

/* current value of fcounters[__counter] */
#define POSIX_F_VALUE(__file, __counter) \
    ((__file)->file_record->fcounters[(__counter)])

/* raise counters[__counter] to __value if __value is larger */
#define POSIX_MAX(__file, __counter, __value) do {\
    if((__file)->file_record->counters[(__counter)] < (__value)) \
    { \
        (__file)->file_record->counters[(__counter)] = (__value); \
    } \
} while(0)

264
/* Record a completed open in the module's tables.
 *
 *   __ret          descriptor returned by the open call; nothing is recorded
 *                  when it is negative
 *   __path         path that was opened; paths starting with an entry of
 *                  exclusions[] are not recorded
 *   __mode         stored in CP_MODE when nonzero
 *   __stream_flag  nonzero counts the open in CP_POSIX_FOPENS, zero in
 *                  CP_POSIX_OPENS
 *   __tm1          stored in CP_F_OPEN_TIMESTAMP if that counter is still 0
 *   __tm2          currently unused
 *
 * The "break" statements at do/while level leave the macro early.  The
 * macro's own locals carry a trailing-underscore "rec_" prefix so they
 * cannot capture an argument the caller happens to call "file", "exclude"
 * or "tmp_index", and every argument is parenthesized in the expansion.
 */
#define POSIX_RECORD_OPEN(__ret, __path, __mode, __stream_flag, __tm1, __tm2) do { \
    struct posix_runtime_file* rec_file_; \
    char* rec_excl_; \
    int rec_ndx_ = 0; \
    if((__ret) < 0) break; \
    while((rec_excl_ = exclusions[rec_ndx_])) { \
        if(!(strncmp(rec_excl_, (__path), strlen(rec_excl_)))) \
            break; \
        rec_ndx_++; \
    } \
    if(rec_excl_) break; \
    rec_file_ = posix_file_by_name_setfd((__path), (__ret)); \
    if(!rec_file_) break; \
    rec_file_->file_record->rank = my_rank; \
    if((__mode)) \
        POSIX_SET(rec_file_, CP_MODE, (__mode)); \
    if((__stream_flag))\
        POSIX_INC(rec_file_, CP_POSIX_FOPENS, 1); \
    else \
        POSIX_INC(rec_file_, CP_POSIX_OPENS, 1); \
    if(POSIX_F_VALUE(rec_file_, CP_F_OPEN_TIMESTAMP) == 0) \
        POSIX_F_SET(rec_file_, CP_F_OPEN_TIMESTAMP, (__tm1)); \
} while(0)
287

288
int DARSHAN_DECL(open)(const char *path, int flags, ...)
289 290 291 292 293
{
    int mode = 0;
    int ret;
    double tm1, tm2;

294 295
    MAP_OR_FAIL(open);

296
    if(flags & O_CREAT) 
297 298 299 300 301 302
    {
        va_list arg;
        va_start(arg, flags);
        mode = va_arg(arg, int);
        va_end(arg);

303
        tm1 = darshan_core_wtime();
304
        ret = __real_open(path, flags, mode);
305
        tm2 = darshan_core_wtime();
306 307 308
    }
    else
    {
309
        tm1 = darshan_core_wtime();
310
        ret = __real_open(path, flags);
311
        tm2 = darshan_core_wtime();
312 313
    }

314 315 316
    POSIX_LOCK();
    posix_runtime_initialize();

317
    POSIX_RECORD_OPEN(ret, path, mode, 0, tm1, tm2);
318

319
    POSIX_UNLOCK();
320 321 322 323

    return(ret);
}

324 325 326 327 328 329 330 331 332 333 334 335 336 337 338
/* Wrapper for close(): forwards to __real_close(), then drops the module's
 * reference for the descriptor.
 *
 * The reference is removed whether or not the real call succeeded, as in
 * the original code.  Returns whatever __real_close() returned; errno is
 * saved right after the real call and restored before returning so the
 * bookkeeping cannot change what the caller observes.
 */
int DARSHAN_DECL(close)(int fd)
{
    double tm1, tm2;
    int ret;
    int saved_errno;

    MAP_OR_FAIL(close);

    tm1 = darshan_core_wtime();
    ret = __real_close(fd);
    saved_errno = errno;
    tm2 = darshan_core_wtime();

    /* the timestamps are collected but not consumed by any counter yet */
    (void)tm1;
    (void)tm2;

    POSIX_LOCK();
    posix_runtime_initialize();

    posix_file_close_fd(fd);

    POSIX_UNLOCK();

    errno = saved_errno;
    return(ret);
}

/* ***************************************************** */
348

349
static void posix_runtime_initialize()
350
{
351 352 353 354 355 356 357
    char *alignstr;
    int tmpval;
    int ret;
    int mem_limit;
    struct darshan_module_funcs posix_mod_fns =
    {
        .get_output_data = &posix_get_output_data,
358
        .shutdown = &posix_shutdown
359
    };
360

361
    if(posix_runtime)
362
        return;
363

364 365
    /* register the posix module with darshan core */
    darshan_core_register_module(
366
        DARSHAN_POSIX_MOD,
367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
        POSIX_MOD_NAME,
        &posix_mod_fns,
        &mem_limit);

    /* return if no memory assigned by darshan core */
    if(mem_limit == 0)
        return;

    posix_runtime = malloc(sizeof(*posix_runtime));
    if(!posix_runtime)
        return;
    memset(posix_runtime, 0, sizeof(*posix_runtime));

    /* set maximum number of file records according to max memory limit */
    /* NOTE: maximum number of records is based on the size of a posix file record */
    posix_runtime->file_array_size = mem_limit / sizeof(struct darshan_posix_file);
383
    posix_runtime->file_array_ndx = 0;
384 385

    /* allocate array of runtime file records */
386 387 388 389 390
    posix_runtime->file_runtime_array = malloc(posix_runtime->file_array_size *
                                               sizeof(struct posix_runtime_file));
    posix_runtime->file_record_array = malloc(posix_runtime->file_array_size *
                                              sizeof(struct darshan_posix_file));
    if(!posix_runtime->file_runtime_array || !posix_runtime->file_record_array)
391 392 393 394
    {
        posix_runtime->file_array_size = 0;
        return;
    }
395 396 397 398
    memset(posix_runtime->file_runtime_array, 0, posix_runtime->file_array_size *
           sizeof(struct posix_runtime_file));
    memset(posix_runtime->file_record_array, 0, posix_runtime->file_array_size *
           sizeof(struct darshan_posix_file));
399

400 401
    DARSHAN_MPI_CALL(PMPI_Comm_rank)(MPI_COMM_WORLD, &my_rank);

402
#if 0
403 404 405 406
    /* set the memory alignment according to config or environment variables */
    #if (__CP_MEM_ALIGNMENT < 1)
        #error Darshan must be configured with a positive value for --with-mem-align
    #endif
407
    alignstr = getenv(CP_MEM_ALIGNMENT_OVERRIDE);
408
    if(alignstr)
409 410 411 412 413 414 415 416
    {
        ret = sscanf(alignstr, "%d", &tmpval);
        /* silently ignore if the env variable is set poorly */
        if(ret == 1 && tmpval > 0)
        {
            darshan_mem_alignment = tmpval;
        }
    }
417
    else
418 419 420
    {
        darshan_mem_alignment = __CP_MEM_ALIGNMENT;
    }
421

422
    /* avoid floating point errors on faulty input */
423
    if(darshan_mem_alignment < 1)
424 425 426
    {
        darshan_mem_alignment = 1;
    }
427
#endif
428

429 430
    return;
}
431

432
static struct posix_runtime_file* posix_file_by_name(const char *name)
433
{
434
    struct posix_runtime_file *file = NULL;
435
    char *newname = NULL;
436
    darshan_file_id file_id;
437

438
    if(!posix_runtime)
439
        return(NULL);
440

441
    newname = darshan_clean_file_path(name);
442
    if(!newname)
443
        newname = (char*)name;
444

445 446 447 448 449
    /* get a unique id for this file from darshan core */
    darshan_core_lookup_id(
        (void*)newname,
        strlen(newname),
        1,
450
        &file_id);
451

452
    /* search the hash table for this file record, and return if found */
453 454
    HASH_FIND(hlink, posix_runtime->file_hash, &file_id, sizeof(darshan_file_id), file);
    if(file)
455
    {
456
        if(newname != name)
457
            free(newname);
458
        return(file);
459 460
    }

461 462 463 464 465 466
    if(posix_runtime->file_array_ndx < posix_runtime->file_array_size);
    {
        /* no existing record, assign a new file record from the global array */
        file = &(posix_runtime->file_runtime_array[posix_runtime->file_array_ndx]);
        file->file_record = &(posix_runtime->file_record_array[posix_runtime->file_array_ndx]);
        file->file_record->f_id = file_id;
467

468 469
        /* add new record to file hash table */
        HASH_ADD(hlink, posix_runtime->file_hash, file_record->f_id, sizeof(darshan_file_id), file);
470

471 472
        posix_runtime->file_array_ndx++;
    }
473 474

    if(newname != name)
475
        free(newname);
476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527
    return(file);
}

/* Find (or create) the runtime file for a path and associate fd with it.
 *
 * An existing reference for fd is re-pointed at the file; otherwise a new
 * reference is allocated and added to posix_runtime->fd_hash.
 *
 * Returns the runtime file, or NULL when the module is not initialized, no
 * file record is available, or the reference cannot be allocated.
 */
static struct posix_runtime_file* posix_file_by_name_setfd(const char* name, int fd)
{
    struct posix_runtime_file *rec;
    struct posix_runtime_file_ref *fd_ref = NULL;

    if(!posix_runtime)
        return(NULL);

    /* the file record is located by name first */
    rec = posix_file_by_name(name);
    if(!rec)
        return(NULL);

    HASH_FIND(hlink, posix_runtime->fd_hash, &fd, sizeof(int), fd_ref);
    if(!fd_ref)
    {
        /* first time this fd is seen: create a zeroed reference for it */
        fd_ref = calloc(1, sizeof(*fd_ref));
        if(!fd_ref)
            return(NULL);
        fd_ref->file = rec;
        fd_ref->fd = fd;

        HASH_ADD(hlink, posix_runtime->fd_hash, fd, sizeof(int), fd_ref);
        return(rec);
    }

    /* the fd is already known; make sure it points to the correct file */
    fd_ref->file = rec;
    return(rec);
}

/* Forget the reference held for fd, if there is one.  Only the reference is
 * released; the file record it pointed at stays in place.
 */
static void posix_file_close_fd(int fd)
{
    struct posix_runtime_file_ref *fd_ref = NULL;

    if(posix_runtime)
    {
        /* look the descriptor up in the fd table */
        HASH_FIND(hlink, posix_runtime->fd_hash, &fd, sizeof(int), fd_ref);
        if(fd_ref)
        {
            /* unlink the entry before releasing its memory */
            HASH_DELETE(hlink, posix_runtime->fd_hash, fd_ref);
            free(fd_ref);
        }
    }
}

538 539
/* ***************************************************** */

540
/* Module callback: report the buffer of file records to be written out.
 *
 *   comm    communicator supplied by the caller; compared against
 *           MPI_COMM_WORLD to decide whether shared-file reduction applies
 *           (the reduction itself is still a TODO)
 *   buffer  out: start of the record array, or NULL if the module holds no
 *           state
 *   size    out: number of bytes of records in use, or 0
 *
 * The module is registered before posix_runtime is allocated, so this
 * callback can run while posix_runtime is still NULL; that case reports an
 * empty buffer instead of dereferencing NULL.
 */
static void posix_get_output_data(MPI_Comm comm, void **buffer, int *size)
{
    int comm_cmp;

    if(!posix_runtime)
    {
        /* NOTE(review): once the reduction below is implemented, make sure
         * this early return cannot skip a collective call on some ranks
         */
        *buffer = NULL;
        *size = 0;
        return;
    }

    MPI_Comm_compare(MPI_COMM_WORLD, comm, &comm_cmp);

    /* only do shared file reductions if this communicator includes _everyone_ */
    if((comm_cmp == MPI_IDENT) || (comm_cmp == MPI_CONGRUENT))
    {
        /* don't reduce shared files if that feature is disabled, either */
        if(!getenv("DARSHAN_DISABLE_SHARED_REDUCTION"))
        {
            /* TODO reduction code */
        }
    }

    *buffer = (void *)(posix_runtime->file_record_array);
    *size = posix_runtime->file_array_ndx * sizeof(struct darshan_posix_file);

    return;
}

562
static void posix_shutdown()
563
{
564 565 566 567 568 569
    /* TODO destroy hash tables ?? */

    free(posix_runtime->file_runtime_array);
    free(posix_runtime->file_record_array);
    free(posix_runtime);
    posix_runtime = NULL;
570 571

    return;
572 573
}

574 575 576 577 578 579 580 581
/*
 * Local variables:
 *  c-indent-level: 4
 *  c-basic-offset: 4
 * End:
 *
 * vim: ts=8 sts=4 sw=4 expandtab
 */