/*
 * Copyright (C) 2013 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
 */

/* SUMMARY:
 *
 *  MPI replay tool for replaying workloads from the codes workload API.
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <time.h>
#include <unistd.h>
#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <getopt.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <mpi.h>

#include <codes/codes-workload.h>
#include <codes/quickhash.h>
#include <codes/configuration.h>
#include <codes/codes.h>

/* set to 1 to compile in the per-operation timing accumulators and the
 * per-rank timing summary printed at the end of main() */
#define DEBUG_PROFILING 0
/* size in bytes (128 MiB) of the shared I/O buffer; read and write
 * operations larger than this are rejected */
#define BUF_SIZE (128*1024*1024)

/* hash table entry for looking up file descriptor of a workload file id */
struct file_info
{
    struct qlist_head hash_link;    /* linkage into fd_table */
    uint64_t file_hash;             /* workload file id; the hash table key */
    int file_descriptor;            /* POSIX descriptor (-1 for MPI-IO opens) */
    MPI_File fh;                    /* MPI-IO handle (MPI_FILE_NULL for POSIX opens) */
};

/* replay a single workload operation on the calling rank */
int replay_workload_op(struct codes_workload_op replay_op, int rank, long long int op_number);
/* key comparison callback passed to qhash_init() for fd_table */
int hash_file_compare(void *key, struct qlist_head *link);

/* command line options */
static int opt_verbose = 0;         /* -v: log every operation to log_stream */
static int opt_noop = 0;            /* --noop: do not perform any i/o */
static double opt_delay_pct = 1.0;  /* --delay-ratio: multiplier applied to each delay */
static double opt_max_delay = -1;   /* --max-delay: cap in seconds; negative disables the cap */
static int opt_prep = 0;            /* --prep: pre-populate files instead of replaying */

/* hash table for storing file descriptors of opened files */
static struct qhash_table *fd_table = NULL;

/* file stream to log rank events to, if verbose turned on */
static FILE *log_stream = NULL;

/* global variables for profiling different portions of the replay, if enabled */
#if DEBUG_PROFILING
static double total_open_time = 0.0;
static double total_close_time = 0.0;
static double total_read_time = 0.0;
static double total_write_time = 0.0;
static double total_delay_time = 0.0;
static double total_barrier_time = 0.0;
#endif

/*
 * Print the command line usage text to stderr and terminate the process
 * with exit status 1.
 *
 * exename - program name shown in the usage line
 *
 * This function does not return.
 */
void usage(char *exename)
{
    /* fixed help text, written verbatim after the usage line */
    static const char *const help_lines[] =
    {
        "\t<conf_file_path> : (absolute) path to a valid workload configuration file\n",
        "\t<workload_test_dir> : the directory to replay the workload I/O in\n",
        "\n\t[OPTIONS] includes:\n",
        "\t\t--noop  : do not perform i/o\n",
        "\t\t    -v  : verbose (output i/o details)\n",
        "\t\t--delay-ratio : floating point ratio applied to each delay (defaults to 1)\n",
        "\t\t--max-delay : maximum delay (in seconds) to replay\n",
        "\t\t--prep: instead of running replay, pre-populate files that will be needed for subsequent replay\n",
    };
    size_t line;

    fprintf(stderr, "Usage: %s [OPTIONS] --conf <conf_file_path>\n       "
            "--test-dir <workload_test_dir>\n\n", exename);

    for (line = 0; line < sizeof(help_lines) / sizeof(help_lines[0]); line++)
        fputs(help_lines[line], stderr);

    exit(1);
}

void parse_args(int argc, char **argv, char **conf_path, char **test_dir)
83 84 85 86
{
    int index;
    static struct option long_opts[] =
    {
87
        {"conf", 1, NULL, 'c'},
88
        {"test-dir", 1, NULL, 'd'},
89
        {"noop", 0, NULL, 'n'},
90 91 92
        {"prep", 0, NULL, 'P'},
        {"delay-ratio", 1, NULL, 'p'},
        {"max-delay", 1, NULL, 'm'},
93 94 95 96
        {"help", 0, NULL, 0},
        {0, 0, 0, 0}
    };

97
    *conf_path = NULL;
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
    *test_dir = NULL;
    while (1)
    {
        int c = getopt_long(argc, argv, "v", long_opts, &index);

        if (c == -1)
            break;

        switch (c)
        {
            case 'v':
                opt_verbose = 1;
                break;
            case 'n':
                opt_noop = 1;
                break;
114 115 116
            case 'P':
                opt_prep = 1;
                break;
117 118
            case 'c':
                *conf_path = optarg;
119 120 121 122
                break;
            case 'd':
                *test_dir = optarg;
                break;
123 124 125
            case 'p':
                opt_delay_pct = atof(optarg);
                break;
126 127 128
            case 'm':
                opt_max_delay = atof(optarg);
                break;
129 130 131 132 133 134 135 136
            case 0:
            case '?':
            default:
                usage(argv[0]);
                break;
        }
    }

137 138 139 140 141 142
    if(opt_noop && opt_prep)
    {
        fprintf(stderr, "Error: cannot use --noop and --prep at the same time.\n");
        usage(argv[0]);
    }

143
    if (optind < argc || !(*conf_path) || !(*test_dir))
144 145 146 147 148 149 150
    {
        usage(argv[0]);
    }

    return;
}

151 152 153 154 155 156 157 158 159
int load_workload(char *conf_path, int rank)
{
    char workload_type[MAX_NAME_LENGTH_WKLD];

    /* load the config file across all ranks */
    configuration_load(conf_path, MPI_COMM_WORLD, &config);

    /* get the workload type out of PARAMS */
    configuration_get_value(&config, "PARAMS", "workload_type",
160
                            NULL, workload_type, MAX_NAME_LENGTH_WKLD);
161 162 163 164 165 166 167 168

    /* set up the workload parameters and load into the workload API */
    if (strcmp(workload_type, "darshan_io_workload") == 0)
    {
        struct darshan_params d_params;

        /* get the darshan params from the config file */
        configuration_get_value(&config, "PARAMS", "log_file_path",
169
                                NULL, d_params.log_file_path, MAX_NAME_LENGTH_WKLD);
170
        return codes_workload_load(workload_type, (char *)&d_params, 0, rank);
171
    }
172
    else if (strcmp(workload_type, "iolang_workload") == 0)
173
    {
174
        struct iolang_params i_params;
175 176
        char rank_count[10];

177
        /* get the iolang i/o params from the config file */
178
        configuration_get_value(&config, "PARAMS", "io_kernel_meta_path",
179
                                NULL, i_params.io_kernel_meta_path, MAX_NAME_LENGTH_WKLD);
180
        configuration_get_value(&config, "PARAMS", "rank_count", NULL, rank_count, 10);
181 182 183
        strcpy(i_params.io_kernel_path, "");
        i_params.num_cns = atoi(rank_count);
        i_params.use_relpath = 1;
184

185
        return codes_workload_load(workload_type, (char *)&i_params, 0, rank);
186
    }
187 188
    else if (strcmp(workload_type, "recorder_io_workload") == 0) {
        struct recorder_params r_params;
189
        char nprocs[10];
190

191
        /* get the recorder params from the config file */
192
        configuration_get_value(&config, "PARAMS", "trace_dir_path",
193 194
                                NULL, r_params.trace_dir_path, MAX_NAME_LENGTH_WKLD);
        configuration_get_value(&config, "PARAMS", "nprocs", NULL, nprocs, 10);
195
        r_params.nprocs = atol(nprocs);
196

197
        return codes_workload_load(workload_type, (char *)&r_params, 0, rank);
198 199

	}
200 201 202 203 204 205 206
    else
    {
        fprintf(stderr, "Error: Invalid workload type specified (%s)\n", workload_type);
        return -1;
    }
}

207
/* shared I/O buffer of BUF_SIZE bytes; allocated and filled on the first
 * tracked open, then reused by every read and write */
char* buf = NULL;
/* NOTE(review): fd and op_num are not referenced anywhere else in the
 * visible source - confirm whether they are still needed */
int fd = -1;
int op_num = 0;

int main(int argc, char *argv[])
{
213
    char *conf_path;
214
    char *replay_test_path;
215 216
    char *log_dir = "log";
    char my_log_path[MAX_NAME_LENGTH_WKLD];
217 218 219 220 221
    int nprocs;
    int myrank;
    int workload_id;
    struct codes_workload_op next_op;
    long long int replay_op_number = 1;
222
    double load_start, load_end;
223 224 225
    int ret = 0;

    /* parse command line args */
226
    parse_args(argc, argv, &conf_path, &replay_test_path);
227 228 229 230 231 232

    /* initialize MPI */
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

233 234 235
    /* start workload load timer */
    load_start = MPI_Wtime();

236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256
    /* change the working directory to be the test directory */
    ret = chdir(replay_test_path);
    if (ret < 0)
    {
        fprintf(stderr, "Unable to change to testing directory (%s)\n", strerror(errno));
        goto error_exit;
    }

    /* set the path for logging this rank's events, if verbose is turned on */
    if (opt_verbose)
    {
        mkdir(log_dir, 0755);
        snprintf(my_log_path, MAX_NAME_LENGTH_WKLD, "%s/rank-%d.log", log_dir, myrank);
        log_stream = fopen(my_log_path, "w");
        if (log_stream == NULL)
        {
            fprintf(stderr, "Unable to open log file %s\n", my_log_path);
            goto error_exit;
        }
    }

257 258
    /* initialize workload generator from config file */
    workload_id = load_workload(conf_path, myrank);
259 260 261 262 263 264 265 266 267 268 269 270 271
    if (workload_id < 0)
    {
        goto error_exit;
    }

    /* initialize hash table for storing file descriptors */
    fd_table = qhash_init(hash_file_compare, quickhash_64bit_hash, 29);
    if (!fd_table)
    {
        fprintf(stderr, "File descriptor hash table memory error\n");
        goto error_exit;
    }

272 273 274
    /* synchronize before replay */
    MPI_Barrier(MPI_COMM_WORLD);

275 276 277
    /* loading is finished */
    load_end = MPI_Wtime();

Shane Snyder's avatar
Shane Snyder committed
278 279
    if (myrank == 0) printf("Note: Workload took %.2lf seconds to load.\n", load_end - load_start);

280 281 282 283
    /* replay loop */
    while (1)
    {
        /* get the next replay operation from the workload generator */
284
        codes_workload_get_next(workload_id, 0, myrank, &next_op);
285 286 287

        if (next_op.op_type != CODES_WK_END)
        {
288 289

            if (next_op.op_type == CODES_WK_DELAY)
290
            {
291
                next_op.u.delay.seconds *= opt_delay_pct;
292 293 294 295
                /* cap max delay if requested by cmd line */
                if(opt_max_delay >= 0 && next_op.u.delay.seconds > opt_max_delay)
                    next_op.u.delay.seconds = opt_max_delay;
            }
296

297 298 299 300
            /* replay the next workload operation */
            ret = replay_workload_op(next_op, myrank, replay_op_number++);
            if (ret < 0)
            {
301
                fprintf(stderr, "Error: replay_workload_op() for replay_op_number %lld failed on rank %d\n", replay_op_number-1, myrank);
302 303 304 305 306 307 308 309 310 311
                break;
            }
        }
        else
        {
            /* workload replay for this rank is complete */
            break;
        }
    }

312
    if (log_stream)
313 314
        fclose(log_stream);

315
    /* destroy and finalize the file descriptor hash table */
316
    qhash_destroy_and_finalize(fd_table, struct file_info, hash_link, free);
317

318 319 320 321 322 323
#if DEBUG_PROFILING
    printf("Rank %d:\td=%.4lf, b=%.4lf, o=%.4lf, c=%.4lf, r=%.4lf, w=%.4lf\n",
           myrank, total_delay_time, total_barrier_time, total_open_time,
           total_close_time, total_read_time, total_write_time);
#endif

324 325
error_exit:
    MPI_Finalize();
326

327 328 329
    return ret;
}

330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631
/*
 * Register a newly opened file in fd_table so later read/write/close
 * operations can look up its handle by workload file id.  On the first call
 * this also allocates the shared BUF_SIZE-byte I/O buffer and fills it with
 * the character '1'.
 *
 * file_hash - workload file id, used as the hash table key
 * fildes    - POSIX file descriptor to store in the entry
 * fh        - MPI-IO file handle to store in the entry
 *
 * Returns 0 on success, -1 if memory could not be allocated.  The buffer is
 * allocated before the entry is registered, so a failure leaves fd_table
 * unchanged.
 */
static int track_open_file(uint64_t file_hash, int fildes, MPI_File fh)
{
    struct file_info *entry = NULL;
#if DEBUG_PROFILING
    /* NOTE: this interval covers only the bookkeeping done in this helper;
     * the open call itself happens in the caller before we are entered */
    double start = MPI_Wtime();
    double end;
#endif

    if (!buf)
    {
        buf = malloc(BUF_SIZE);
        if (!buf)
        {
            fprintf(stderr, "No memory available for I/O buffer\n");
            return -1;
        }
        memset(buf, '1', BUF_SIZE);
    }

    /* save the file descriptor for this file in a hash table to be retrieved later */
    entry = malloc(sizeof(*entry));
    if (!entry)
    {
        fprintf(stderr, "No memory available for file hash entry\n");
        return -1;
    }

    entry->file_hash = file_hash;
    entry->file_descriptor = fildes;
    entry->fh = fh;
    qhash_add(fd_table, &(file_hash), &(entry->hash_link));

#if DEBUG_PROFILING
    end = MPI_Wtime();
    total_open_time += (end - start);
#endif

    return(0);
}

static int do_read(struct codes_workload_op replay_op, int rank, long long int op_number)
{
    int fildes;
    MPI_File fh;
    struct qlist_head *hash_link = NULL;
    struct file_info *tmp_list = NULL;
    int ret;
    char *op_name;
    MPI_Status status;

    if(replay_op.op_type == CODES_WK_READ)
        op_name = "READ";
    else if(replay_op.op_type == CODES_WK_MPI_READ)
        op_name = "MPI_READ (independent)";
    else
        op_name = "MPI_READ (collective)";

    if (opt_verbose)
        fprintf(log_stream, "[Rank %d] Operation %lld : %s file %llu (sz = %llu, off = %llu)\n",
               rank, op_number, op_name, LLU(replay_op.u.read.file_id), LLU(replay_op.u.read.size),
               LLU(replay_op.u.read.offset));

    if (!opt_noop)
    {
	if(replay_op.u.read.size > BUF_SIZE)
	{
		fprintf(stderr, "ERROR: workload read of size %lu is larger than buffer size of %d\n", replay_op.u.read.size, BUF_SIZE);
		return(-1);
	}

        /* search for the corresponding file descriptor in the hash table */
        hash_link = qhash_search(fd_table, &(replay_op.u.read.file_id));
        if(!hash_link)
        {
            fprintf(stderr, "ERROR: rank %d unable to find fd_table record for file_id %llu in read path.\n", rank, LLU(replay_op.u.read.file_id));
            assert(hash_link);
        }
        tmp_list = qhash_entry(hash_link, struct file_info, hash_link);
        fildes = tmp_list->file_descriptor;
        fh = tmp_list->fh;

        switch(replay_op.op_type)
        {
            case CODES_WK_READ:
                ret = pread(fildes, buf, replay_op.u.read.size, replay_op.u.read.offset);
                break;
            case CODES_WK_MPI_READ:
                ret = MPI_File_read_at(fh, replay_op.u.read.offset, buf, replay_op.u.read.size, MPI_BYTE, &status);
                break;
            case CODES_WK_MPI_COLL_READ:
                ret = MPI_File_read_at_all(fh, replay_op.u.read.offset, buf, replay_op.u.read.size, MPI_BYTE, &status);
                break;
            default:
                assert(0);
        }

        if (ret < 0)
        {
            fprintf(stderr, "Rank %d failure on operation %lld [%s]\n",
                    rank, op_number, op_name);
            return -1;
        }

#if DEBUG_PROFILING
        end = MPI_Wtime();
        total_read_time += (end - start);
#endif
    }
    return(0);
}

static int do_write(struct codes_workload_op replay_op, int rank, long long int op_number)
{
    int fildes;
    MPI_File fh;
    struct qlist_head *hash_link = NULL;
    struct file_info *tmp_list = NULL;
    int ret;
    char *op_name;
    MPI_Status status;

    if(replay_op.op_type == CODES_WK_WRITE)
        op_name = "WRITE";
    else if(replay_op.op_type == CODES_WK_MPI_WRITE)
        op_name = "MPI_WRITE (independent)";
    else
        op_name = "MPI_WRITE (collective)";

    if (opt_verbose)
        fprintf(log_stream, "[Rank %d] Operation %lld : %s file %llu (sz = %llu, off = %llu)\n",
               rank, op_number, op_name, LLU(replay_op.u.write.file_id), LLU(replay_op.u.write.size),
               LLU(replay_op.u.write.offset));

    if (!opt_noop)
    {
	if(replay_op.u.write.size > BUF_SIZE)
	{
		fprintf(stderr, "ERROR: workload write of size %lu is larger than buffer size of %d\n", replay_op.u.write.size, BUF_SIZE);
		return(-1);
	}
		
        /* search for the corresponding file descriptor in the hash table */
        hash_link = qhash_search(fd_table, &(replay_op.u.write.file_id));
        if(!hash_link)
        {
            fprintf(stderr, "ERROR: rank %d unable to find fd_table record for file_id %llu in write path.\n", rank, LLU(replay_op.u.write.file_id));
            assert(hash_link);
        }
        tmp_list = qhash_entry(hash_link, struct file_info, hash_link);
        fildes = tmp_list->file_descriptor;
        fh = tmp_list->fh;

        switch(replay_op.op_type)
        {
            case CODES_WK_WRITE:
                ret = pwrite(fildes, buf, replay_op.u.write.size, replay_op.u.write.offset);
                break;
            case CODES_WK_MPI_WRITE:
                ret = MPI_File_write_at(fh, replay_op.u.write.offset, buf, replay_op.u.write.size, MPI_BYTE, &status);
                break;
            case CODES_WK_MPI_COLL_WRITE:
                ret = MPI_File_write_at_all(fh, replay_op.u.write.offset, buf, replay_op.u.write.size, MPI_BYTE, &status);
                break;
            default:
                assert(0);
        }

        if (ret < 0)
        {
            if(replay_op.op_type == CODES_WK_WRITE)
                perror("pwrite");

            fprintf(stderr, "Rank %d failure on operation %lld [%s]\n",
                    rank, op_number, op_name);
            return -1;
        }

#if DEBUG_PROFILING
        end = MPI_Wtime();
        total_write_time += (end - start);
#endif
    }
    return(0);
}

static int do_close(int rank, uint64_t file_hash, enum codes_workload_op_type type, long long op_number)
{
    int fildes;
    MPI_File fh;
    struct qlist_head *hash_link = NULL;
    struct file_info *tmp_list = NULL;
    int ret;

    if (opt_verbose)
        fprintf(log_stream, "[Rank %d] Operation %lld : %s file %"PRIu64"\n",
                rank, op_number, (type == CODES_WK_CLOSE) ? "CLOSE" : "MPI_CLOSE", file_hash);

    if (!opt_noop)
    {
        /* search for the corresponding file descriptor in the hash table */
        hash_link = qhash_search_and_remove(fd_table, &(file_hash));
        if(!hash_link && opt_prep)
            return(0);
        if(!hash_link)
        {
            fprintf(stderr, "ERROR: rank %d unable to find fd_table record for file_id %llu in close path.\n", rank, LLU(file_hash));
            assert(hash_link);
        }
        tmp_list = qhash_entry(hash_link, struct file_info, hash_link);
        fildes = tmp_list->file_descriptor;
        fh = tmp_list->fh;
        free(tmp_list);

        if(type == CODES_WK_CLOSE)
        {
            /* perform the close operation */
            ret = close(fildes);
            if (ret < 0 && !opt_prep)
            {
                fprintf(stderr, "Rank %d failure on operation %lld [CLOSE: %s]\n",
                        rank, op_number, strerror(errno));
                return -1;
            }
        }
        else
        {
            ret = MPI_File_close(&fh);
            if (ret < 0 && !opt_prep)
            {
                fprintf(stderr, "Rank %d failure on operation %lld [CLOSE]\n",
                        rank, op_number);
                return -1;
            }
        }
#if DEBUG_PROFILING
        end = MPI_Wtime();
        total_close_time += (end - start);
#endif
    }

    return(0);
}

static int do_mpi_open(int rank, uint64_t file_hash, int create_flag, int collective_flag, long long op_number)
{
    int mpi_open_flags = MPI_MODE_RDWR;
    char file_name[250];
    MPI_File fh;
    int ret;
    MPI_Comm comm = MPI_COMM_SELF;

    if (opt_verbose)
        fprintf(log_stream, "[Rank %d] Operation %lld: %s file %"PRIu64"\n", rank, op_number,
               collective_flag ? "MPI_FILE_OPEN (collective)" : "MPI_FILE_OPEN (independent)", file_hash);

    if (!opt_noop)
    {
        /* set the create flag, if necessary */
        if (create_flag)
            mpi_open_flags |= MPI_MODE_CREATE;
        if (collective_flag)
            comm = MPI_COMM_WORLD;

        /* write the file hash to string to be used as the actual file name */
        snprintf(file_name, sizeof(file_name), "%"PRIu64, file_hash);

        /* perform the open operation */
        ret = MPI_File_open(comm, file_name, mpi_open_flags, MPI_INFO_NULL, &fh);
        if (ret < 0)
        {
            fprintf(stderr, "Rank %d failure on operation %lld [%s]\n",
                    rank, op_number, "MPI_FILE_OPEN");
            return -1;
        }

        if(track_open_file(file_hash, -1, fh) < 0)
            return -1;
    }

    return(0);
}

/*
 * Build the write operation that corresponds to a read operation: same file
 * id, offset, size and timing fields, with the op type mapped to the
 * matching write flavor (POSIX, independent MPI, collective MPI).
 *
 * read_op  - source operation; its op_type must be one of the three read
 *            types, otherwise the function asserts
 * write_op - destination; zeroed first, then filled in
 */
static void convert_read_to_write(struct codes_workload_op *read_op,
    struct codes_workload_op *write_op)
{
    memset(write_op, 0, sizeof(*write_op));

    switch (read_op->op_type)
    {
        case CODES_WK_MPI_READ:
            write_op->op_type = CODES_WK_MPI_WRITE;
            break;
        case CODES_WK_MPI_COLL_READ:
            write_op->op_type = CODES_WK_MPI_COLL_WRITE;
            break;
        case CODES_WK_READ:
            write_op->op_type = CODES_WK_WRITE;
            break;
        default:
            assert(0);
    }

    /* timing information carries over unchanged */
    write_op->start_time = read_op->start_time;
    write_op->end_time = read_op->end_time;
    write_op->sim_start_time = read_op->sim_start_time;

    /* the write targets the same region of the same file */
    write_op->u.write.file_id = read_op->u.read.file_id;
    write_op->u.write.offset = read_op->u.read.offset;
    write_op->u.write.size = read_op->u.read.size;
}

632 633
int replay_workload_op(struct codes_workload_op replay_op, int rank, long long int op_number)
{
Shane Snyder's avatar
Shane Snyder committed
634
    struct timespec delay;
635
    int open_flags = O_RDWR;
636
    char file_name[250];
637 638
    int fildes;
    int ret;
639
    struct codes_workload_op converted_op;
640 641

#if DEBUG_PROFILING
642
    double start, end;
643 644
    start = MPI_Wtime();
#endif
645 646 647 648

    switch (replay_op.op_type)
    {
        case CODES_WK_DELAY:
649 650
            if (opt_prep)
                return(0);
651
            if (opt_verbose)
652
                fprintf(log_stream, "[Rank %d] Operation %lld : DELAY %lf seconds\n",
653 654 655 656 657
                       rank, op_number, replay_op.u.delay.seconds);

            if (!opt_noop)
            {
                /* satisfy delay using second delay then microsecond delay */
Shane Snyder's avatar
Shane Snyder committed
658 659 660 661
                delay.tv_sec = (long long)replay_op.u.delay.seconds;
                delay.tv_nsec = (unsigned int)((replay_op.u.delay.seconds - delay.tv_sec) *
                                               1000 * 1000 * 1000);
                ret = nanosleep(&delay, NULL);
662 663 664 665 666 667 668 669
                if (ret)
                {
                    /* error in sleep */
                    errno = EINTR;
                    fprintf(stderr, "Rank %d failure on operation %lld [DELAY: %s]\n",
                            rank, op_number, strerror(errno));
                    return -1;
                }
670 671 672 673
#if DEBUG_PROFILING
                end = MPI_Wtime();
                total_delay_time += (end - start);
#endif
674 675 676 677
            }
            return 0;
        case CODES_WK_BARRIER:
            if (opt_verbose)
678
                fprintf(log_stream, "[Rank %d] Operation %lld : BARRIER\n", rank, op_number);
679 680 681 682 683 684 685 686 687 688 689 690

            if (!opt_noop)
            {
                /* implement barrier using MPI global barrier on all ranks */
                ret = MPI_Barrier(MPI_COMM_WORLD);
                if (ret != MPI_SUCCESS)
                {
                    /* error in MPI_Barrier */
                    fprintf(stderr, "Rank %d failure on operation %lld [BARRIER: %s]\n",
                            rank, op_number, "Invalid communicator");
                    return -1;
                }
691 692 693 694 695

#if DEBUG_PROFILING
                end = MPI_Wtime();
                total_barrier_time += (end - start);
#endif
696 697
            }
            return 0;
698 699 700 701 702
         case CODES_WK_OPEN:
            if(opt_prep && replay_op.u.open.create_flag)
                return(0);
            if(opt_prep)
                replay_op.u.open.create_flag = 1;
703
            if (opt_verbose)
704
                fprintf(log_stream, "[Rank %d] Operation %lld: %s file %"PRIu64"\n", rank, op_number,
705 706 707 708 709 710 711 712 713
                       (replay_op.u.open.create_flag) ? "CREATE" : "OPEN", replay_op.u.open.file_id);

            if (!opt_noop)
            {
                /* set the create flag, if necessary */
                if (replay_op.u.open.create_flag)
                    open_flags |= O_CREAT;

                /* write the file hash to string to be used as the actual file name */
Shane Snyder's avatar
Shane Snyder committed
714
                snprintf(file_name, sizeof(file_name), "%"PRIu64, replay_op.u.open.file_id);
715 716 717 718 719 720 721 722 723 724 725

                /* perform the open operation */
                fildes = open(file_name, open_flags, 0666);
                if (fildes < 0)
                {
                    fprintf(stderr, "Rank %d failure on operation %lld [%s: %s]\n",
                            rank, op_number, (replay_op.u.open.create_flag) ? "CREATE" : "OPEN",
                            strerror(errno));
                    return -1;
                }

726
                if(track_open_file(replay_op.u.open.file_id, fildes, MPI_FILE_NULL) < 0)
727 728 729
                    return -1;
            }
            return 0;
730 731 732 733 734 735 736 737 738 739
        case CODES_WK_MPI_OPEN:
            if(opt_prep && replay_op.u.open.create_flag)
                return(0);
            return(do_mpi_open(rank, replay_op.u.open.file_id, 
                replay_op.u.open.create_flag, 0, op_number));
        case CODES_WK_MPI_COLL_OPEN:
            if(opt_prep && replay_op.u.open.create_flag)
                return(0);
            return(do_mpi_open(rank, replay_op.u.open.file_id, 
                replay_op.u.open.create_flag, 1, op_number));
740
        case CODES_WK_CLOSE:
741 742 743
        case CODES_WK_MPI_CLOSE:
            return(do_close(rank, replay_op.u.close.file_id, replay_op.op_type,
                op_number));
744
        case CODES_WK_WRITE:
745 746 747 748 749
        case CODES_WK_MPI_WRITE:
        case CODES_WK_MPI_COLL_WRITE:
            if(opt_prep)
                return(0);
            return(do_write(replay_op, rank, op_number));
750
        case CODES_WK_READ:
751 752 753
        case CODES_WK_MPI_READ:
        case CODES_WK_MPI_COLL_READ:
            if(opt_prep)
754
            {
755 756
                convert_read_to_write(&replay_op, &converted_op);
                return(do_write(converted_op, rank, op_number));
757
            }
758
            return(do_read(replay_op, rank, op_number));
759 760
        default:
            fprintf(stderr, "** Rank %d: INVALID OPERATION (op count = %lld) **\n", rank, op_number);
761
            return 0;
762 763 764 765 766 767 768
    }

}

int hash_file_compare(void *key, struct qlist_head *link)
{
    uint64_t *in_file_hash = (uint64_t *)key;
769
    struct file_info *tmp_file;
770

771
    tmp_file = qlist_entry(link, struct file_info, hash_link);
772 773 774 775 776
    if (tmp_file->file_hash == *in_file_hash)
        return 1;

    return 0;
}
777 778 779 780 781 782 783 784 785 786

/*
 * Local variables:
 *  c-indent-level: 4
 *  c-basic-offset: 4
 * End:
 *
 * vim: ft=c ts=8 sts=4 sw=4 expandtab
 */