modelnet-test.c 13.4 KB
Newer Older
/*
 * Copyright (C) 2013 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
 */

/* SUMMARY:
 *
 * This is a test harness for the modelnet module.  It sets up a number of
 * servers, each of which is paired up with a simplenet LP to serve as the
 * NIC.  Each server exchanges a sequence of requests and acks with one peer
 * and measures the throughput in terms of payload bytes (ack size) moved
 * per second.
 */

#include <string.h>
#include <assert.h>
#include <ross.h>

#include "codes/model-net.h"
#include "codes/lp-io.h"
#include "codes/codes.h"
#include "codes/codes_mapping.h"
#include "codes/configuration.h"
#include "codes/lp-type-lookup.h"

27
/* Number of requests each server issues before it stops sending. */
#define NUM_REQS 2
/* Size, in bytes, of the simulated data payload attached to each message. */
#define PAYLOAD_SZ 4096
29 30

/* Network id used for every model-net call; main() sets it to the first id
 * returned by model_net_configure(). */
static int net_id = 0;

/* LP counts for "MODELNET_GRP", queried from codes_mapping in main().
 * num_routers is only filled in for the router-based networks. */
static int num_routers = 0;
static int num_servers = 0;

/* Added to a server's gid to reach its peer server; main() lowers it to 1
 * for the dragonfly, slimfly and express-mesh networks. */
static int offset = 2;

/* Non-zero selects the "pull" variant of the exchange instead of "push". */
static int do_pull = 0;

/* Per-repetition LP counts, (re)computed in handle_kickoff_event() and read
 * by the ack and request handlers. */
static int num_routers_per_rep = 0;
static int num_servers_per_rep = 0;
static int lps_per_rep = 0;

42 43 44
typedef struct svr_msg svr_msg;
typedef struct svr_state svr_state;

45 46
char router_name[MAX_NAME_LENGTH];

47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
/* Kinds of events a server LP handles (see svr_event() for the dispatch). */
enum svr_event
{
    KICKOFF = 0, /* self-addressed event scheduled by svr_init() */
    REQ     = 1, /* request delivered from the peer server */
    ACK     = 2, /* ack delivered in response to a request */
    LOCAL   = 3  /* local event passed as the "self" message to model-net */
};

/* Per-server LP state; zeroed in svr_init(). */
struct svr_state
{
    /* incremented each time the kickoff/ack handlers send a request */
    int msg_sent_count;
    /* incremented each time handle_req_event() receives a request */
    int msg_recvd_count;
    /* incremented each time a LOCAL event is handled */
    int local_recvd_count;
    /* simulation time recorded when the kickoff event is handled */
    tw_stime start_ts;
    /* simulation time recorded by the ack that arrives after the last
     * request has been sent */
    tw_stime end_ts;
};

struct svr_msg
{
    enum svr_event svr_event_type;
68
//    enum net_event net_event_type;
69 70
    tw_lpid src;          /* source of this request or ack */

71 72 73
    // rc for modelnet calls
    model_net_event_return ret;

74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
    int incremented_flag; /* helper for reverse computation */
};

/* ROSS callbacks for the server LP type (wired up in svr_lp below). */
static void svr_init(svr_state * ns, tw_lp * lp);
static void svr_event(svr_state * ns, tw_bf * b, svr_msg * m, tw_lp * lp);
static void svr_rev_event(svr_state * ns, tw_bf * b, svr_msg * m, tw_lp * lp);
static void svr_finalize(svr_state * ns, tw_lp * lp);

tw_lptype svr_lp = {
95 96 97 98
    (init_f) svr_init,
    (pre_run_f) NULL,
    (event_f) svr_event,
    (revent_f) svr_rev_event,
99 100
    (commit_f) NULL,
    (final_f)  svr_finalize,
101 102
    (map_f) codes_mapping,
    sizeof(svr_state),
103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
};

/* LP type lookup and registration */
extern const tw_lptype* svr_get_lp_type();
static void svr_add_lp_type();

/* time unit conversions (the argument of s_to_ns is in seconds, despite
 * its name) */
static tw_stime ns_to_s(tw_stime ns);
static tw_stime s_to_ns(tw_stime ns);

/* forward handlers, one per event type */
static void handle_kickoff_event(svr_state * ns, svr_msg * m, tw_lp * lp);
static void handle_ack_event(svr_state * ns, svr_msg * m, tw_lp * lp);
static void handle_req_event(svr_state * ns, svr_msg * m, tw_lp * lp);
static void handle_local_event(svr_state * ns);

/* reverse handlers, one per event type */
static void handle_local_rev_event(svr_state * ns);
static void handle_kickoff_rev_event(svr_state * ns, svr_msg * m, tw_lp * lp);
static void handle_ack_rev_event(svr_state * ns, svr_msg * m, tw_lp * lp);
static void handle_req_rev_event(svr_state * ns, svr_msg * m, tw_lp * lp);
135 136 137 138 139 140 141

/* Command-line option table handed to tw_opt_add() in main(); it only
 * declares the group heading and defines no options of its own. */
const tw_optdef app_opt [] =
{
    TWOPT_GROUP("Model net test case" ),
    TWOPT_END()
};

142 143 144 145 146 147
int main(
    int argc,
    char **argv)
{
    int nprocs;
    int rank;
148 149
    int num_nets;
    int *net_ids;
150 151
    //printf("\n Config count %d ",(int) config.lpgroups_count);
    g_tw_ts_end = s_to_ns(60*60*24*365); /* one year, in nsecs */
152
    lp_io_handle handle;
153 154 155 156

    tw_opt_add(app_opt);
    tw_init(&argc, &argv);

157
    if(argc < 2)
158
    {
159
	    printf("\n Usage: mpirun <args> --sync=2/3 mapping_file_name.conf (optional --nkp) ");
160
	    MPI_Finalize();
161
	    return 0;
162 163 164
    }
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
165

166
    configuration_load(argv[2], MPI_COMM_WORLD, &config);
167 168

    model_net_register();
169
    svr_add_lp_type();
170

171
    codes_mapping_setup();
172

173
    net_ids = model_net_configure(&num_nets);
174
    assert(num_nets>=1);
175 176 177
    net_id = *net_ids;
    free(net_ids);

178 179
    num_servers = codes_mapping_get_lp_count("MODELNET_GRP", 0, "server",
            NULL, 1);
180

181
    if(net_id == DRAGONFLY)
182 183 184
    {
      strcpy(router_name, "modelnet_dragonfly_router");
    }
185

186 187
    if(net_id == SLIMFLY)
    {
188
      strcpy(router_name, "slimfly_router");
189
    }
190 191 192 193 194
    
    if(net_id == EXPRESS_MESH)
    {
      strcpy(router_name, "modelnet_express_mesh_router");
    }
195

196 197

    if(net_id == SLIMFLY || net_id == DRAGONFLY || net_id == EXPRESS_MESH)
198
    {
199
	  num_routers = codes_mapping_get_lp_count("MODELNET_GRP", 0,
200
                  router_name, NULL, 1);
201
	  offset = 1;
202
    }
203

204 205 206 207 208
    if(lp_io_prepare("modelnet-test", LP_IO_UNIQ_SUFFIX, &handle, MPI_COMM_WORLD) < 0)
    {
        return(-1);
    }

209
    tw_run();
210 211
    model_net_report_stats(net_id);

212 213 214 215 216
    if(lp_io_flush(handle, MPI_COMM_WORLD) < 0)
    {
        return(-1);
    }

217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
    tw_end();
    return 0;
}

/* Returns a pointer to the server LP type descriptor (svr_lp). */
const tw_lptype* svr_get_lp_type()
{
    const tw_lptype *type = &svr_lp;

    return type;
}

static void svr_add_lp_type()
{
  lp_type_register("server", svr_get_lp_type());
}

/*
 * LP init callback: clears the server state and schedules a KICKOFF event
 * addressed to this same LP, which starts the real exchange.
 */
static void svr_init(
    svr_state * ns,
    tw_lp * lp)
{
    memset(ns, 0, sizeof(*ns));

    /* skew each kickoff event slightly to help avoid event ties later on */
    tw_stime kickoff_time = g_tw_lookahead + tw_rand_unif(lp->rng);

    /* dummy self-addressed event that triggers handle_kickoff_event() */
    tw_event *kickoff = tw_event_new(lp->gid, kickoff_time, lp);
    svr_msg *msg = tw_event_data(kickoff);

    msg->svr_event_type = KICKOFF;
    tw_event_send(kickoff);
}

/*
 * Forward event callback for the server LP: dispatches to the handler that
 * matches the event type stored in the message.
 */
static void svr_event(
    svr_state * ns,
    tw_bf * b,
    svr_msg * m,
    tw_lp * lp)
{
    (void)b; /* bitfield is not used by this LP */

    switch (m->svr_event_type)
    {
        case KICKOFF:
            handle_kickoff_event(ns, m, lp);
            return;
        case REQ:
            handle_req_event(ns, m, lp);
            return;
        case ACK:
            handle_ack_event(ns, m, lp);
            return;
        case LOCAL:
            handle_local_event(ns);
            return;
        default:
            break;
    }

    /* unknown event type: report it and trip the assertion */
    printf("\n Invalid message type %d ", m->svr_event_type);
    assert(0);
}

/*
 * Reverse event callback for the server LP: dispatches to the reverse
 * handler that matches the event type stored in the message.
 */
static void svr_rev_event(
    svr_state * ns,
    tw_bf * b,
    svr_msg * m,
    tw_lp * lp)
{
    (void)b; /* bitfield is not used by this LP */

    switch (m->svr_event_type)
    {
        case KICKOFF:
            handle_kickoff_rev_event(ns, m, lp);
            return;
        case REQ:
            handle_req_rev_event(ns, m, lp);
            return;
        case ACK:
            handle_ack_rev_event(ns, m, lp);
            return;
        case LOCAL:
            handle_local_rev_event(ns);
            return;
        default:
            break;
    }

    /* unknown event type */
    assert(0);
}

/*
 * LP finalize callback: prints one summary line for this server with the
 * bytes received, the elapsed time between start_ts and end_ts, the
 * resulting throughput and the message counters.
 */
static void svr_finalize(
    svr_state * ns,
    tw_lp * lp)
{
    /* time between the kickoff and the final ack, in seconds */
    tw_stime elapsed_s = ns_to_s(ns->end_ts-ns->start_ts);
    int bytes_recvd = PAYLOAD_SZ*ns->msg_recvd_count;

    printf("server %llu recvd %d bytes in %f seconds, %f MiB/s sent_count %d recvd_count %d local_count %d \n",
        (unsigned long long)lp->gid,
        bytes_recvd,
        elapsed_s,
        ((double)(PAYLOAD_SZ*NUM_REQS)/(double)(1024*1024)/elapsed_s),
        ns->msg_sent_count,
        ns->msg_recvd_count,
        ns->local_recvd_count);
}

/* Converts a duration given in nanoseconds to seconds. */
static tw_stime ns_to_s(tw_stime ns)
{
    const double ns_per_s = 1000.0 * 1000.0 * 1000.0;

    return ns / ns_per_s;
}

/* Converts a duration given in seconds to nanoseconds.  The parameter is
 * named "ns" but holds the input value in seconds. */
static tw_stime s_to_ns(tw_stime ns)
{
    const double ns_per_s = 1000.0 * 1000.0 * 1000.0;

    return ns * ns_per_s;
}

/*
 * Handles the initial KICKOFF event: records the start time, refreshes the
 * per-repetition LP counts, and sends the first message to the peer server.
 *
 * ns: state of this server LP (start_ts and msg_sent_count are updated)
 * m:  the kickoff event; the model-net return value is stored in m->ret so
 *     the reverse handler can undo the send
 * lp: this LP
 */
static void handle_kickoff_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    /* The two messages only need to live for the duration of the model-net
     * call (model-net is expected to copy the event payloads it is given),
     * so they are kept on the stack instead of being heap-allocated and
     * never freed. */
    svr_msg m_local;
    svr_msg m_remote;

    memset(&m_local, 0, sizeof(m_local));
    m_local.svr_event_type = LOCAL;
    m_local.src = lp->gid;

    /* the remote message is a copy of the local one with a different type */
    m_remote = m_local;
    m_remote.svr_event_type = (do_pull) ? ACK : REQ;

    /* record when transfers started on this server */
    ns->start_ts = tw_now(lp);

    /* these file-scope counts are also read by the ack and request handlers */
    num_servers_per_rep = codes_mapping_get_lp_count("MODELNET_GRP", 1,
            "server", NULL, 1);
    num_routers_per_rep = codes_mapping_get_lp_count("MODELNET_GRP", 1,
            router_name, NULL, 1);

    lps_per_rep = num_servers_per_rep * 2 + num_routers_per_rep;

    int opt_offset = 0;
    int total_lps = num_servers * 2 + num_routers;

    /* on router-based networks, the server whose gid falls on the last
     * server slot of a repetition gets an extra offset to compute its peer */
    if((net_id == DRAGONFLY || net_id == SLIMFLY || net_id == EXPRESS_MESH) && (lp->gid % lps_per_rep == num_servers_per_rep - 1))
    {
        opt_offset = num_servers_per_rep + num_routers_per_rep;
    }

    /* each server sends a request to the next highest server */
    tw_lpid dest_id = (lp->gid + offset + opt_offset)%total_lps;

    if (do_pull){
        m->ret = model_net_pull_event(net_id, "test", dest_id, PAYLOAD_SZ, 0.0,
                sizeof(svr_msg), (const void*)&m_remote, lp);
    }
    else{
        m->ret = model_net_event(net_id, "test", dest_id, PAYLOAD_SZ, 0.0,
                sizeof(svr_msg), (const void*)&m_remote,
                sizeof(svr_msg), (const void*)&m_local, lp);
    }
    ns->msg_sent_count++;
}

380
/* Handles a LOCAL event by counting it in the server state. */
static void handle_local_event(svr_state * ns)
{
    ns->local_recvd_count += 1;
}

385
/* Reverse handler for LOCAL events: undoes the count made by
 * handle_local_event(). */
static void handle_local_rev_event(svr_state * ns)
{
    ns->local_recvd_count -= 1;
}
/* Reverse handler for REQ events: undoes the receive count and rolls back
 * the ack that was sent through model-net. */
static void handle_req_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    ns->msg_recvd_count -= 1;
    model_net_event_rc2(lp, &m->ret);
}


/* Reverse handler for KICKOFF events: undoes the send count and rolls back
 * the model-net send recorded in m->ret. */
static void handle_kickoff_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    ns->msg_sent_count -= 1;

    /* push and pull sends are rolled back through the same call */
    model_net_event_rc2(lp, &m->ret);
}

/* Reverse handler for ACK events: if the forward handler sent another
 * request, rolls that send back and undoes the send count. */
static void handle_ack_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    if(!m->incremented_flag)
    {
        /* the forward handler only wrote end_ts; it is deliberately not
         * restored, since the next ack event overwrites it anyway */
        return;
    }

    model_net_event_rc2(lp, &m->ret);
    ns->msg_sent_count--;
}

/*
 * Handles an incoming ACK: checks that it came from the expected LP, then
 * either sends the next request (while fewer than NUM_REQS have been sent)
 * or records the end timestamp.
 *
 * ns: state of this server LP (msg_sent_count or end_ts is updated)
 * m:  the ack event; m->ret and m->incremented_flag are written so the
 *     reverse handler knows whether a send has to be rolled back
 * lp: this LP
 */
static void handle_ack_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    int opt_offset = 0;

    if((net_id == DRAGONFLY || net_id == SLIMFLY || net_id == EXPRESS_MESH) && (lp->gid % lps_per_rep == num_servers_per_rep - 1))
    {
        opt_offset = num_servers_per_rep + num_routers_per_rep; /* optional offset due to dragonfly mapping */
    }

    tw_lpid dest_id = (lp->gid + offset + opt_offset)%(num_servers*2 + num_routers);

    /* safety check that this ack came from the expected LP;
     * in the "pull" case, src should actually be self */
    if (do_pull){
        assert(m->src == lp->gid);
    }
    else{
        assert(m->src == dest_id);
    }

    if(ns->msg_sent_count >= NUM_REQS)
    {
        /* all requests have been sent: just note the completion time */
        ns->end_ts = tw_now(lp);
        m->incremented_flag = 0;
        return;
    }

    /* Send another request.  The messages are built on the stack and only
     * on this path: model-net is expected to copy the payloads during the
     * call, so heap allocations here would simply be leaked. */
    svr_msg m_local;
    svr_msg m_remote;

    memset(&m_local, 0, sizeof(m_local));
    m_local.svr_event_type = LOCAL;
    m_local.src = lp->gid;

    m_remote = m_local;
    m_remote.svr_event_type = (do_pull) ? ACK : REQ;

    if (do_pull){
        m->ret = model_net_pull_event(net_id, "test", dest_id, PAYLOAD_SZ, 0.0,
                sizeof(svr_msg), (const void*)&m_remote, lp);
    }
    else{
        m->ret = model_net_event(net_id, "test", dest_id, PAYLOAD_SZ, 0.0,
                sizeof(svr_msg), (const void*)&m_remote,
                sizeof(svr_msg), (const void*)&m_local, lp);
    }
    ns->msg_sent_count++;
    m->incremented_flag = 1;
}

492
/* handle receiving request
493 494
 * (note: this should never be called when doing the "pulling" version of
 * the program) */
495 496 497 498 499
static void handle_req_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
500
    assert(!do_pull);
501 502 503 504 505 506 507 508
    svr_msg * m_local = malloc(sizeof(svr_msg));
    svr_msg * m_remote = malloc(sizeof(svr_msg));

    m_local->svr_event_type = LOCAL;
    m_local->src = lp->gid;

    memcpy(m_remote, m_local, sizeof(svr_msg));
    m_remote->svr_event_type = ACK;
509
    //printf("handle_req_event(), lp %llu src %llu .\n", (unsigned long long)lp->gid, (unsigned long long) m->src);
510 511

    /* safety check that this request got to the right server */
512
//    printf("\n m->src %d lp->gid %d ", m->src, lp->gid);
513
    int opt_offset = 0;
514

515
    if((net_id == DRAGONFLY || net_id == SLIMFLY || net_id == EXPRESS_MESH) && (m->src % lps_per_rep == num_servers_per_rep - 1))
516 517
          opt_offset = num_servers_per_rep + num_routers_per_rep; /* optional offset due to dragonfly mapping */

518
    assert(lp->gid == (m->src + offset + opt_offset)%(num_servers*2 + num_routers));
519 520 521 522 523 524
    ns->msg_recvd_count++;

    /* send ack back */
    /* simulated payload of 1 MiB */
    /* also trigger a local event for completion of payload msg */
    /* remote host will get an ack event */
525 526

   // mm Q: What should be the size of an ack message? may be a few bytes? or larger..?
527
    m->ret = model_net_event(net_id, "test", m->src, PAYLOAD_SZ, 0.0, sizeof(svr_msg), (const void*)m_remote, sizeof(svr_msg), (const void*)m_local, lp);
528 529 530 531 532 533 534 535 536
//    printf("\n Sending ack to LP %d %d ", m->src, m_remote->src);
    return;
}

/*
 * Local variables:
 *  c-indent-level: 4
 *  c-basic-offset: 4
 * End:
 *
 * vim: ft=c ts=8 sts=4 sw=4 expandtab
 */