/*
 * Copyright (C) 2013 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
 */

/* SUMMARY:
 *
 * This is a test harness for the modelnet module.  It sets up a number of
 * servers, each of which is paired up with a simplenet LP to serve as the
 * NIC.  Each server exchanges a sequence of requests and acks with one peer
 * and measures the throughput in terms of payload bytes (ack size) moved
 * per second.
 */

#include <string.h>
#include <assert.h>
#include <ross.h>

#include "codes/model-net.h"
#include "codes/lp-io.h"
#include "codes/codes.h"
#include "codes/codes_mapping.h"
#include "codes/configuration.h"
#include "codes/lp-type-lookup.h"

#define NUM_REQS 2      /* number of requests sent by each server */
#define PAYLOAD_SZ 4096 /* size of simulated data payload, bytes */

/* id of the network model in use; main() takes it from the first entry
 * returned by model_net_configure() */
static int net_id = 0;

/* LP counts that main() queries from the "MODELNET_GRP" group; num_routers
 * is only queried for dragonfly and slimfly networks */
static int num_routers = 0;
static int num_servers = 0;

/* gid distance added when computing a server's peer; main() lowers it to 1
 * for dragonfly and slimfly networks */
static int offset = 2;

/* whether to pull instead of push */
static int do_pull = 0;

/* per-repetition LP counts, filled in by handle_kickoff_event() */
static int num_routers_per_rep = 0;
static int num_servers_per_rep = 0;
static int lps_per_rep = 0;

42 43 44
typedef struct svr_msg svr_msg;
typedef struct svr_state svr_state;

45 46
char router_name[MAX_NAME_LENGTH];

/* types of events handled by the server LPs */
enum svr_event
{
    KICKOFF,    /* initial event */
    REQ,        /* request event */
    ACK,        /* ack event */
    LOCAL       /* local event */
};

/* per-server LP state; zeroed by svr_init() */
struct svr_state
{
    int msg_sent_count;    /* requests sent */
    int msg_recvd_count;   /* requests recvd */
    int local_recvd_count; /* number of local messages received */
    tw_stime start_ts;     /* time that we started sending requests */
    tw_stime end_ts;       /* time that we ended sending requests */
};

struct svr_msg
{
    enum svr_event svr_event_type;
//    enum net_event net_event_type; 
    tw_lpid src;          /* source of this request or ack */

71 72 73
    // rc for modelnet calls
    model_net_event_return ret;

74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
    int incremented_flag; /* helper for reverse computation */
};

/* callbacks of the server LP type, referenced from the svr_lp table */
static void svr_init(svr_state * ns, tw_lp * lp);
static void svr_event(svr_state * ns, tw_bf * b, svr_msg * m, tw_lp * lp);
static void svr_rev_event(svr_state * ns, tw_bf * b, svr_msg * m, tw_lp * lp);
static void svr_finalize(svr_state * ns, tw_lp * lp);

tw_lptype svr_lp = {
95 96 97 98 99 100 101
    (init_f) svr_init,
    (pre_run_f) NULL,
    (event_f) svr_event,
    (revent_f) svr_rev_event,
    (final_f)  svr_finalize, 
    (map_f) codes_mapping,
    sizeof(svr_state),
102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
};

/* LP type lookup and registration */
extern const tw_lptype* svr_get_lp_type();
static void svr_add_lp_type();

/* time unit conversions */
static tw_stime ns_to_s(tw_stime ns);
static tw_stime s_to_ns(tw_stime ns);

/* forward event handlers, dispatched from svr_event() */
static void handle_kickoff_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp);
static void handle_ack_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp);
static void handle_req_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp);
static void handle_local_event(svr_state * ns);

/* reverse event handlers, dispatched from svr_rev_event() */
static void handle_local_rev_event(svr_state * ns);
static void handle_kickoff_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp);
static void handle_ack_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp);
static void handle_req_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp);

/* option table passed to tw_opt_add() in main(); it holds only a group
 * heading followed by the end marker */
const tw_optdef app_opt [] =
{
    TWOPT_GROUP("Model net test case" ),
    TWOPT_END()
};

141 142 143 144 145 146
int main(
    int argc,
    char **argv)
{
    int nprocs;
    int rank;
147 148
    int num_nets;
    int *net_ids;
149 150
    //printf("\n Config count %d ",(int) config.lpgroups_count);
    g_tw_ts_end = s_to_ns(60*60*24*365); /* one year, in nsecs */
151
    lp_io_handle handle;
152 153 154 155

    tw_opt_add(app_opt);
    tw_init(&argc, &argv);

156
    if(argc < 2)
157
    {
158
	    printf("\n Usage: mpirun <args> --sync=2/3 mapping_file_name.conf (optional --nkp) ");
159
	    MPI_Finalize();
160
	    return 0;
161 162 163 164
    }
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  
165
    configuration_load(argv[2], MPI_COMM_WORLD, &config);
166 167

    model_net_register();
168
    svr_add_lp_type();
169 170 171
    
    codes_mapping_setup();
    
172
    net_ids = model_net_configure(&num_nets);
173
    assert(num_nets>=1);
174 175 176
    net_id = *net_ids;
    free(net_ids);

177 178
    num_servers = codes_mapping_get_lp_count("MODELNET_GRP", 0, "server",
            NULL, 1);
179
    
180
    if(net_id == DRAGONFLY)
181 182 183 184 185 186 187 188 189 190
    {
      strcpy(router_name, "modelnet_dragonfly_router");
    }
    
    if(net_id == SLIMFLY)
    {
      strcpy(router_name, "modelnet_slimfly_router");
    }

    if(net_id == SLIMFLY || net_id == DRAGONFLY)
191
    {
192
	  num_routers = codes_mapping_get_lp_count("MODELNET_GRP", 0,
193
                  router_name, NULL, 1); 
194
	  offset = 1;
195
    }
196

197 198 199 200 201
    if(lp_io_prepare("modelnet-test", LP_IO_UNIQ_SUFFIX, &handle, MPI_COMM_WORLD) < 0)
    {
        return(-1);
    }

202
    tw_run();
203 204
    model_net_report_stats(net_id);

205 206 207 208 209
    if(lp_io_flush(handle, MPI_COMM_WORLD) < 0)
    {
        return(-1);
    }

210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
    tw_end();
    return 0;
}

/* return the address of the server LP type table (svr_lp) */
const tw_lptype* svr_get_lp_type()
{
    const tw_lptype *server_type = &svr_lp;

    return server_type;
}

static void svr_add_lp_type()
{
  lp_type_register("server", svr_get_lp_type());
}

/*
 * LP init callback: zeroes the server state and schedules a KICKOFF event
 * to this LP itself, which starts the real simulation.
 *
 * ns: state of this server LP
 * lp: the LP being initialized
 */
static void svr_init(
    svr_state * ns,
    tw_lp * lp)
{
    tw_stime kickoff_time;
    tw_event *e;
    svr_msg *m;

    memset(ns, 0, sizeof(*ns));

    /* skew each kickoff event slightly to help avoid event ties later on */
    kickoff_time = g_tw_lookahead + tw_rand_unif(lp->rng);

    /* each server sends a dummy event to itself that will kick off the
     * real simulation */
    e = tw_event_new(lp->gid, kickoff_time, lp);
    m = tw_event_data(e);
    m->svr_event_type = KICKOFF;
    tw_event_send(e);

    return;
}

/*
 * Forward event callback: dispatches on the message's event type.
 *
 * ns: state of this server LP
 * b:  bitfield supplied by the caller (unused)
 * m:  the event message
 * lp: the LP receiving the event
 *
 * An unknown event type is reported and then trips an assertion.
 */
static void svr_event(
    svr_state * ns,
    tw_bf * b,
    svr_msg * m,
    tw_lp * lp)
{
    (void)b;

    switch (m->svr_event_type)
    {
        case REQ:
            handle_req_event(ns, m, lp);
            break;
        case ACK:
            handle_ack_event(ns, m, lp);
            break;
        case KICKOFF:
            handle_kickoff_event(ns, m, lp);
            break;
        case LOCAL:
            handle_local_event(ns);
            break;
        default:
            printf("\n Invalid message type %d ", m->svr_event_type);
            assert(0);
            break;
    }
}

/*
 * Reverse event callback: dispatches to the reverse handler matching the
 * message's event type.
 *
 * ns: state of this server LP
 * b:  bitfield supplied by the caller (unused)
 * m:  the event message being rolled back
 * lp: the LP that processed the event
 *
 * An unknown event type trips an assertion.
 */
static void svr_rev_event(
    svr_state * ns,
    tw_bf * b,
    svr_msg * m,
    tw_lp * lp)
{
    (void)b;

    switch (m->svr_event_type)
    {
        case REQ:
            handle_req_rev_event(ns, m, lp);
            break;
        case ACK:
            handle_ack_rev_event(ns, m, lp);
            break;
        case KICKOFF:
            handle_kickoff_rev_event(ns, m, lp);
            break;
        case LOCAL:
            handle_local_rev_event(ns);
            break;
        default:
            assert(0);
            break;
    }

    return;
}

/*
 * LP finalize callback: prints the per-server summary line.
 *
 * The byte count is PAYLOAD_SZ times the number of requests received,
 * while the MiB/s figure is derived from the fixed PAYLOAD_SZ*NUM_REQS
 * volume divided by the time between start_ts and end_ts.
 *
 * ns: state of this server LP
 * lp: the LP being finalized
 */
static void svr_finalize(
    svr_state * ns,
    tw_lp * lp)
{
    tw_stime elapsed_s = ns_to_s(ns->end_ts-ns->start_ts);

    printf("server %llu recvd %d bytes in %f seconds, %f MiB/s sent_count %d recvd_count %d local_count %d \n",
        (unsigned long long)lp->gid,
        PAYLOAD_SZ*ns->msg_recvd_count,
        elapsed_s,
        ((double)(PAYLOAD_SZ*NUM_REQS)/(double)(1024*1024)/elapsed_s),
        ns->msg_sent_count,
        ns->msg_recvd_count,
        ns->local_recvd_count);
    return;
}

/* convert a time given in nanoseconds to seconds */
static tw_stime ns_to_s(tw_stime ns)
{
    const double ns_per_s = 1000.0 * 1000.0 * 1000.0;

    return ns / ns_per_s;
}

/* convert a time given in seconds to nanoseconds; despite its name, the
 * parameter holds the value in seconds */
static tw_stime s_to_ns(tw_stime ns)
{
    const double ns_per_s = 1000.0 * 1000.0 * 1000.0;

    return ns * ns_per_s;
}

/*
 * handle initial event: record the start time, refresh the per-repetition
 * LP counts and send the first request (or pull) to the peer server.
 *
 * ns: state of this server LP
 * m:  the kickoff message; m->ret receives the model-net return value so
 *     that handle_kickoff_rev_event() can undo the send
 * lp: the LP handling the event
 */
static void handle_kickoff_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    /* the payloads live on the stack instead of in never-freed heap
     * buffers; this assumes model-net copies them while the call is in
     * progress -- NOTE(review): confirm against model-net.h */
    svr_msg m_local;
    svr_msg m_remote;
    int opt_offset = 0;
    int total_lps;
    tw_lpid dest_id;

    memset(&m_local, 0, sizeof(m_local));
    m_local.svr_event_type = LOCAL;
    m_local.src = lp->gid;

    memcpy(&m_remote, &m_local, sizeof(m_remote));
    m_remote.svr_event_type = (do_pull) ? ACK : REQ;

    /* record when transfers started on this server */
    ns->start_ts = tw_now(lp);

    num_servers_per_rep = codes_mapping_get_lp_count("MODELNET_GRP", 1,
            "server", NULL, 1);
    num_routers_per_rep = codes_mapping_get_lp_count("MODELNET_GRP", 1,
            router_name, NULL, 1);

    lps_per_rep = num_servers_per_rep * 2 + num_routers_per_rep;

    total_lps = num_servers * 2 + num_routers;

    if(net_id == DRAGONFLY && (lp->gid % lps_per_rep == num_servers_per_rep - 1))
        opt_offset = num_servers_per_rep + num_routers_per_rep; /* optional offset due to dragonfly mapping */

    /* each server sends a request to the next highest server; dest_id is
     * kept as tw_lpid so the gid is not narrowed to int */
    dest_id = (lp->gid + offset + opt_offset)%total_lps;
    if (do_pull){
        m->ret = model_net_pull_event(net_id, "test", dest_id, PAYLOAD_SZ, 0.0,
                sizeof(svr_msg), (const void*)&m_remote, lp);
    }
    else{
        m->ret = model_net_event(net_id, "test", dest_id, PAYLOAD_SZ, 0.0,
                sizeof(svr_msg), (const void*)&m_remote,
                sizeof(svr_msg), (const void*)&m_local, lp);
    }
    ns->msg_sent_count++;
}

373
static void handle_local_event(svr_state * ns)
374 375 376 377
{
    ns->local_recvd_count++;
}

378
static void handle_local_rev_event(svr_state * ns)
379 380 381 382 383 384 385 386 387 388
{
   ns->local_recvd_count--;
}
/*
 * reverse handler for req event: undo the received-request count and roll
 * back the model-net call recorded in m->ret.
 */
static void handle_req_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    ns->msg_recvd_count--;
    model_net_event_rc2(lp, &m->ret);

    return;
}


/*
 * reverse handler for kickoff: undo the sent-request count and roll back
 * the model-net call recorded in m->ret.  The same rollback call is used
 * whether the kickoff pushed or pulled.
 */
static void handle_kickoff_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    ns->msg_sent_count--;
    model_net_event_rc2(lp, &m->ret);

    return;
}

/*
 * reverse handler for ack: if the forward handler sent another request
 * (incremented_flag set), roll back that model-net call and the sent
 * count.
 */
static void handle_ack_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    if(m->incremented_flag)
    {
        model_net_event_rc2(lp, &m->ret);
        ns->msg_sent_count--;
    }
    // don't worry about resetting end_ts - just let the ack
    // event bulldoze it
    return;
}

/*
 * handle receiving an ack: check that it came from the expected LP, then
 * either send the next request (while fewer than NUM_REQS have been sent)
 * or record the end time.
 *
 * ns: state of this server LP
 * m:  the ack message; m->ret and m->incremented_flag are written for the
 *     benefit of handle_ack_rev_event()
 * lp: the LP handling the event
 *
 * Relies on lps_per_rep, num_servers_per_rep and num_routers_per_rep as
 * set by handle_kickoff_event().
 */
static void handle_ack_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    /* the payloads live on the stack instead of in never-freed heap
     * buffers; this assumes model-net copies them while the call is in
     * progress -- NOTE(review): confirm against model-net.h */
    svr_msg m_local;
    svr_msg m_remote;
    int opt_offset = 0;
    tw_lpid dest_id;

    memset(&m_local, 0, sizeof(m_local));
    m_local.svr_event_type = LOCAL;
    m_local.src = lp->gid;

    memcpy(&m_remote, &m_local, sizeof(m_remote));
    m_remote.svr_event_type = (do_pull) ? ACK : REQ;

    if(net_id == DRAGONFLY && (lp->gid % lps_per_rep == num_servers_per_rep - 1))
        opt_offset = num_servers_per_rep + num_routers_per_rep; /* optional offset due to dragonfly mapping */

    dest_id = (lp->gid + offset + opt_offset)%(num_servers*2 + num_routers);

    /* safety check that this ack came from the right server;
     * in the "pull" case, src should actually be self */
    if (do_pull){
        assert(m->src == lp->gid);
    }
    else{
        assert(m->src == dest_id);
    }

    if(ns->msg_sent_count < NUM_REQS)
    {
        /* send another request */
        if (do_pull){
            m->ret = model_net_pull_event(net_id, "test", dest_id, PAYLOAD_SZ, 0.0,
                    sizeof(svr_msg), (const void*)&m_remote, lp);
        }
        else{
            m->ret = model_net_event(net_id, "test", dest_id, PAYLOAD_SZ, 0.0,
                    sizeof(svr_msg), (const void*)&m_remote,
                    sizeof(svr_msg), (const void*)&m_local, lp);
        }
        ns->msg_sent_count++;
        m->incremented_flag = 1;
    }
    else
    {
        /* all requests have been sent: record when the transfers ended */
        ns->end_ts = tw_now(lp);
        m->incremented_flag = 0;
    }

    return;
}

485 486 487
/* handle receiving request 
 * (note: this should never be called when doing the "pulling" version of
 * the program) */
488 489 490 491 492
static void handle_req_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
493
    assert(!do_pull);
494 495 496 497 498 499 500 501
    svr_msg * m_local = malloc(sizeof(svr_msg));
    svr_msg * m_remote = malloc(sizeof(svr_msg));

    m_local->svr_event_type = LOCAL;
    m_local->src = lp->gid;

    memcpy(m_remote, m_local, sizeof(svr_msg));
    m_remote->svr_event_type = ACK;
502
    //printf("handle_req_event(), lp %llu src %llu .\n", (unsigned long long)lp->gid, (unsigned long long) m->src);
503 504

    /* safety check that this request got to the right server */
505
//    printf("\n m->src %d lp->gid %d ", m->src, lp->gid);
506
    int opt_offset = 0;
507 508 509
    if(net_id == DRAGONFLY && (m->src % lps_per_rep == num_servers_per_rep - 1))
          opt_offset = num_servers_per_rep + num_routers_per_rep; /* optional offset due to dragonfly mapping */       
 
510
    assert(lp->gid == (m->src + offset + opt_offset)%(num_servers*2 + num_routers));
511 512 513 514 515 516 517 518
    ns->msg_recvd_count++;

    /* send ack back */
    /* simulated payload of 1 MiB */
    /* also trigger a local event for completion of payload msg */
    /* remote host will get an ack event */
   
   // mm Q: What should be the size of an ack message? may be a few bytes? or larger..? 
519
    m->ret = model_net_event(net_id, "test", m->src, PAYLOAD_SZ, 0.0, sizeof(svr_msg), (const void*)m_remote, sizeof(svr_msg), (const void*)m_local, lp);
520 521 522 523 524 525 526 527 528
//    printf("\n Sending ack to LP %d %d ", m->src, m_remote->src);
    return;
}

/*
 * Local variables:
 *  c-indent-level: 4
 *  c-basic-offset: 4
 * End:
 *
 * vim: ft=c ts=8 sts=4 sw=4 expandtab
 */