/*
 * Copyright (C) 2013 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 *
 */

/* SUMMARY:
 *
 * This is a test harness for the modelnet module.  It sets up a number of
 * servers, each of which is paired up with a simplenet LP to serve as the
 * NIC.  Each server exchanges a sequence of requests and acks with one peer
 * and measures the throughput in terms of payload bytes (ack size) moved
 * per second.
 */

#include <string.h>
#include <assert.h>
#include <ross.h>

#include "codes/model-net.h"
#include "codes/lp-io.h"
#include "codes/codes.h"
#include "codes/codes_mapping.h"
#include "codes/configuration.h"
#include "codes/lp-type-lookup.h"

/* Number of requests each server issues to its peer (kickoff included). */
#define NUM_REQS 2
/* Size of the simulated data payload, in bytes. */
#define PAYLOAD_SZ 4096

/* Network id taken from the array returned by model_net_configure() in main(). */
static int net_id = 0;
/* Router LP count; main() only queries it when net_id is DRAGONFLY. */
static int num_routers = 0;
/* Server LP count queried from the "MODELNET_GRP" group in main(). */
static int num_servers = 0;
/* Added to a server's gid to select its peer; main() sets it to 1 for DRAGONFLY. */
static int offset = 2;

/* whether to pull instead of push */
static int do_pull = 0;

/* Per-repetition LP counts, refreshed by handle_kickoff_event(). */
static int num_routers_per_rep = 0;
static int num_servers_per_rep = 0;
static int lps_per_rep = 0;

typedef struct svr_msg svr_msg;
typedef struct svr_state svr_state;

/* Kinds of events a server LP can receive; stored in svr_msg.svr_event_type. */
enum svr_event
{
    KICKOFF = 0,  /* self-scheduled event that starts the exchange */
    REQ     = 1,  /* request arriving from a peer */
    ACK     = 2,  /* acknowledgement arriving from a peer */
    LOCAL   = 3   /* local completion notification */
};

/* Per-server LP state; zeroed by svr_init() and reported by svr_finalize(). */
struct svr_state
{
    int msg_sent_count;   /* requests sent */
    int msg_recvd_count;  /* requests recvd */
    int local_recvd_count; /* number of local messages received */
    tw_stime start_ts;    /* time that we started sending requests */
    tw_stime end_ts;      /* time that we ended sending requests */
};

struct svr_msg
{
    enum svr_event svr_event_type;
//    enum net_event net_event_type; 
    tw_lpid src;          /* source of this request or ack */

69 70 71
    // rc for modelnet calls
    model_net_event_return ret;

72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
    int incremented_flag; /* helper for reverse computation */
};

/* ROSS LP callbacks for the server LP type (init, forward, reverse, final). */
static void svr_init(svr_state * ns, tw_lp * lp);
static void svr_event(svr_state * ns, tw_bf * b, svr_msg * m, tw_lp * lp);
static void svr_rev_event(svr_state * ns, tw_bf * b, svr_msg * m, tw_lp * lp);
static void svr_finalize(svr_state * ns, tw_lp * lp);

tw_lptype svr_lp = {
93 94 95 96 97 98 99
    (init_f) svr_init,
    (pre_run_f) NULL,
    (event_f) svr_event,
    (revent_f) svr_rev_event,
    (final_f)  svr_finalize, 
    (map_f) codes_mapping,
    sizeof(svr_state),
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
};

/* LP type registration helpers. */
extern const tw_lptype* svr_get_lp_type(void);
static void svr_add_lp_type(void);

/* Time unit conversions between nanoseconds and seconds. */
static tw_stime ns_to_s(tw_stime ns);
static tw_stime s_to_ns(tw_stime ns);

/* Forward event handlers, one per svr_event kind. */
static void handle_kickoff_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp);
static void handle_ack_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp);
static void handle_req_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp);
static void handle_local_event(svr_state * ns);

/* Reverse event handlers, undoing the matching forward handler. */
static void handle_local_rev_event(svr_state * ns);
static void handle_kickoff_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp);
static void handle_ack_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp);
static void handle_req_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp);

/* Option table passed to tw_opt_add() in main(); it only declares a group heading. */
const tw_optdef app_opt[] = {
    TWOPT_GROUP("Model net test case"),
    TWOPT_END()
};

139 140 141 142 143 144
int main(
    int argc,
    char **argv)
{
    int nprocs;
    int rank;
145 146
    int num_nets;
    int *net_ids;
147 148
    //printf("\n Config count %d ",(int) config.lpgroups_count);
    g_tw_ts_end = s_to_ns(60*60*24*365); /* one year, in nsecs */
149
    lp_io_handle handle;
150 151 152 153

    tw_opt_add(app_opt);
    tw_init(&argc, &argv);

154
    if(argc < 2)
155
    {
156
	    printf("\n Usage: mpirun <args> --sync=2/3 mapping_file_name.conf (optional --nkp) ");
157
	    MPI_Finalize();
158
	    return 0;
159 160 161 162
    }
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  
163
    configuration_load(argv[2], MPI_COMM_WORLD, &config);
164 165

    model_net_register();
166
    svr_add_lp_type();
167 168 169
    
    codes_mapping_setup();
    
170 171 172 173 174
    net_ids = model_net_configure(&num_nets);
    assert(num_nets==1);
    net_id = *net_ids;
    free(net_ids);

175 176
    num_servers = codes_mapping_get_lp_count("MODELNET_GRP", 0, "server",
            NULL, 1);
177
    if(net_id == DRAGONFLY)
178
    {
179 180
	  num_routers = codes_mapping_get_lp_count("MODELNET_GRP", 0,
                  "dragonfly_router", NULL, 1); 
181
	  offset = 1;
182
    }
183

184 185 186 187 188
    if(lp_io_prepare("modelnet-test", LP_IO_UNIQ_SUFFIX, &handle, MPI_COMM_WORLD) < 0)
    {
        return(-1);
    }

189
    tw_run();
190 191
    model_net_report_stats(net_id);

192 193 194 195 196
    if(lp_io_flush(handle, MPI_COMM_WORLD) < 0)
    {
        return(-1);
    }

197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
    tw_end();
    return 0;
}

/* Returns the address of the server LP type descriptor, svr_lp. */
const tw_lptype* svr_get_lp_type()
{
    const tw_lptype *server_type = &svr_lp;

    return server_type;
}

static void svr_add_lp_type()
{
  lp_type_register("server", svr_get_lp_type());
}

/*
 * LP init callback: clears the server state and schedules a KICKOFF event
 * to this same LP, which starts the request/ack exchange.
 *
 * ns: state of this server LP
 * lp: the LP being initialized
 */
static void svr_init(
    svr_state * ns,
    tw_lp * lp)
{
    tw_event *e;
    svr_msg *m;
    tw_stime kickoff_time;

    memset(ns, 0, sizeof(*ns));

    /* each server sends a dummy event to itself that will kick off the real
     * simulation
     */

    /* skew each kickoff event slightly to help avoid event ties later on */
    kickoff_time = g_tw_lookahead + tw_rand_unif(lp->rng);

    e = tw_event_new(lp->gid, kickoff_time, lp);
    m = tw_event_data(e);
    m->svr_event_type = KICKOFF;
    tw_event_send(e);

    return;
}

/*
 * Forward event callback: dispatches on the message's event type to the
 * matching handler. An unknown type is reported and trips an assertion.
 *
 * ns: state of this server LP
 * b:  bitfield supplied by the simulator (unused here)
 * m:  the event message
 * lp: the LP receiving the event
 */
static void svr_event(
    svr_state * ns,
    tw_bf * b,
    svr_msg * m,
    tw_lp * lp)
{
    (void)b;

    switch (m->svr_event_type)
    {
        case REQ:
            handle_req_event(ns, m, lp);
            break;
        case ACK:
            handle_ack_event(ns, m, lp);
            break;
        case KICKOFF:
            handle_kickoff_event(ns, m, lp);
            break;
        case LOCAL:
            handle_local_event(ns);
            break;
        default:
            printf("\n Invalid message type %d ", m->svr_event_type);
            assert(0);
            break;
    }
}

/*
 * Reverse event callback: dispatches on the message's event type to the
 * reverse handler that undoes the matching forward handler. An unknown
 * type trips an assertion.
 *
 * ns: state of this server LP
 * b:  bitfield supplied by the simulator (unused here)
 * m:  the event message being rolled back
 * lp: the LP that processed the event
 */
static void svr_rev_event(
    svr_state * ns,
    tw_bf * b,
    svr_msg * m,
    tw_lp * lp)
{
    (void)b;

    switch (m->svr_event_type)
    {
        case REQ:
            handle_req_rev_event(ns, m, lp);
            break;
        case ACK:
            handle_ack_rev_event(ns, m, lp);
            break;
        case KICKOFF:
            handle_kickoff_rev_event(ns, m, lp);
            break;
        case LOCAL:
            handle_local_rev_event(ns);
            break;
        default:
            assert(0);
            break;
    }

    return;
}

/*
 * LP finalize callback: prints this server's counters, the elapsed time
 * between start_ts and end_ts, and a throughput figure.
 *
 * The byte count printed comes from the counted received requests, while
 * the MiB/s figure is computed from the configured PAYLOAD_SZ * NUM_REQS.
 *
 * ns: state of this server LP
 * lp: the LP being finalized
 */
static void svr_finalize(
    svr_state * ns,
    tw_lp * lp)
{
    tw_stime elapsed_s = ns_to_s(ns->end_ts-ns->start_ts);
    int recvd_bytes = PAYLOAD_SZ*ns->msg_recvd_count;
    double mib_per_s =
        (double)(PAYLOAD_SZ*NUM_REQS)/(double)(1024*1024)/elapsed_s;

    printf("server %llu recvd %d bytes in %f seconds, %f MiB/s sent_count %d recvd_count %d local_count %d \n",
        (unsigned long long)lp->gid, recvd_bytes, elapsed_s, mib_per_s,
        ns->msg_sent_count, ns->msg_recvd_count, ns->local_recvd_count);
    return;
}

/* Converts a time in nanoseconds to seconds. */
static tw_stime ns_to_s(tw_stime ns)
{
    const double ns_per_s = 1000.0 * 1000.0 * 1000.0;

    return ns / ns_per_s;
}

/* Converts a time in seconds to nanoseconds (the parameter, despite its
 * name, holds seconds). */
static tw_stime s_to_ns(tw_stime ns)
{
    const double ns_per_s = 1000.0 * 1000.0 * 1000.0;

    return ns * ns_per_s;
}

/*
 * Handles the initial KICKOFF event: records the start time, refreshes the
 * per-repetition LP counts, and sends the first request (or pull) to the
 * peer server.
 *
 * ns: state of this server LP
 * m:  the kickoff message; m->ret receives the model-net return value so
 *     the reverse handler can undo the send
 * lp: the LP handling the event
 */
static void handle_kickoff_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    /* Messages live on the stack: the previous heap copies were never freed.
     * NOTE(review): this relies on model-net copying the message bytes
     * during the call -- confirm against codes/model-net.h. */
    svr_msg m_local;
    svr_msg m_remote;

    /* zero first so no indeterminate bytes end up in the event payload */
    memset(&m_local, 0, sizeof(m_local));
    m_local.svr_event_type = LOCAL;
    m_local.src = lp->gid;

    m_remote = m_local;
    m_remote.svr_event_type = (do_pull) ? ACK : REQ;

    /* record when transfers started on this server */
    ns->start_ts = tw_now(lp);

    num_servers_per_rep = codes_mapping_get_lp_count("MODELNET_GRP", 1,
            "server", NULL, 1);
    num_routers_per_rep = codes_mapping_get_lp_count("MODELNET_GRP", 1,
            "dragonfly_router", NULL, 1);

    lps_per_rep = num_servers_per_rep * 2 + num_routers_per_rep;

    int opt_offset = 0;
    int total_lps = num_servers * 2 + num_routers;

    if(net_id == DRAGONFLY && (lp->gid % lps_per_rep == num_servers_per_rep - 1))
        opt_offset = num_servers_per_rep + num_routers_per_rep; /* optional offset due to dragonfly mapping */

    /* each server sends a request to the next highest server */
    tw_lpid dest_id = (lp->gid + offset + opt_offset)%total_lps;
    if (do_pull){
        m->ret = model_net_pull_event(net_id, "test", dest_id, PAYLOAD_SZ, 0.0,
                sizeof(svr_msg), (const void*)&m_remote, lp);
    }
    else{
        m->ret = model_net_event(net_id, "test", dest_id, PAYLOAD_SZ, 0.0,
                sizeof(svr_msg), (const void*)&m_remote,
                sizeof(svr_msg), (const void*)&m_local, lp);
    }
    ns->msg_sent_count++;
}

/* Handles a LOCAL event by counting it. */
static void handle_local_event(svr_state * ns)
{
    ns->local_recvd_count++;
}

/* Reverse handler for a LOCAL event: undoes the count increment. */
static void handle_local_rev_event(svr_state * ns)
{
    ns->local_recvd_count--;
}
/*
 * Reverse handler for a REQ event: undoes the received-count increment and
 * rolls back the model-net call whose return value was saved in m->ret.
 */
static void handle_req_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    ns->msg_recvd_count--;
    model_net_event_rc2(lp, &m->ret);

    return;
}


/*
 * Reverse handler for a KICKOFF event: undoes the sent-count increment and
 * rolls back the model-net call whose return value was saved in m->ret.
 * The same rollback call is used for both the push and the pull variant.
 */
static void handle_kickoff_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    ns->msg_sent_count--;
    model_net_event_rc2(lp, &m->ret);

    return;
}

/*
 * Reverse handler for an ACK event. If the forward handler sent another
 * request (recorded in m->incremented_flag), roll back that model-net call
 * and the sent-count increment.
 */
static void handle_ack_rev_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    if(m->incremented_flag)
    {
        model_net_event_rc2(lp, &m->ret);
        ns->msg_sent_count--;
    }
    // don't worry about resetting end_ts - just let the ack
    // event bulldoze it
    return;
}

/*
 * Handles an ACK event. Checks that the ack came from the expected source,
 * then either sends the next request (while fewer than NUM_REQS have been
 * sent) or records the end time of the exchange.
 *
 * ns: state of this server LP
 * m:  the ack message; m->ret and m->incremented_flag are written so the
 *     reverse handler can undo this event
 * lp: the LP handling the event
 */
static void handle_ack_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
    /* Messages live on the stack: the previous heap copies were never freed,
     * including on the path below that sends nothing at all.
     * NOTE(review): this relies on model-net copying the message bytes
     * during the call -- confirm against codes/model-net.h. */
    svr_msg m_local;
    svr_msg m_remote;

    /* zero first so no indeterminate bytes end up in the event payload */
    memset(&m_local, 0, sizeof(m_local));
    m_local.svr_event_type = LOCAL;
    m_local.src = lp->gid;

    m_remote = m_local;
    m_remote.svr_event_type = (do_pull) ? ACK : REQ;

    int opt_offset = 0;

    if(net_id == DRAGONFLY && (lp->gid % lps_per_rep == num_servers_per_rep - 1))
        opt_offset = num_servers_per_rep + num_routers_per_rep; /* optional offset due to dragonfly mapping */

    tw_lpid dest_id = (lp->gid + offset + opt_offset)%(num_servers*2 + num_routers);

    /* safety check that this ack came from the expected server;
     * in the "pull" case, src should actually be self */
    if (do_pull){
        assert(m->src == lp->gid);
    }
    else{
        assert(m->src == dest_id);
    }

    if(ns->msg_sent_count < NUM_REQS)
    {
        /* send another request */
        if (do_pull){
            m->ret = model_net_pull_event(net_id, "test", dest_id, PAYLOAD_SZ, 0.0,
                    sizeof(svr_msg), (const void*)&m_remote, lp);
        }
        else{
            m->ret = model_net_event(net_id, "test", dest_id, PAYLOAD_SZ, 0.0,
                    sizeof(svr_msg), (const void*)&m_remote,
                    sizeof(svr_msg), (const void*)&m_local, lp);
        }
        ns->msg_sent_count++;
        m->incremented_flag = 1;
    }
    else
    {
        /* all requests sent: record when the exchange finished */
        ns->end_ts = tw_now(lp);
        m->incremented_flag = 0;
    }

    return;
}

/* handle receiving request 
 * (note: this should never be called when doing the "pulling" version of
 * the program) */
475 476 477 478 479
static void handle_req_event(
    svr_state * ns,
    svr_msg * m,
    tw_lp * lp)
{
480
    assert(!do_pull);
481 482 483 484 485 486 487 488
    svr_msg * m_local = malloc(sizeof(svr_msg));
    svr_msg * m_remote = malloc(sizeof(svr_msg));

    m_local->svr_event_type = LOCAL;
    m_local->src = lp->gid;

    memcpy(m_remote, m_local, sizeof(svr_msg));
    m_remote->svr_event_type = ACK;
489
    //printf("handle_req_event(), lp %llu src %llu .\n", (unsigned long long)lp->gid, (unsigned long long) m->src);
490 491

    /* safety check that this request got to the right server */
492
//    printf("\n m->src %d lp->gid %d ", m->src, lp->gid);
493
    int opt_offset = 0;
494 495 496
    if(net_id == DRAGONFLY && (m->src % lps_per_rep == num_servers_per_rep - 1))
          opt_offset = num_servers_per_rep + num_routers_per_rep; /* optional offset due to dragonfly mapping */       
 
497
    assert(lp->gid == (m->src + offset + opt_offset)%(num_servers*2 + num_routers));
498 499 500 501 502 503 504 505
    ns->msg_recvd_count++;

    /* send ack back */
    /* simulated payload of 1 MiB */
    /* also trigger a local event for completion of payload msg */
    /* remote host will get an ack event */
   
   // mm Q: What should be the size of an ack message? may be a few bytes? or larger..? 
506
    m->ret = model_net_event(net_id, "test", m->src, PAYLOAD_SZ, 0.0, sizeof(svr_msg), (const void*)m_remote, sizeof(svr_msg), (const void*)m_local, lp);
507 508 509 510 511 512 513 514 515
//    printf("\n Sending ack to LP %d %d ", m->src, m_remote->src);
    return;
}

/*
 * Local variables:
 *  c-indent-level: 4
 *  c-basic-offset: 4
 * End:
 *
 * vim: ft=c ts=8 sts=4 sw=4 expandtab
 */