
\documentclass[conference,10pt,compsocconf,onecolumn]{IEEEtran}
%\documentclass{acm_proc_article-sp}
%\documentclass{sig-alternate}
% Add the compsoc option for Computer Society conferences.
%
% If IEEEtran.cls has not been installed into the LaTeX system files,
% manually specify the path to it like:
% \documentclass[conference]{../sty/IEEEtran}

% *** CITATION PACKAGES ***
%
\usepackage{cite}
% cite.sty was written by Donald Arseneau
% V1.6 and later of IEEEtran pre-defines the format of the cite.sty package
% \cite{} output to follow that of IEEE. Loading the cite package will
% result in citation numbers being automatically sorted and properly
% "compressed/ranged". e.g., [1], [9], [2], [7], [5], [6] without using
% cite.sty will become [1], [2], [5]--[7], [9] using cite.sty. cite.sty's
% \cite will automatically add leading space, if needed. Use cite.sty's
% noadjust option (cite.sty V3.8 and later) if you want to turn this off.
% cite.sty is already installed on most LaTeX systems. Be sure and use
% version 4.0 (2003-05-27) and later if using hyperref.sty. cite.sty does
% not currently provide for hyperlinked citations.
% The latest version can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/cite/
% The documentation is contained in the cite.sty file itself.
%
\usepackage{setspace}
\usepackage{wrapfig}
\usepackage{color}
\usepackage{listing}
\usepackage{listings}

\lstset{ %
frame=single,
language=C,
captionpos=b,
columns=fullflexible,
morekeywords={aesop,pwait,pbranch,pbreak},
numbers=left,
basicstyle=\scriptsize\ttfamily,
breaklines=true,
framexleftmargin=0em,
boxpos=c,
resetmargins=true,
xleftmargin=6ex
%basicstyle=\footnotesize
}


\usepackage[pdftex]{graphicx}
%\usepackage{graphicx}
% declare the path(s) where your graphic files are
\graphicspath{{./}{data/beagle/}}
% and their extensions so you won't have to specify these with
% every instance of \includegraphics
\DeclareGraphicsExtensions{.pdf,.jpeg,.png}

% *** GRAPHICS RELATED PACKAGES ***
%
\ifCLASSINFOpdf
 % \usepackage[pdftex]{graphicx}
 % declare the path(s) where your graphic files are
 % \graphicspath{{../pdf/}{../jpeg/}}
 % and their extensions so you won't have to specify these with
 % every instance of \includegraphics
 % \DeclareGraphicsExtensions{.pdf,.jpeg,.png}
\else
 % or other class option (dvipsone, dvipdf, if not using dvips). graphicx
 % will default to the driver specified in the system graphics.cfg if no
 % driver is specified.
 % \usepackage[dvips]{graphicx}
 % declare the path(s) where your graphic files are
 % \graphicspath{{../eps/}}
 % and their extensions so you won't have to specify these with
 % every instance of \includegraphics
 % \DeclareGraphicsExtensions{.eps}
\fi

% *** ALIGNMENT PACKAGES ***
%
%\usepackage{array}
% Frank Mittelbach's and David Carlisle's array.sty patches and improves
% the standard LaTeX2e array and tabular environments to provide better
% appearance and additional user controls. As the default LaTeX2e table
% generation code is lacking to the point of almost being broken with
% respect to the quality of the end results, all users are strongly
% advised to use an enhanced (at the very least that provided by array.sty)
% set of table tools. array.sty is already installed on most systems. The
% latest version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/required/tools/

%\usepackage{eqparbox}
% Also of notable interest is Scott Pakin's eqparbox package for creating
% (automatically sized) equal width boxes - aka "natural width parboxes".
% Available at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/eqparbox/

\usepackage{algorithm}
\usepackage[noend]{algorithmic}

% *** SUBFIGURE PACKAGES ***
\usepackage[tight,footnotesize]{subfigure}
% \usepackage{subfigure}
% subfigure.sty was written by Steven Douglas Cochran. This package makes it
% easy to put subfigures in your figures. e.g., "Figure 1a and 1b". For IEEE
% work, it is a good idea to load it with the tight package option to reduce
% the amount of white space around the subfigures. subfigure.sty is already
% installed on most LaTeX systems. The latest version and documentation can
% be obtained at:
% http://www.ctan.org/tex-archive/obsolete/macros/latex/contrib/subfigure/
% subfigure.sty has been superseded by subfig.sty.

%\usepackage[caption=false]{caption}
%\usepackage[font=footnotesize]{subfig}
% subfig.sty, also written by Steven Douglas Cochran, is the modern
% replacement for subfigure.sty. However, subfig.sty requires and
% automatically loads Axel Sommerfeldt's caption.sty which will override
% IEEEtran.cls handling of captions and this will result in nonIEEE style
% figure/table captions. To prevent this problem, be sure and preload
% caption.sty with its "caption=false" package option. This will preserve
% IEEEtran.cls handling of captions. Version 1.3 (2005/06/28) and later
% (recommended due to many improvements over 1.2) of subfig.sty supports
% the caption=false option directly:
%\usepackage[caption=false,font=footnotesize]{subfig}
%
% The latest version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/subfig/
% The latest version and documentation of caption.sty can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/caption/

% *** PDF, URL AND HYPERLINK PACKAGES ***
%
\usepackage{url}
% url.sty was written by Donald Arseneau. It provides better support for
% handling and breaking URLs. url.sty is already installed on most LaTeX
% systems. The latest version can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/misc/
% Read the url.sty source comments for usage information. Basically,
% \url{my_url_here}.

% *** Do not adjust lengths that control margins, column widths, etc. ***
% *** Do not use packages that alter fonts (such as pslatex).         ***
% There should be no need to do such things with IEEEtran.cls V1.6 and later.
% (Unless specifically asked to do so by the journal or conference you plan
% to submit to, of course. )

% correct bad hyphenation here
\hyphenation{op-tical net-works semi-conduc-tor}

\newcommand{\codesmapping}[1]{\texttt{codes\_mapping}}
\newcommand{\codesconfig}[1]{\texttt{codes\_config}}
\newcommand{\codesmodelnet}[1]{\texttt{model\_net}}


\begin{document}
\title{CODES Best Practices}

%\author{\IEEEauthorblockN{Someone\IEEEauthorrefmark{1}} \\
%\IEEEauthorblockA{\IEEEauthorrefmark{1}Somewhere}
%}

%\numberofauthors{6} %  in this sample file, there are a *total*


% use for special paper notices
%\IEEEspecialpapernotice{(Invited Paper)}

% use arabic rather than roman numerals for table references
\renewcommand{\thetable}{\arabic{table}}

% make the title area
\maketitle

\begin{abstract}
This document outlines best practices for developing models in the
CODES/ROSS framework.  The reader should already be familiar with ROSS
and discrete event simulation in general; those topics are covered in the primary
ROSS documentation.
%
The main purpose of this document is to help the reader produce
CODES models in a consistent, modular style so that components can be more
easily shared and reused.  It also includes a few tips to help avoid common
simulation bugs.
\end{abstract}

\section{CODES: modularizing models}

This section covers some of the basic principles of how to organize model
components to be more modular and easier to reuse across CODES models.

\subsection{Units of time}

ROSS does not dictate the units to be used in simulation timestamps.
The \texttt{tw\_stime} type could represent any time unit
(e.g., days, hours, seconds, nanoseconds).  When building CODES
models, however, you should \emph{always treat timestamps as double precision
floating point numbers representing nanoseconds}.

All components within a model must agree on the time units in order to
advance simulation time consistently.  Several common utilities in the
CODES project expect to operate in terms of nanoseconds.
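
As a reminder of the convention, a model can define trivial conversion helpers.
The functions below are purely illustrative and are not part of the CODES API;
\texttt{tw\_stime} is simply a double precision value in ROSS.

\begin{lstlisting}
/* Illustrative helpers only (not a CODES API): all timestamps handed
 * to ROSS are double precision nanoseconds. */
static tw_stime us_to_ns(double microseconds) { return microseconds * 1e3; }
static tw_stime ms_to_ns(double milliseconds) { return milliseconds * 1e6; }
static tw_stime s_to_ns(double seconds)       { return seconds * 1e9; }
\end{lstlisting}
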
\subsection{Organizing models by LP types}

ROSS allows you to use as many different LP types as you would like to
construct your models.  Try to take advantage of this as much as possible by
organizing your simulation so that each component of the system that you are
modeling is implemented within its own LP type.  For example, a storage
system model might use different LPs for hard disks, clients, network
adapters, and servers.  There are multiple reasons for dividing up models
like this:

\begin{itemize}
\item General modularity: makes it easier to pull out particular components
(for example, a disk model) for use in other models.
\item Simplicity: if each LP type is only handling a limited set of
events, then the event structure, state structure, and event handler
functions will all be much smaller and easier to understand.
\item Reverse computation: it makes it easier to implement reverse
computation, not only because the code is simpler, but also because you can
implement and test reverse computation per component rather than having to
apply it to an entire model all at once before testing.
\end{itemize}

It is also important to note that you can divide up models not just by
hardware components, but also by functionality, just as
you would modularize the implementation of a distributed file system.  For
example, a storage daemon might include separate LPs for replication, failure
detection, and reconstruction.  Each of those LPs can share the same network
card and disk resources for accurate modeling of resource usage.  The key
reason for splitting them up is to simplify the model and to encourage
reuse.

One hypothetical downside to splitting up models into multiple LP types is that it
likely means that your model will generate more events than a monolithic model
would have.  Remember, though, that \emph{ROSS is very efficient at generating and
processing events}!  It is usually a premature optimization to replace
events with function calls in cases where you know the necessary
data is available on the local MPI process.  Also recall that any information
exchanged via events automatically shifts the burden of tracking/retaining
event data and event ordering into ROSS rather than your model.  This can help
simplify reverse computation by breaking complex operations into smaller, easier
to understand (and reverse) event units with deterministic ordering.

As a concrete reference, consider the simple storage server example that follows
this section.  It contains multiple LP types: a storage server LP and a network
LP.  The storage server LP initiates data transmission to and reception from a
neighboring storage server LP, and it also keeps track of the amount of data sent
and received in bytes.  The job of data transmission is delegated to the network
LP, which simply transports the data to the destination storage server LP.  The
network LP is unaware of the total amount of data sent by a particular server,
and the storage server LP is likewise unaware of the networking protocol used by
the network LP to transport the messages.

TODO: reference example, for now see how the LPs are organized in the Triton
model.

\subsection{Protecting data structures}

ROSS operates by exchanging events between LPs.  If an LP is sending
an event to another LP of the same type, then in general it can do so
by allocating an event structure (e.g. \texttt{tw\_event\_new()}),
populating the event structure, and transmitting it
(e.g. \texttt{tw\_event\_send()}).  If an LP is sending an event to
another LP of a \emph{different} type, however, then it should use an
explicit API to do so without exposing the other LP's event structure
definition.  Event structures are not a robust API for exchanging data
across different LP types.  If one LP type accesses the event (or state)
structure of another LP type, then it entangles the two components such
that one LP is dependent upon the internal architecture of another LP.
This not only makes it difficult to reuse components, but also makes it
difficult to check for incompatibilities at compile time.  The compiler
has no way to know which fields in a struct must be set before sending
an event.

For these reasons we encourage that a) each LP be implemented in a separate
source file and b) all event structs and state structs
be defined only within those source files.  They should not be exposed in external
headers.  If the definitions are placed in a header then it makes it
possible for those event and state structs to be used as an ad-hoc interface
between LPs of different types.

Section~\ref{sec:completion} will describe alternative mechanisms for
exchanging information between different LP types.

TODO: reference example, for now see how structs are defined in the Triton
model.

\subsection{Techniques for exchanging information and completion events
across LP types}
\label{sec:completion}

TODO: fill this in.

Send events into an LP using a C function API that calls
\texttt{tw\_event\_new} under the covers, as sketched below.
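
For example, a hypothetical ``disk'' LP could expose a submission function along
the lines of the sketch below; the function and field names are illustrative only
and do not correspond to an existing CODES API.  Because the function lives in
the disk LP's source file, \texttt{struct disk\_msg} never needs to appear in a
header.

\begin{lstlisting}
/* disk.c -- struct disk_msg stays private to this file */
enum disk_event_type { DISK_IO_REQUEST = 1 };

struct disk_msg
{
    enum disk_event_type event_type;
    tw_lpid src;      /* LP that issued the request */
    uint64_t size;    /* size of the simulated I/O in bytes */
};

/* public API declared in a hypothetical disk.h */
void disk_submit_io(tw_lpid disk_gid, uint64_t size, tw_lp * sender)
{
    /* codes_local_latency() adds a small random delay; see the later
     * section on small timestamps for LP transitions */
    tw_event *e = tw_event_new(disk_gid, codes_local_latency(sender), sender);
    struct disk_msg *m = tw_event_data(e);
    m->event_type = DISK_IO_REQUEST;
    m->src = sender->gid;
    m->size = size;
    tw_event_send(e);
}
\end{lstlisting}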

Indicate completion back to the calling LP either by delivering an opaque
message back to the calling LP (passed in by the caller as a \texttt{void*}
argument), or by providing an API function for the second LP type to use to
call back; both approaches are sketched below.
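
The prototypes below sketch what those two styles of interface might look like;
the function names are hypothetical, not existing CODES APIs.

\begin{lstlisting}
/* (a) Opaque completion event: the caller builds the event it wants to
 * receive when the operation finishes and passes it in as an untyped
 * buffer; the disk LP sends it back verbatim without ever knowing its
 * layout. */
void disk_submit_io(tw_lpid disk_gid, uint64_t size,
                    tw_lpid return_gid,
                    const void * completion_ev, int completion_sz,
                    tw_lp * sender);

/* (b) Callback API: the *calling* LP type exports its own function that
 * the disk LP invokes on completion, so neither component sees the
 * other's event struct definition. */
void server_io_complete(tw_lpid server_gid, uint64_t bytes, tw_lp * sender);
\end{lstlisting}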

\section{CODES: common utilities}

TODO: point out what repo each of these utilities can be found in.

\subsection{codes\_mapping}
\label{sec:mapping}

TODO: pull in Misbah's codes-mapping documentation.

\subsection{modelnet}

TODO: fill this in.  Modelnet is a network abstraction layer for use in
CODES models.  It provides a consistent API that can be used to send
messages between nodes using a variety of different network transport
models.  Note that modelnet requires the use of the codes-mapping API,
described in the previous section (Section~\ref{sec:mapping}).
\subsection{lp-io}

TODO: fill this in.  lp-io is a simple API for storing modest-sized
simulation results (not continuous traces).  It handles reverse computation
and avoids doing any disk I/O until the simulation is complete.  All data is
written with collective I/O into a unified output directory.  lp-io is
mostly useful for cases in which you would like each LP instance to report
statistics, but for scalability and data management reasons those results
should be aggregated into a single file rather than producing a separate
file per LP.
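
A typical use is for each LP to dump a small record from its finalize function.
The sketch below approximates that usage; consult lp-io.h in the codes
repository for the exact function signatures, which may differ from what is
assumed here.  (\texttt{svr\_state} refers to the server LP from the example
model later in this document.)

\begin{lstlisting}
/* Approximate lp-io usage; verify the call signature against lp-io.h. */
static void svr_finalize(svr_state * ns, tw_lp * lp)
{
    char stats[64];
    int len = sprintf(stats, "sent=%d recvd=%d\n",
        ns->msg_sent_count, ns->msg_recvd_count);
    /* every LP writes under the same name; lp-io aggregates the records
     * into a single file in the output directory at the end of the run */
    lp_io_write(lp->gid, "server-stats", len, stats);
}
\end{lstlisting}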

TODO: look at ross/IO code and determine how it relates to this.
\subsection{codes-workload generator}

TODO: fill this in.  codes-workload is an abstraction layer for feeding I/O
workloads into a simulation.  It supports multiple back-ends for generating
those I/O events; data could come from a trace file, from Darshan, or from a
synthetic description.

This component is under active development right now, not complete yet.  The
synthetic generator is probably pretty solid for use already though.

\subsection{codes\_event\_new}

TODO: fill this in.  codes\_event\_new is a small wrapper around tw\_event\_new
that checks the incoming timestamp and makes sure that you don't exceed the
global end timestamp for ROSS.  The assumption is that CODES models will
normally run to a completion condition rather than until simulation time
runs out; see a later section for more information on this approach.
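
The idea behind the wrapper is roughly the following; this is a sketch rather
than the actual implementation, so consult the codes headers for the real
function.

\begin{lstlisting}
/* Sketch of the idea behind codes_event_new: refuse to schedule an
 * event beyond the global end time, since such an event would never
 * be processed. */
static tw_event * codes_event_new_sketch(
    tw_lpid dest_gid, tw_stime offset_ts, tw_lp * sender)
{
    tw_stime abs_ts = offset_ts + tw_now(sender);
    if (abs_ts >= g_tw_ts_end)
        tw_error(TW_LOC, "event exceeds ROSS end time (%f >= %f)\n",
            abs_ts, g_tw_ts_end);
    return tw_event_new(dest_gid, offset_ts, sender);
}
\end{lstlisting}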


\subsection{ross/IO}

TODO: fill this in.  This is the I/O library included with ROSS, based on
the phasta I/O library.  What are the use cases, and how do you use it?  Does it
deprecate the lp-io tool?


\section{CODES: reproducibility and model safety}

TODO: fill this in.  These are things that aren't required for modularity,
but just help you create models that produce consistent results and avoid
some common bugs.

\subsection{Event magic numbers}

TODO: fill this in.  Put magic numbers at the top of each event struct and
check them in the event handler.  This makes sure that you don't accidentally
send the wrong event type to an LP.
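
A minimal sketch of the pattern is shown below; the constant's value is
arbitrary and the structures are abbreviated.

\begin{lstlisting}
#define SVR_MAGIC 0x73767231  /* arbitrary per-LP-type constant */

struct svr_msg
{
    int magic;                     /* always set to SVR_MAGIC by senders */
    enum svr_event svr_event_type;
    /* ... remaining event fields ... */
};

static void svr_event(svr_state * ns, tw_bf * b, struct svr_msg * m, tw_lp * lp)
{
    /* catch events that were accidentally routed to the wrong LP type */
    assert(m->magic == SVR_MAGIC);
    /* ... normal event dispatch ... */
}
\end{lstlisting}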


\subsection{Small timestamps for LP transitions}

TODO: fill this in.  Sometimes you need to exchange events between LPs
without really consuming significant time (for example, to transfer
information from a server to its locally attached network card).  It is
tempting to use a timestamp of 0, but this causes timestamp ties in ROSS,
which might have a variety of unintended consequences.  Use
codes\_local\_latency for the timing of local event transitions to add some
random noise; it can be thought of as bus overhead or context switch overhead.
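
For example, a server LP might hand an event to its locally attached NIC LP as
in the sketch below; \texttt{struct nic\_msg} and \texttt{NIC\_SEND} are
placeholders for this illustration.

\begin{lstlisting}
static void notify_local_nic(tw_lpid nic_gid, tw_lp * lp)
{
    /* codes_local_latency(lp) returns a tiny randomized delay, so the
     * NIC event is not scheduled with a zero offset (which would create
     * a timestamp tie) */
    tw_event *e = tw_event_new(nic_gid, codes_local_latency(lp), lp);
    struct nic_msg *m = tw_event_data(e);
    m->event_type = NIC_SEND;
    tw_event_send(e);
}
\end{lstlisting}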

\section{ROSS: general tips}

\subsection{Organizing event structures}

TODO: fill this in.  The main idea is to use unions to organize fields
within event structures.  This keeps the event size down and makes it a little
clearer which variables are used by which event types.
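
A sketch of such a layout is shown below; the type and field names are
illustrative.

\begin{lstlisting}
enum svr_event_type { KICKOFF = 1, REQ, ACK, LOCAL };

struct svr_msg
{
    enum svr_event_type type;   /* selects the union member below */
    tw_lpid src;                /* fields common to every event type */
    union
    {
        struct { uint64_t payload_sz; } req;  /* used only by REQ */
        struct { int incremented_flag; } ack; /* used only by ACK */
    } u;
};
\end{lstlisting}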

\subsection{Avoiding event timestamp ties}

TODO: fill this in.  Why ties are bad (they hurt reproducibility, if not
accuracy, which in turn makes correctness testing more difficult).  Things
you can do to avoid ties, like skewing initial events with a random number
generator (see Section~\ref{sec_kickoff}).

\subsection{Validating across simulation modes}

TODO: fill this in.  The general idea is that during development you should
do test runs in serial, parallel conservative, and parallel optimistic
modes to make sure that you get consistent results.  These modes stress
different aspects of the model.

\subsection{Reverse computation}

TODO: fill this in.  General philosophy on when to add reverse computation
(probably not in your initial rough draft prototype, but it is best to go ahead
and add it before the model is fully complete or else it becomes too
daunting/invasive).

Other things to talk about (maybe these are different subsections):
\begin{itemize}
\item propagate and maintain as much state as possible in event structures
rather than state structures
\item rely on ordering enforced by ROSS (each
reverse handler only needs to reverse a single event, in order)
\item keeping functions small
\item building internal APIs for managing functions with reverse functions
\item how to handle queues
\end{itemize}

\subsection{How to complete a simulation}

TODO: fill this in.  Most core ROSS examples are designed to intentionally hit
the end timestamp for the simulation (i.e., they are modeling a continuous,
steady state system).  This isn't necessarily true when modeling a
distributed storage system.  You might instead want the simulation to end
when you have completed a particular application workload (or collection of
application workloads), when a fault has been repaired, etc.  Talk about how
to handle this cleanly.

\subsection{Kicking off a simulation}
\label{sec_kickoff}

TODO: fill this in.  Each LP needs to send an event to itself at the
beginning of the simulation (from its init function), because ROSS only invokes
an LP's event handler in response to events; without an initial ``kickoff''
event the LP would never do any work.  We usually skew these kickoff events
with random numbers to help break ties right off the bat, so that LPs do not
all process their first event at exactly the same timestamp.
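
A sketch of such an init function, using the server LP names from the example
model later in this document:

\begin{lstlisting}
static void svr_init(svr_state * ns, tw_lp * lp)
{
    memset(ns, 0, sizeof(*ns));

    /* skew the kickoff by a small random amount so that LPs do not all
     * process their first event at the identical timestamp */
    tw_stime kickoff_time = g_tw_lookahead + tw_rand_unif(lp->rng);

    tw_event *e = tw_event_new(lp->gid, kickoff_time, lp);
    svr_msg *m = tw_event_data(e);
    m->svr_event_type = KICKOFF;
    tw_event_send(e);
}
\end{lstlisting}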

\subsection{Handling non-trivial event dependencies}

In storage system simulations, it will often be the case that clients, servers,
or both issue multiple asynchronous (parallel) operations, performing some
action upon the completion of all of them. More generally, the problem is: an event
issuance (an ack to the client) depends on the completion of more than one
asynchronous/parallel event (a local write on the primary server, a forwarded
write to a replica server). Further complicating the matter for storage
simulations, there can be any number of outstanding requests, each waiting on
multiple events.

In ROSS's sequential and conservative parallel modes, the necessary state can
easily be stored in the LP as a queue of statuses for each set of events,
enqueuing upon asynchronous event issuances and updating/dequeuing upon each
completion. Each LP can assign unique IDs to each queue item and propagate the
IDs through the asynchronous events for lookup purposes. However, in optimistic
mode we may remove an item from the queue and then be forced to re-insert it
during reverse computation.

Naively, one could simply never remove queue items, but of course memory will
quickly be consumed.

An elegant solution to this is to \emph{cache the status state in the event
structure that causes the dequeue}. ROSS's reverse computation semantics ensure
that this event will be reversed before the completion events of any of the
other asynchronous events, allowing us to easily recover the state. Furthermore,
events are garbage-collected as GVT advances past them, reducing memory
management complexity. However, this strategy has the disadvantage of increasing
the event size accordingly.
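
The sketch below illustrates the pattern; the queue helpers
(\texttt{queue\_remove}, \texttt{queue\_insert}) and the field names are
placeholders rather than existing CODES APIs.

\begin{lstlisting}
struct op_status
{
    uint64_t op_id;       /* identifies the outstanding operation */
    int acks_remaining;   /* how many completions are still pending */
};

struct svr_msg          /* abbreviated event structure */
{
    /* ... usual event fields ... */
    uint64_t op_id;                 /* which operation this event belongs to */
    struct op_status cached_status; /* filled in only by the event that
                                       removes the record from the queue */
};

static void handle_final_completion(
    svr_state * ns, struct svr_msg * m, tw_lp * lp)
{
    struct op_status *s = queue_remove(ns, m->op_id); /* placeholder helper */
    m->cached_status = *s;  /* cache a copy in the event before freeing it */
    free(s);
    /* ... issue the ack or follow-on event upstream ... */
}

static void handle_final_completion_rev(
    svr_state * ns, struct svr_msg * m, tw_lp * lp)
{
    /* re-insert exactly the record that the forward handler removed */
    queue_insert(ns, &m->cached_status); /* placeholder helper */
}
\end{lstlisting}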

\section{Best practices quick reference}

NOTE: these may be integrated with the remaining notes or used as a summary of
section(s).

\subsection{ROSS simulation development}

\begin{enumerate}

    \item prefer fine-grained, simple LPs to coarse-grained, complex LPs
    \begin{enumerate}
        \item can simplify both LP state and reverse computation implementation
        \item ROSS is very good at event processing, likely small difference in
            performance
    \end{enumerate}

    \item consider breaking up single-source generation of many concurrent events
        by using ``feedback'' or ``continue'' events to self
    \begin{enumerate}
        \item generating multiple concurrent events makes rollback more difficult
    \end{enumerate}

    \item use dummy events to work around ``event-less'' advancement of simulation time

    \item add a small amount of time ``noise'' to events to prevent ties

    \item prefer more and smaller events to fewer and larger events
    \begin{enumerate}
        \item simplifies individual event processing
        \item ROSS uses bounded event structure size in communication, so
            smaller bound $\rightarrow$  less communication overhead
    \end{enumerate}

    \item prefer placing state in event structure to LP state structure
    \begin{enumerate}
        \item simplifies reverse computation -- less persistent state
        \item NOTE: tradeoff with previous point - consider efficiency vs.
            complexity
    \end{enumerate}

    \item try to implement event processing with only LP-local information
    \begin{enumerate}
        \item reverse computation with collective knowledge is difficult
    \end{enumerate}

    \item for optimistic-mode-capable tracking of multiple asynchronous event
        dependencies, cache status in the event state signifying the last
        satisfied dependency to ease reverse computation

\end{enumerate}

\section{CODES Example Model}

TODO: Standardize the naming for codes configuration, mapping, and model-net.

This is a simple CODES example to demonstrate the concepts described above.  In
this scenario, we have a certain number of storage servers, identified
through indices $0,\ldots,n-1$, where each server has a network interface card
(NIC) associated with it. The servers exchange messages with their neighboring
servers via their NICs (i.e., server $i$ pings server $i+1$, rolling over the
index if necessary). When the neighboring server receives the message, it sends
an acknowledgement message to the sending server in response. Upon receiving the
acknowledgement, the sending server issues another message. This process continues until
some number of messages have been sent. For simplicity, it is assumed that each
server has a direct link to its neighbor, and no network congestion occurs due
to concurrent messages being sent.

The model is relatively simple to simulate using ROSS. There are
two distinct LP types in the simulation: the server and the NIC. Refer to
Listing~\ref{snippet1} for the data structure definitions. The server LPs
are in charge of issuing/acknowledging the messages, while the NIC LPs
(implemented via CODES's model-net) transmit the data and inform their
corresponding servers upon completion. This LP decomposition strategy is
generally preferred for ROSS-based simulations: have single-purpose, simple LPs
representing logical system components.

\begin{figure}
\begin{lstlisting}[caption=Server state and event message struct, label=snippet1]
struct svr_state
{
    int msg_sent_count;   /* requests sent */
    int msg_recvd_count;  /* requests recvd */
    int local_recvd_count; /* number of local messages received */
    tw_stime start_ts;    /* time that we started sending requests */
};

struct svr_msg
{
    enum svr_event svr_event_type;
    tw_lpid src;          /* source of this request or ack */

    int incremented_flag; /* helper for reverse computation */
};

\end{lstlisting}
\end{figure}

In this program, CODES is used in the following four ways: to provide
configuration utilities for the program, to logically separate and provide
lookup functionality for multiple LP types, to automate LP placement on KPs/PEs,
and to simplify/modularize the underlying network structure. The \codesconfig{}
API is used for the first use-case, the \codesmapping{} API is used for
the second and third use-cases, and the \codesmodelnet{} API is used for the
fourth use-case. The following sections discuss these while covering necessary
ROSS-specific information.

\subsection{\codesconfig{}}

Listing~\ref{snippet2} shows a stripped-down version of example.conf (see the file
for comments). The configuration format allows categories, and optionally
subgroups within a category, of key-value pairs. The LPGROUPS
category defines the LP configuration (described in
Section~\ref{subsec:codes_mapping}). The PARAMS category is used by both
\codesmapping{} and \codesmodelnet{} for configuration, providing both ROSS-specific and
network-specific parameters. For instance, the \texttt{message\_size} field defines the
maximum event size used in ROSS for memory management. Of course, user-defined
categories can be used as well; in this case they define the rounds
of communication and the size of each message.

\begin{figure}
\begin{lstlisting}[caption=example configuration file for CODES LP mapping, label=snippet2]
LPGROUPS
{
   SERVERS
   {
      repetitions="16";
      server="1";
      modelnet_simplenet="1";
   }
}
PARAMS
{
   packet_size="512";
   message_size="256";
   modelnet="simplenet";
   net_startup_ns="1.5";
   net_bw_mbps="20000";
}
server_pings
{
  num_reqs="5";
  payload_sz="4096";
}
\end{lstlisting}
\end{figure} 


\subsection{\codesmapping{}}
\label{subsec:codes_mapping}

The \codesmapping{} API transparently maps LP types to MPI ranks (a.k.a.\ ROSS PEs).
The LP type and count can be specified through \codesconfig{}. In this
section, we focus on the \codesmapping{} API as well as its configuration. Refer again
to Listing~\ref{snippet2}. Multiple LP types are specified in a single LP group
(there can also be multiple LP groups in a config file).

In Listing~\ref{snippet2}, there is 1 server LP and 1
\texttt{modelnet\_simplenet} LP type in a group, and this combination is repeated
16 times (\texttt{repetitions="16"}).  ROSS will assign the LPs to the PEs (a PE
is ROSS's abstraction of an MPI rank) by placing 1 server LP then 1
\texttt{modelnet\_simplenet} LP, a total of 16 times. This configuration is
useful if there is heavy communication involved between the server and
\texttt{modelnet\_simplenet} LP types, in which case ROSS will place them on the
same PE so that the communication between server and
\texttt{modelnet\_simplenet} LPs will not involve remote messages.

An important consideration when defining the configuration file is the way
\codesmodelnet{} maps the network-layer LPs (the NICs in this example) to the
upper-level LPs (e.g., the servers). Specifically, each NIC is mapped in a
one-to-one manner with the calling LP through the calling LP's group name,
repetition number, and number within the repetition.

After the ROSS initialization function calls (\texttt{tw\_init}), the configuration
file can be loaded in the example program using the calls in
Listing~\ref{snippet3}. Each LP type must register itself using
\texttt{lp\_type\_register} before setting up the mapping.
Listing~\ref{snippet4} shows an example of how the server LP registers itself.

\begin{figure}
\begin{lstlisting}[caption=CODES mapping function calls in example program, label=snippet3]
    int main(int argc, char **argv)
    {
    	.....
	/* ROSS initialization function calls */
    	tw_opt_add(app_opt);
    	tw_init(&argc, &argv);

	/* loading the config file of codes-mapping */
    	configuration_load(argv[2], MPI_COMM_WORLD, &config);

	/* Setup the model-net parameters specified in the config file */
    	net_id=model_net_set_params();
	
	/* register the server LP type (the model-net LP type is registered internally in model_net_set_params()) */
    	svr_add_lp_type();

	/* Now setup codes mapping */
    	codes_mapping_setup();

	/* query codes mapping API */
    	num_servers = codes_mapping_get_group_reps("MODELNET_GRP") * codes_mapping_get_lp_count("MODELNET_GRP", "server");
    	..... 
   }
\end{lstlisting}
\end{figure}

\begin{figure}
\begin{lstlisting}[caption=Registering an LP type, label=snippet4]
static void svr_add_lp_type()
{
  lp_type_register("server", svr_get_lp_type());
}
\end{lstlisting}
\end{figure}

The \codesmapping{} API provides ways to query information such as the number of
LPs of a particular type, the group to which an LP type belongs, and the number
of repetitions in the group (for details see the codes-base/codes/codes-mapping.h
file).  Listing~\ref{snippet3} shows how to set up the \codesmapping{} API with
our CODES example and compute basic information by querying the number of
servers in a particular group.

\subsection{Event Handlers}
In this example, we have two LP types: a server LP and a model-net LP.
Since the servers simply exchange messages with each other, the server LP state
maintains a count of the number of remote messages it has sent and received, as
well as the number of local completion messages.

For the server event message, we have four message types: KICKOFF, REQ, ACK, and
LOCAL. With a KICKOFF event, each LP sends a message to itself (the simulation
begins from here).  To avoid event ties, we add a small amount of random noise
(see Section~\ref{sec_kickoff}). The server LP state and server message data
structures are given in Listing~\ref{snippet1}, and the server's event handler,
which dispatches on the message type, is shown in Listing~\ref{snippet5}. The
`REQ' message is sent by a server to its neighboring server; when it is
received, the neighboring server sends back a message of type `ACK'.

TODO: Add magic numbers in the example file to demonstrate the magic number best
practice. 

\begin{figure}
\begin{lstlisting}[caption=Event handler of the server LP type., label=snippet5]
static void svr_event(svr_state * ns, tw_bf * b, svr_msg * m, tw_lp * lp)
{
   switch (m->svr_event_type)
    {
        case REQ:
        ...
	case ACK:
        ...
	case KICKOFF:
        ...
	case LOCAL:
        ...
	default:
            printf("\n Invalid message type %d ", m->svr_event_type);
            assert(0);
        break;
    }
}
\end{lstlisting}
\end{figure}

\subsection{\codesmodelnet{}}
\codesmodelnet{} is an abstraction layer that allows models to send messages
across components using different network transports. It provides a
consistent API that can send messages over torus, dragonfly, or
simplenet network models without changing the higher-level model code.

In the CODES example, we use \emph{simple-net} as the underlying plug-in for
\codesmodelnet{}. The simple-net parameters are specified by the user in the config
file (see Listing~\ref{snippet2}). A call to \texttt{model\_net\_set\_params} sets up
the model-net parameters as given in the config file.

\codesmodelnet{} assumes that the caller already knows which LP it wants to
deliver the message to and how large the simulated message is. It carries two
types of events: (1) a remote event to be delivered to a higher-level model LP
(in the example, the \codesmodelnet{} LPs carry the remote event to the server LPs) and
(2) a local event to be delivered to the caller once the message has been
transmitted from the node (in the example, a local completion message is
delivered to the server LP once the model-net LP sends the message).
Listing~\ref{snippet6} shows how the server LP sends messages to the neighboring
server using the model-net LP.

\begin{figure}
\begin{lstlisting}[caption=Example code snippet showing data transfer through model-net API, label=snippet6]
static void handle_kickoff_event(svr_state * ns,
    tw_bf * b,
    svr_msg * m,
    tw_lp * lp)
{
    ......
    /* record when transfers started on this server */
    ns->start_ts = tw_now(lp);

    /* each server sends a request to the next highest server */
    int dest_id = (lp->gid + offset)%(num_servers*2 + num_routers);

    /* model-net needs to know about (1) higher-level destination LP which is a neighboring server in this case
     * (2) struct and size of remote message and (3) struct and size of local message (a local message can be null) */
    
    model_net_event(net_id, "test", dest_id, PAYLOAD_SZ, sizeof(svr_msg), (const void*)m_remote, sizeof(svr_msg), (const void*)m_local, lp);
    ns->msg_sent_count++;
   .....
}

\end{lstlisting}
\end{figure}
 
\subsection{Reverse computation}

ROSS has the capability for optimistic parallel simulation, but instead of
saving the state of each LP, it requires users to perform \emph{reverse
computation}. That is, while the event messages are themselves preserved (until
the Global Virtual Time (GVT) algorithm renders the messages unneeded), the LP
state is not preserved. Hence, it is up to the simulation developer to provide
functionality to reverse the LP state, given the event to be reversed. ROSS
makes this simpler in that events will always be rolled back in exactly the
reverse of the order in which they were applied. Note that ROSS also has both
serial and parallel conservative modes, so reverse computation may not be
necessary if the simulation is not computationally intense.

For our example program, recall the ``forward'' event handlers. They perform the
following: 
\begin{enumerate}
    \item Kickoff: send a message to the peer server, and increment the sender
        LP's count of sent messages.
    \item Request (received from peer server): increment the receiver's count of
        received messages, and send an acknowledgement to the sender.
    \item Acknowledgement (received from the message receiver): send the next
        message to the receiver and increment the messages sent count. Set a
        flag indicating whether a message has been sent.
    \item Local \codesmodelnet{} callback: increment the local model-net
        received messages count.
\end{enumerate}

In terms of LP state, the four operations simply modify counts. Hence, the
``reverse'' event handlers merely need to roll back those changes, as sketched
after the following list:
\begin{enumerate}
    \item Kickoff: decrement the sender LP's count of sent messages.
    \item Request (received from peer server): decrement the receiver's count of
        received messages.
    \item Acknowledgement (received from the message receiver): decrement the
        messages sent count, using the flag stored in the event to determine
        whether the forward handler actually sent a message (and therefore
        incremented the count).
    \item Local \codesmodelnet{} callback: decrement the local model-net
        received messages count.
\end{enumerate}
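
A sketch of the corresponding reverse handler is shown below. It mirrors the
structure of the forward handler in Listing~\ref{snippet5} and uses the state
and message fields from Listing~\ref{snippet1}; any random number draws or
model-net calls made by the forward handlers would also need to be reversed
here, which is omitted for brevity.

\begin{lstlisting}
static void svr_rev_event(svr_state * ns, tw_bf * b, svr_msg * m, tw_lp * lp)
{
    switch (m->svr_event_type)
    {
        case KICKOFF:
            ns->msg_sent_count--;
            break;
        case REQ:
            ns->msg_recvd_count--;
            break;
        case ACK:
            /* the forward handler recorded whether it sent another
             * message; only undo the increment if it did */
            if (m->incremented_flag)
                ns->msg_sent_count--;
            break;
        case LOCAL:
            ns->local_recvd_count--;
            break;
        default:
            assert(0);
            break;
    }
}
\end{lstlisting}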

For more complex LP states (such as maintaining queues), reverse event
processing becomes similarly more complex. Other sections of this document
highlight strategies for dealing with them.

Note that ROSS maintains the ``lineage'' of events currently stored, which
enables ROSS to roll back messages in the reverse of the order in which they
were originally processed. This greatly simplifies the reverse computation
process: the LP state when reversing the effects of a particular event is
exactly the state that resulted from processing the event in the first place
(unless, of course, the event handlers are buggy).

\section{TODO}

\begin{itemize}
\item Build a single example model that demonstrates the concepts in this
document, refer to it throughout.
\item reference to ROSS user's guide, airport model, etc.
\item put a pdf or latex2html version of this document on the codes web page
when ready
\end{itemize}

\begin{figure}
\begin{lstlisting}[caption=Example code snippet., label=snippet-example]
for (i=0; i<n; i++) {
    for (j=0; j<i; j++) {
        /* do something */
    }
}
\end{lstlisting}
\end{figure}

Listing~\ref{snippet-example} shows an example of how to display a code
snippet in \LaTeX{}.  We can use this format as needed throughout the document.

\end{document}