Commit 73365e34 authored by qiao.kang@eecs.northwestern.edu

Merge branch 'master' of xgitlab.cels.anl.gov:sds/benvolio

parents cca077d9 5e699078
AUTOMAKE_OPTIONS = foreign
ACLOCAL_AMFLAGS = -I m4
COMMON_INCLUDES = -I$(top_srcdir)/include
AM_CPPFLAGS = $(COMMON_INCLUDES)
bin_PROGRAMS = src/client/bv-shutdown \
src/client/bv-ping
# automake will use *either* AM_CPPFLAGS or mumble_CPPFLAGS. Since we set
# per-object CPPFLAGS we need to repeat AM_CPPFLAGS
#src_provider_bv_server_CPPFLAGS= -I$(top_srcdir)/include
# we can get away with this because bv-server is a single source + provider
# convenience library. If bv-server starts picking up more source files we
# might have to make a library
# (see https://www.gnu.org/software/automake/manual/html_node/Per_002dObject-Flags.html)
if USE_PMIX
bin_PROGRAMS+=src/provider/bv-server.pmix
src_provider_bv_server_pmix_SOURCES = src/provider/server.c \
src/provider/bv-provider.h
src_provider_bv_server_pmix_CPPFLAGS = $(COMMON_INCLUDES) "-DUSE_PMIX"
src_provider_bv_server_pmix_LDADD = lib/libbv-provider.la
endif
if USE_MPI
bin_PROGRAMS+=src/provider/bv-server.mpi
src_provider_bv_server_mpi_SOURCES = src/provider/server.c \
src/provider/bv-provider.h
src_provider_bv_server_mpi_CPPFLAGS = $(COMMON_INCLUDES) "-DUSE_MPI"
src_provider_bv_server_mpi_LDADD = lib/libbv-provider.la
endif
src_client_bv_shutdown_SOURCES = src/client/bv-shutdown.c
src_client_bv_shutdown_LDADD = lib/libbv-client.la
src_client_bv_ping_SOURCES = src/client/bv-ping.c
src_client_bv_ping_LDADD = lib/libbv-client.la
lib_LTLIBRARIES = lib/libbv-client.la \
lib/libbv-provider.la
......
@@ -22,6 +22,7 @@ repository, but just in case you are not, you can get the latest benvolio from
* [thallium](https://xgitlab.cels.anl.gov/sds/thallium) C++ wrappers to Mercury, Margo, and Argobots
* [ssg](https://xgitlab.cels.anl.gov/sds/ssg) group management for server identification
* [abt-io](https://xgitlab.cels.anl.gov/sds/abt-io/) providing I/O bindings for Argobots
* For the provider, either MPI or PMIx to launch the service.
We find the [spack](https://spack.readthedocs.io/en/latest/) package manager
very helpful for building everything you need. We have some additional
@@ -49,7 +50,9 @@ and the _client_. A more complicated example would run the provider on one or
more nodes and the client on one or more (possibly the same) node. In this
case we are just going to run everything on one node.
#### Provider with PMIx
1. PMIx: Benvolio can use [PMIx](https://pmix.org/). Some
job schedulers like SLURM are PMIx-aware, but more likely you will have to use
the PMIx reference runtime (prrte). Spack can install that for you (`spack
install prrte` but NOTE: you'll need the one line fix in
@@ -69,9 +72,24 @@ arguments are mandatory.
* `-s streams` : how many margo streams (Argobot Execution Streams) to use
$ prun -np 2 ./src/provider/bv-server.pmix -f bv.svc -p sm: &
#### Provider with MPI
1. Launching with MPI depends a lot on your platform. `aprun`, `jsrun`, or simply
`mpiexec` are all possible mechanisms to start an MPI job. For simplicity I
will assume `mpiexec`. Consult your site-specific documentation if there is a
more appropriate mechanism.
2. The command line arguments for `bv-server` are the same no matter how you launch the providers.
$ mpiexec -np 2 ./src/provider/bv-server.mpi -f bv.svc -p sm: &
#### Client
No matter which way you started your benvolio provider, client code runs the same way.
1. A simple benvolio test: `tests/simple` runs through a few I/O patterns. It
has one mandatory argument: the name of the provider statefile. You can give
it an optional file name, or it will write to and read from a file called
'dummy' by default. If you see `Error: PMIx event notification registration
@@ -81,9 +99,49 @@ failed! [-31]` you can ignore that for now. We're still investigating
$ ./tests/simple bv.svc
2. Cleanup: the `bv-shutdown` tool is a simple client with only one job: ask
the provider to shut itself down. If necessary, we stop the prrte daemons with
the `-terminate` option
$ ./src/client/bv-shutdown bv.svc
$ prun -terminate
### Internal statistics
Benvolio developers can generate a timing report with a call to
`bv_statistics()`. That call will emit a line containing timings and counters
for benvolio's internal operations. If the `show_server` flag is set, a client
will also emit information from all benvolio providers.
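For instance, a client might end its run with something like the following (a minimal sketch; `report_stats` is just an illustrative wrapper, and `client` is the handle returned by `bv_init`):

```
#include <bv.h>

/* emit benvolio's timing report: this client's counters, plus every
 * provider's counters because show_server is nonzero */
void report_stats(bv_client_t client)
{
    bv_statistics(client, 1);
}
```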
Here's an example of the output:
SERVER: write_rpc_calls 2 write_rpc_time 0.0026474 server_write_calls 5 server_write_time 0.00186205 bytes_written 15011 write_expose 0 write_bulk_time 4.02927e-05 write_bulk_xfers 2 write_response 4.36306e-05 read_rpc_calls 2 read_rpc_time 0.0026679 server_read_calls 7 server_read_time 0.00181675 bytes_read 15006 read_bulk_time 5.31673e-05 read_bulk_xfers 2 read_expose 0 read_response 4.41074e-05 getfd 0.00121784 mutex_time 9.53674e-07
CLIENT: client_write_calls 2 client_write_time 0.00291276 client_bytes_written 15011 client_read_calls 2 client_read_time 0.0028851 client_bytes_read 15006 client_init_time 0.0610526
In the above sample output, one can subtract (server) `write_rpc_time` from
(client) `client_write_time` to compute the RPC overhead. Typically this
number will be quite small but could be large if for some reason the server was
unable to handle client requests efficiently (maybe requires more margo
threads?) or clients and servers were exchanging very complicated data
structures.
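Plugging in the sample above: 0.00291276 - 0.0026474 ≈ 0.00027 seconds of total RPC overhead across the two write calls, or roughly 130 microseconds per call.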
Benvolio is a mochi service, and as such makes heavy use of margo and mercury.
By setting the environment variable `MARGO_ENABLE_DIAGNOSTICS=1` benvolio will
also include additional information about this software abstraction layer.
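The same dump can also be triggered programmatically; benvolio's client does this at the end of `bv_statistics()` (see the `bv-client.cc` change later in this commit). A minimal sketch, where `dump_diag` is an illustrative wrapper and `mid` is a `margo_instance_id` (the client obtains its own via `get_margo_instance()`):

```
#include <margo.h>

/* write margo's diagnostic counters for this instance; "-" means
 * standard output, and the 0 disables uniquifying the output name */
void dump_diag(margo_instance_id mid)
{
    margo_diag_dump(mid, "-", 0);
}
```

Either way, the diagnostic output looks like this: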
```
# Margo diagnostics
#Addr Hash and Address Name: 18446744029030521930,ofi+verbs;ofi_rxm://10.41.20.24:56684
# Fri Jul 24 17:59:15 2020
# Function Name, Average Time Per Call, Cumulative Time, Highwatermark, Lowwatermark, Call Count
trigger_elapsed,0.000001399,0.000041962,0.000000238,0.000013590,30
progress_elapsed_zero_timeout,0.000002861,0.000002861,0.000002861,0.000002861,1
progress_elapsed_nonzero_timeout,0.007764083,0.100933075,0.000066280,0.090931177,13
bulk_create_elapsed,0.000027227,0.000136137,0.000003099,0.000055313,5
```
For more information about Margo-level instrumentation, including profiling and
the immensely informative "breadcrumb" feature, please refer to [Margo
Instrumentation](https://xgitlab.cels.anl.gov/sds/margo/blob/master/doc/instrumentation.md)
@@ -45,24 +45,72 @@ CPPFLAGS="$ABTIO_CFLAGS $CPPFLAGS"
CFLAGS="$ABTIO_CFLAGS $CFLAGS"
# Bootstrapping: we can do it with either MPI or PMIx, but both require SSG to
# support it. We need to check for both SSG's level of support as well as
# general support in the environment.
# check if SSG was compiled with PMIX support
ssg_with_pmix=no
ssg_with_mpi=no
AC_CHECK_LIB([ssg],
[ssg_group_create_pmix],
[ssg_with_pmix=yes
AC_DEFINE([SSG_HAS_PMIX], 1, [Does SSG library have PMIx support?])],
[],
[-lpmix]
)
AC_CHECK_LIB([ssg],
[ssg_group_create_mpi],
[ssg_with_mpi=yes
AC_DEFINE([SSG_HAS_MPI], 1,[Does SSG library have MPI support])],
[],
[-lmpi]
)
# benvolio will use MPI, PMIx or both to bootstrap, but only if we find the
# necessary supporting libraries
pmix_ok=no
mpi_ok=no
AC_CHECK_LIB([pmix], [PMIx_Init])
AC_MSG_CHECKING([If PMIx programs can be compiled])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include<pmix.h>]], [[PMIx_Init(NULL, NULL,0);]] )],
[AC_DEFINE([HAVE_PMIX], [1], [Define to 1 if compiled with PMIx support])
pmix_ok=yes
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])]
)
AC_MSG_CHECKING([If MPI programs can be compiled])
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([[#include<mpi.h>]], [[MPI_Init(0,0);]])],
[AC_MSG_RESULT([yes])
mpi_ok=yes
AC_DEFINE([HAVE_MPI], [1], [Able to use MPI])],
[AC_MSG_RESULT([no])]
)
use_pmix=no
if test x$pmix_ok = xyes -a x$ssg_with_pmix = xyes ; then
use_pmix=yes
fi
use_mpi=no
if test x$mpi_ok = xyes -a x$ssg_with_mpi = xyes ; then
use_mpi=yes;
fi
AM_CONDITIONAL([USE_PMIX], [ test x$use_pmix = xyes ])
AM_CONDITIONAL([USE_MPI], [ test x$use_mpi = xyes ])
# we need *some* way to bootstrap: we can support one or the other or both but not none
AC_MSG_CHECKING([If at least one bootstrap method available])
if test x$use_pmix = xyes -o x$use_mpi = xyes ; then
AC_MSG_RESULT(yes)
else
AC_MSG_ERROR([Unable to detect MPI or PMIx ])
fi
AC_CHECK_LIB([lustreapi], [llapi_file_get_stripe])
# checks for header files
......
# Adding a new RPC to Benvolio
Let's walk through the process of adding an RPC to benvolio.
## Background
If benvolio were using mercury or margo directly, a new RPC would require a bit
of work: we would have to register RPCs and define serialization and
deserialization structures. Benvolio, though, uses thallium to handle a lot of
the grunt work.
## Implementing 'ping'
I wanted a quick way to tell if the benvolio providers were up and healthy. I
could try to read from a nonexistent file, but a real ping RPC will tell us even
more. In addition to a binary "up/down" result we can also report how many
processes are in the provider's group.
You can find more information about thallium and "provider objects" in the
thallium
[documentation](https://mochi.readthedocs.io/en/latest/thallium/09_providers.html)
### Server side
- add a `ping` method to the `bv_svc_provider` struct. We'll start with no
arguments and only a single return value, but will add more to this later
- in the constructor, call `define` to register the new method with thallium
- add the thallium `remote_procedure` to the `rpcs` list: benvolio will use that
list to deregister all the RPCs at exit.
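Putting those three bullets together, here is a condensed sketch of the provider side; the real constructor takes many more arguments and registers many more RPCs, all trimmed here:

```
#include <thallium.hpp>
#include <vector>
namespace tl = thallium;

struct bv_svc_provider : public tl::provider<bv_svc_provider>
{
    std::vector<tl::remote_procedure> rpcs;

    // the new method: no arguments, a single integer return value;
    // thallium serializes the return value back to the caller for us
    int ping() { return 0; }

    bv_svc_provider(tl::engine *e, uint16_t provider_id)
        : tl::provider<bv_svc_provider>(*e, provider_id)
    {
        // register the RPC and remember the handle so every RPC can be
        // deregistered at shutdown
        rpcs.push_back(define("ping", &bv_svc_provider::ping));
    }
};
```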
### Client side
Unlike the providers, clients need a way to pick out individual RPCs.
- Add a `tl::remote_procedure` member to the `bv_client` struct.
- Initialize that new member in the `bv_client` constructor
- `deregister` the new RPC in the `bv_client` destructor
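Trimmed to just the ping-related members (the real `bv_client` carries many more fields and constructor arguments), the client side looks roughly like this:

```
#include <thallium.hpp>
#include <vector>
namespace tl = thallium;

struct bv_client {
    tl::engine *engine;
    std::vector<tl::provider_handle> targets;
    tl::remote_procedure ping_op;              // one member per RPC

    bv_client(tl::engine *e)
        : engine(e),
          ping_op(engine->define("ping")) {}   // initialize in the constructor

    ~bv_client() {
        ping_op.deregister();                  // deregister before engine teardown
    }
};
```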
That's all it takes to set up the RPC itself, but client code will need a way to invoke it.
- provide an implementation of `bv_ping` in `bv-client.cc`
- all benvolio client APIs take a `bv_client_t` argument. This client context
came from `bv_init`
- this implementation invokes the RPC by using the `on` method. We might have
several targets we wish to `ping`: we can iterate through all the providers
like this:
```
int ret = 0;
for (auto target : client->targets)
ret += client->ping_op.on(target)().as<int>();
return ret;
```
- add a prototype in `bv.h`
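That prototype, as added to `bv.h` in this commit:

```
int bv_ping(bv_client_t client, size_t *nr_providers);
```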
Finally, you will need a way to exercise this new routine. You can add it to
the `tests/simple.c` test or write a new utility. The whole point of adding
this ping RPC was to be able to use a utility program, so we will write one and
add it to the build system. One can refer to `src/client/bv-shutdown.c`
for a good example of such a utility.
@@ -117,6 +117,16 @@ ssize_t bv_getsize(bv_client_t client, const char *filename);
#define BV_NOCREATE 0
int bv_declare(bv_client_t client, const char *filename, int flags, int mode);
/**
* ping: get the health of a benvolio service
* - client (IN) benvolio client object
* - nr_providers (OUT) how many providers in the benvolio service
*
* returns: zero on success, non-zero if unable to communicate with any
* provider
*/
int bv_ping(bv_client_t client, size_t *nr_providers);
#ifdef __cplusplus
}
#endif
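A quick call sketch (assuming `client` came from `bv_init` and `<stdio.h>` is included):

```
size_t nr_providers;
if (bv_ping(client, &nr_providers) == 0)
    printf("all %zu providers responded\n", nr_providers);
```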
......
@@ -14,6 +14,7 @@ class io_stats {
mutex_time(0.0),
client_write_calls(0), client_write_time(0.0),
client_bytes_written(0),
client_write_expose(0.0), client_read_expose(0.0),
client_read_calls(0), client_read_time(0.0),
client_bytes_read(0),
client_init_time(0.0)
@@ -83,6 +84,8 @@ class io_stats {
double client_read_time; // time client spent in "bv_read"
int64_t client_bytes_read; // bytes received from provider
double client_init_time; // how long does it take to set everything up
double client_write_expose; // time spent registering memory before writing
double client_read_expose; // time spent registering memory before reading
io_stats & operator += (const io_stats &rhs) {
@@ -145,9 +148,11 @@ class io_stats {
std::cout << "client_write_calls " << client_write_calls
<< " client_write_time " << client_write_time
<< " client_bytes_written " << client_bytes_written
<< " client_write_expose_time " << client_write_expose
<< " client_read_calls " << client_read_calls
<< " client_read_time " << client_read_time
<< " client_bytes_read " << client_bytes_read
<< " client_read_expose_time " << client_read_expose
<< " client_init_time " << client_init_time
<< std::endl;
}
......
@@ -60,6 +60,8 @@ struct bv_client {
tl::remote_procedure statistics_op;
tl::remote_procedure size_op;
tl::remote_procedure declare_op;
tl::remote_procedure ping_op;
ssg_group_id_t gid; // attaches to this group; not a member
io_stats statistics;
@@ -83,6 +85,7 @@ struct bv_client {
statistics_op(engine->define("statistics")),
size_op(engine->define("size")),
declare_op(engine->define("declare")),
ping_op(engine->define("ping")),
gid(group) {}
// writing our own destructor so we can ensure nothing that needs thallium
@@ -96,6 +99,7 @@ struct bv_client {
statistics_op.deregister();
size_op.deregister();
declare_op.deregister();
ping_op.deregister();
targets.erase(targets.begin(), targets.end());
delete engine;
@@ -161,7 +165,8 @@ bv_client_t bv_init(bv_config_t config)
ret = ssg_group_observe(client->engine->get_margo_instance(), client->gid);
if (ret != SSG_SUCCESS) {
fprintf(stderr, "ssg_group attach: (%d) Is remote provider running?\n", ret);
delete client;
return NULL;
}
nr_targets = ssg_get_group_size(client->gid);
@@ -259,7 +264,15 @@ static size_t bv_io(bv_client_t client, const char *filename, io_kind op,
for (unsigned int i=0, j=0; i< client->targets.size(); i++) {
if (my_reqs[i].mem_vec.size() == 0) continue; // no work for this target
double expose_time = ABT_get_wtime();
my_bulks.push_back(client->engine->expose(my_reqs[i].mem_vec, mode));
expose_time = ABT_get_wtime() - expose_time;
if (op == BV_READ)
client->statistics.client_read_expose += expose_time;
else
client->statistics.client_write_expose += expose_time;
responses.push_back(rpc.on(client->targets[i]).async(my_bulks[j++], std::string(filename), my_reqs[i].offset, my_reqs[i].len));
}
@@ -334,6 +347,7 @@ int bv_statistics(bv_client_t client, int show_server)
}
std::cout << "CLIENT: ";
client->statistics.print_client();
margo_diag_dump(client->engine->get_margo_instance(), "-", 0);
return ret;
}
@@ -369,6 +383,19 @@ int bv_declare(bv_client_t client, const char *filename, int flags, int mode)
return ret;
}
int bv_ping(bv_client_t client, size_t *nr_providers)
{
    int ret = 0;
    if (client == NULL) {
        *nr_providers = 0;
        return -1;
    }
    *nr_providers = client->targets.size();
    for (auto target : client->targets)
        ret += client->ping_op.on(target)().as<int>();
    return ret;
}
bv_config_t bvutil_cfg_get(const char *filename)
{
bv_config_t cfg = new (bv_config);
......
/*
* simple utility to let us know if benvolio providers are up and
* running
*
*/
#include <stdio.h>
#include <bv.h>
int main(int argc, char **argv)
{
    int ret = 0;
    size_t count = 0;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <provider statefile>\n", argv[0]);
        return 1;
    }

    bv_config_t cfg = bvutil_cfg_get(argv[1]);
    bv_client_t client = bv_init(cfg);
    bvutil_cfg_free(cfg);

    ret = bv_ping(client, &count);
    if (ret == 0) {
        printf("benvolio UP: %zu providers\n", count);
    } else {
        printf("benvolio DOWN: %d of %zu providers unresponsive\n", ret, count);
    }
    bv_finalize(client);
    return ret;
}
@@ -668,6 +668,11 @@ struct bv_svc_provider : public tl::provider<bv_svc_provider>
return 0;
}
int ping()
{
return 0;
}
bv_svc_provider(tl::engine *e, abt_io_instance_id abtio,
ssg_group_id_t gid, const uint16_t provider_id, const int b, int x, tl::pool &pool)
@@ -686,6 +691,7 @@ struct bv_svc_provider : public tl::provider<bv_svc_provider>
rpcs.push_back(define("statistics", &bv_svc_provider::statistics));
rpcs.push_back(define("size", &bv_svc_provider::getsize));
rpcs.push_back(define("declare", &bv_svc_provider::declare));
rpcs.push_back(define("ping", &bv_svc_provider::ping));
}
void dump_io_req(const std::string extra, const tl::bulk &client_bulk, const std::vector<off_t> &file_starts, const std::vector<uint64_t> &file_sizes)
......
#include <margo.h>
#include <abt-io.h>
#include <ssg.h>
#include <getopt.h>
#ifdef USE_PMIX
#include <pmix.h>
#include <ssg-pmix.h>
#endif
#ifdef USE_MPI
#include <mpi.h>
#include <ssg-mpi.h>
#endif
#include "bv-provider.h"
@@ -45,9 +51,8 @@ int main(int argc, char **argv)
margo_instance_id mid;
abt_io_instance_id abtio;
bv_svc_provider_t bv_id;
int ret;
int nprocs;
ssg_group_id_t gid;
int c;
char *proto=NULL;
@@ -90,13 +95,24 @@ int main(int argc, char **argv)
abtio = abt_io_init(nthreads);
margo_push_finalize_callback(mid, finalize_abtio, (void*)abtio);
ret = ssg_init();
ASSERT(ret == 0, "ssg_init() failed (ret = %d)\n", ret);
#ifdef USE_PMIX
pmix_proc_t proc;
ret = PMIx_Init(&proc, NULL, 0);
ASSERT(ret == PMIX_SUCCESS, "PMIx_Init failed (ret = %d)\n", ret);
gid = ssg_group_create_pmix(mid, BV_PROVIDER_GROUP_NAME, proc, NULL, NULL, NULL);
ASSERT(gid != SSG_GROUP_ID_INVALID, "ssg_group_create_pmix() failed (ret = %s)","SSG_GROUP_ID_NULL");
#endif
#ifdef USE_MPI
MPI_Init(&argc, &argv);
gid = ssg_group_create_mpi(mid, BV_PROVIDER_GROUP_NAME, MPI_COMM_WORLD, NULL, NULL, NULL);
ASSERT(gid != SSG_GROUP_ID_INVALID, "ssg_group_create_mpi() failed (ret = %s)" , "SSG_GROUP_ID_NULL");
#endif
finalize_args_t args = {
.g_id = gid,
.m_id = mid
@@ -116,5 +132,11 @@ int main(int argc, char **argv)
free(statefile);
margo_wait_for_finalize(mid);
#ifdef USE_PMIX
PMIx_Finalize(NULL, 0);
#endif
#ifdef USE_MPI
MPI_Finalize();
#endif
}
@@ -31,6 +31,9 @@ int main(int argc, char **argv)
else
filename = "dummy";
printf("ping: %s\n", bv_ping(client) ? "UP" : "DOWN");
ret = bv_ping(client);
printf("delete:\n");
bv_delete(client, filename);
......