Commit 12ec6ec8 authored by Shane Snyder's avatar Shane Snyder
Browse files

Merge branch 'dev-margo-p2p-latency' into 'master'

point to point latency benchmark

See merge request !5
parents abc9b6ce ab463930
......@@ -7,3 +7,8 @@ tests_ssg_test_simple_LDADD = src/libssg.la
tests_ssg_test_attach_SOURCES = tests/ssg-test-attach.c
tests_ssg_test_attach_LDADD = src/libssg.la
# The point to point latency benchmark is only added to check_PROGRAMS when
# the SSG_HAVE_MPI conditional is true.
if SSG_HAVE_MPI
check_PROGRAMS += tests/perf-regression/margo-p2p-latency
tests_perf_regression_margo_p2p_latency_LDADD = src/libssg.la
endif
margo-p2p-latency is a point to point latency benchmark. It measures round
trip latency for a noop (i.e. as close to an empty request and response
structure as possible) RPC.
Example compile (must build with MPI support):
```
./configure <normal arguments> CC=mpicc
make
make tests
```
Example execution (requires MPI; the benchmark must be run with exactly 2 processes):
```
mpiexec -n 2 tests/perf-regression/margo-p2p-latency -i 10 -n sm://
```
-i is the number of RPC iterations to measure
-n is the NA transport to use (for example sm:// or verbs://)
These are example scripts for executing an automated regression test on the
Cooley system at the ALCF. The entire process is handled by the
"run-regression.sh" script, which is suitable for execution within a cron job.
diff --git a/src/plugins/ctp/verbs/ctp_verbs_api.c b/src/plugins/ctp/verbs/ctp_verbs_api.c
index 06da1f4..e187ebd 100644
--- a/src/plugins/ctp/verbs/ctp_verbs_api.c
+++ b/src/plugins/ctp/verbs/ctp_verbs_api.c
@@ -2982,12 +2982,7 @@ verbs_handle_disconnected(cci__ep_t * ep, struct rdma_cm_event *cm_evt)
/* Either way, we got the DISCONNECTED event, it is safe to cleanup
* the QP and CM id.
*/
- ret = rdma_destroy_ep(vconn->id);
- if (ret == -1) {
- ret = errno;
- debug(CCI_DB_WARN, "%s: rdma_destroy_ep() returned %s",
- __func__, strerror(ret));
- }
+ rdma_destroy_ep(vconn->id);
if (!vconn->cci_disconn) {
verbs_destroy_conn(ep, conn);
#!/bin/bash
#COBALT -n 2
#COBALT -t 5
#COBALT --mode script
#COBALT -A radix-io
#COBALT -q ibleaf3-debug
#COBALT --env CCI_CONFIG=/home/carns/working/install-cooley/etc/cci.conf:LD_LIBRARY_PATH=/home/carns/tmp/mochi-regression-install/lib

# Cobalt job script: launches the margo-p2p-latency benchmark from the
# current directory with two MPI processes, 100000 iterations, and the
# verbs:// transport.
#
# NOTE: remember to uncomment some stuff in ~/.bashrc for library paths

# quote the node file path so it is always passed to -f as a single argument
mpirun -f "$COBALT_NODEFILE" -n 2 ./margo-p2p-latency -i 100000 -n verbs://
#!/bin/bash

# This is a shell script to be run from a login node of the Cooley system at
# the ALCF, that will download, compile, and execute the ssg performance
# regression tests, including any dependencies.
#
# It must be started from the directory that contains
# cci-rdma-destroy-ep.patch and margo-p2p-latency.qsub, because both files
# are copied by relative path below.
#
# On success the sandbox and install prefix are removed; the job directory
# (which holds the job output) is left in place.

# exit on any error
set -e

SANDBOX=/tmp/mochi-regression-$$
PREFIX=~/tmp/mochi-regression-install-$$
JOBDIR=~/tmp/mochi-regression-job-$$

export CFLAGS="-O3"
export PKG_CONFIG_PATH="$PREFIX/lib/pkgconfig"

# scratch area to clone and build things
mkdir "$SANDBOX"
cp cci-rdma-destroy-ep.patch "$SANDBOX"

# scratch area for job submission (-p so a missing ~/tmp is created as well)
mkdir -p "$JOBDIR"
cp margo-p2p-latency.qsub "$JOBDIR"

# fetch all sources
cd "$SANDBOX"
git clone https://github.com/pmodels/argobots.git
git clone https://github.com/CCI/cci.git
git clone https://github.com/mercury-hpc/mercury.git
wget http://dist.schmorp.de/libev/libev-4.24.tar.gz
tar -xvzf libev-4.24.tar.gz
git clone https://xgitlab.cels.anl.gov/sds/abt-snoozer.git
git clone https://xgitlab.cels.anl.gov/sds/margo.git
git clone https://xgitlab.cels.anl.gov/sds/ssg.git

# argobots
echo "=== BUILDING ARGOBOTS ==="
cd "$SANDBOX/argobots"
libtoolize
./autogen.sh
mkdir build
cd build
../configure --prefix="$PREFIX" --enable-perf-opt
make -j 3
make install

# cci (built with the local rdma_destroy_ep patch applied)
echo "=== BUILDING CCI ==="
cd "$SANDBOX/cci"
patch -p1 < "$SANDBOX/cci-rdma-destroy-ep.patch"
libtoolize
./autogen.pl
mkdir build
cd build
../configure --prefix="$PREFIX"
make -j 3
make install

# mercury
echo "=== BUILDING MERCURY ==="
cd "$SANDBOX/mercury"
mkdir build
cd build
# $PREFIX is already an absolute path; do not prepend another "/"
cmake -DNA_USE_CCI:BOOL=ON -DMERCURY_USE_BOOST_PP:BOOL=ON -DCMAKE_INSTALL_PREFIX="$PREFIX" -DBoost_NO_BOOST_CMAKE=TRUE -DMERCURY_USE_CHECKSUMS:BOOL=OFF -DBUILD_SHARED_LIBS:BOOL=ON -DMERCURY_USE_SELF_FORWARD:BOOL=ON -DNA_USE_SM:BOOL=OFF ../
make -j 3
make install

# libev
echo "=== BUILDING LIBEV ==="
cd "$SANDBOX/libev-4.24"
mkdir build
cd build
../configure --prefix="$PREFIX"
make -j 3
make install

# abt-snoozer
echo "=== BUILDING ABT-SNOOZER ==="
cd "$SANDBOX/abt-snoozer"
libtoolize
./prepare.sh
mkdir build
cd build
../configure --prefix="$PREFIX"
make -j 3
make install

# margo
echo "=== BUILDING MARGO ==="
cd "$SANDBOX/margo"
libtoolize
./prepare.sh
mkdir build
cd build
../configure --prefix="$PREFIX"
make -j 3
make install

# ssg
echo "=== BUILDING SSG ==="
cd "$SANDBOX/ssg"
git checkout dev-margo-p2p-latency
libtoolize
./prepare.sh
mkdir build
cd build
../configure --prefix="$PREFIX" CC=mpicc
make -j 3
make install
make tests

# set up job to run
echo "=== SUBMITTING AND WAITING FOR JOB ==="
cp "$SANDBOX/ssg/build/tests/perf-regression/.libs/margo-p2p-latency" "$JOBDIR"
cd "$JOBDIR"
# point the job at the libraries installed under this run's prefix
JOBID=$(qsub --env "CCI_CONFIG=/home/carns/working/install-cooley/etc/cci.conf:LD_LIBRARY_PATH=$PREFIX/lib" ./margo-p2p-latency.qsub)
cqwait "$JOBID"

echo "=== JOB DONE, COLLECTING AND SENDING RESULTS ==="
# gather output, strip out funny characters, mail
cat "$JOBID".* > "combined.$JOBID.txt"
dos2unix "combined.$JOBID.txt"
mailx -s "margo-p2p-latency (cooley)" sds-commits@mcs.anl.gov < "combined.$JOBID.txt"

# remove the build sandbox and the install prefix
cd /tmp
rm -rf "$SANDBOX"
rm -rf "$PREFIX"
/*
* Copyright (c) 2017 UChicago Argonne, LLC
*
* See COPYRIGHT in top-level directory.
*/
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <mpi.h>
#include <margo.h>
#include <abt-snoozer.h>
#include <mercury.h>
#include <abt.h>
#include <ssg.h>
#include <ssg-mpi.h>
/* Command-line settings collected by parse_args(). */
struct options
{
    int iterations;      /* value given with -i */
    char *na_transport;  /* heap copy of the string given with -n */
};
/* forward declarations of the file-local helpers */
static void parse_args(int argc, char **argv, struct options *opts);
static void usage(void);
static int run_benchmark(
    int iterations,
    hg_id_t id,
    ssg_member_id_t target,
    ssg_group_id_t gid,
    margo_instance_id mid,
    hg_context_t *hg_context,
    double *measurement_array);
static void bench_routine_print(
    const char *op,
    int size,
    int iterations,
    double *measurement_array);
static int measurement_cmp(const void *a, const void *b);

DECLARE_MARGO_RPC_HANDLER(noop_ult);

/* id of the "noop_rpc" RPC; main() fills it in via HG_Registered_name() */
static hg_id_t noop_id;

/* number of RPCs handled so far; incremented by noop_ult() */
static int rpcs_serviced;

/* set by noop_ult() once rpcs_serviced reaches g_opts.iterations */
static ABT_eventual rpcs_serviced_eventual;

/* command-line options; populated by parse_args() from main() */
static struct options g_opts;
/*
 * Benchmark entry point.
 *
 * Initializes Argobots, MPI, Mercury and margo, registers the "noop_rpc"
 * RPC and creates an SSG group over MPI_COMM_WORLD. The member whose SSG id
 * is 0 runs run_benchmark() against member 1 and prints the statistics; the
 * other member waits until it has serviced g_opts.iterations RPCs.
 *
 * Returns 0 on success and -1 when HG_Init(), HG_Context_create() or
 * ABT_snoozer_xstream_self_set() fails. Exits with EXIT_FAILURE when not
 * started with exactly two ranks. All other failures trip an assert().
 */
int main(int argc, char **argv)
{
margo_instance_id mid;
int nranks;
hg_context_t *hg_context;
hg_class_t *hg_class;
int ret;
ssg_group_id_t gid;
ssg_member_id_t self;
int rank;
hg_bool_t flag;
double *measurement_array;
int namelen;
char processor_name[MPI_MAX_PROCESSOR_NAME];
ABT_init(argc, argv);
MPI_Init(&argc, &argv);
/* 2 process rtt measurements only */
MPI_Comm_size(MPI_COMM_WORLD, &nranks);
if(nranks != 2)
{
usage();
exit(EXIT_FAILURE);
}
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Get_processor_name(processor_name,&namelen);
printf("Process %d of %d is on %s\n",
rank, nranks, processor_name);
/* parse_args() exits the process itself when the arguments are invalid */
parse_args(argc, argv, &g_opts);
/* boilerplate HG initialization steps */
/***************************************/
hg_class = HG_Init(g_opts.na_transport, HG_TRUE);
if(!hg_class)
{
fprintf(stderr, "Error: HG_Init()\n");
return(-1);
}
hg_context = HG_Context_create(hg_class);
if(!hg_context)
{
fprintf(stderr, "Error: HG_Context_create()\n");
HG_Finalize(hg_class);
return(-1);
}
/* set primary ES to idle without polling */
ret = ABT_snoozer_xstream_self_set();
if(ret != 0)
{
/* NOTE(review): unlike the branch above, this path returns without
 * releasing hg_context or hg_class -- confirm whether that is intended
 */
fprintf(stderr, "Error: ABT_snoozer_xstream_self_set()\n");
return(-1);
}
/* actually start margo */
/* NOTE(review): the meaning of the two 0 arguments is not visible in this
 * file -- confirm against margo.h
 */
mid = margo_init(0, 0, hg_context);
assert(mid);
MARGO_REGISTER(
mid,
"noop_rpc",
void,
void,
noop_ult_handler,
MARGO_DEFAULT_MPLEX_ID,
NULL);
/* set up group */
ret = ssg_init(mid);
assert(ret == 0);
gid = ssg_group_create_mpi("margo-p2p-latency", MPI_COMM_WORLD, NULL, NULL);
assert(gid != SSG_GROUP_ID_NULL);
assert(ssg_get_group_size(gid) == 2);
self = ssg_get_group_self_id(gid);
#if 0
printf("MPI rank %d has SSG ID %lu\n", rank, self);
#endif
/* TODO: there should be a cleaner way to get ID from MARGO_REGISTER */
ret = HG_Registered_name(hg_class, "noop_rpc", &noop_id, &flag);
assert(ret == 0 && flag);
if(self == 0)
{
/* ssg id 0 runs benchmark */
/* one timing sample per iteration; sorted in place by bench_routine_print() */
measurement_array = calloc(g_opts.iterations, sizeof(*measurement_array));
assert(measurement_array);
ret = run_benchmark(g_opts.iterations, noop_id, 1, gid, mid, hg_context, measurement_array);
assert(ret == 0);
printf("# <op> <iterations> <size> <min> <q1> <med> <avg> <q3> <max>\n");
bench_routine_print("noop", 0, g_opts.iterations, measurement_array);
free(measurement_array);
}
else
{
/* ssg id 1 acts as server, waiting until iterations have been
* completed
*/
/* NOTE(review): the eventual is created only here, after the RPC has been
 * registered and the group exists; confirm noop_ult() cannot run (and call
 * ABT_eventual_set() on it) before this point
 */
ret = ABT_eventual_create(0, &rpcs_serviced_eventual);
assert(ret == 0);
ABT_eventual_wait(rpcs_serviced_eventual, NULL);
assert(rpcs_serviced == g_opts.iterations);
/* NOTE(review): the reason for this delay is not documented; presumably it
 * lets the last response drain before teardown -- confirm
 */
sleep(3);
}
/* NOTE(review): rpcs_serviced_eventual and g_opts.na_transport are not
 * released anywhere in this function
 */
ssg_group_destroy(gid);
ssg_finalize();
margo_finalize(mid);
HG_Context_destroy(hg_context);
/* TODO: debug this further later; HG_Finalize() always hangs at this point
* when using CCI/verbs right now.
*/
#if 0
HG_Finalize(hg_class);
#endif
MPI_Finalize();
ABT_finalize();
return 0;
}
/*
 * Parse the command line into *opts.
 *
 * Recognized options:
 *   -i <iterations>  number of RPC iterations; must be a decimal integer
 *                    in the range [1, INT_MAX] with no trailing characters
 *   -n <na>          NA transport string; stored as a heap copy
 *
 * Both options are required. On a malformed, out-of-range, unknown or
 * missing option the usage text is printed and the process exits with
 * EXIT_FAILURE, so this function only returns with both fields populated.
 */
static void parse_args(int argc, char **argv, struct options *opts)
{
    int opt;

    memset(opts, 0, sizeof(*opts));

    while((opt = getopt(argc, argv, "n:i:")) != -1)
    {
        switch(opt)
        {
            case 'i':
            {
                char *end = NULL;
                long value;

                /* strtol (unlike sscanf "%d") reports overflow through
                 * errno and lets us reject trailing garbage such as "10abc"
                 */
                errno = 0;
                value = strtol(optarg, &end, 10);
                if(end == optarg || *end != '\0' || errno == ERANGE
                    || value < 1 || value > INT_MAX)
                {
                    usage();
                    exit(EXIT_FAILURE);
                }
                opts->iterations = (int)value;
                break;
            }
            case 'n':
                /* a repeated -n replaces the earlier value; release the
                 * previous copy first (free(NULL) is a no-op)
                 */
                free(opts->na_transport);
                opts->na_transport = strdup(optarg);
                if(!opts->na_transport)
                {
                    perror("strdup");
                    exit(EXIT_FAILURE);
                }
                break;
            default:
                usage();
                exit(EXIT_FAILURE);
        }
    }

    /* both -i and -n are mandatory */
    if(opts->iterations < 1 || !opts->na_transport)
    {
        usage();
        exit(EXIT_FAILURE);
    }

    return;
}
/*
 * Print the command-line synopsis, the option descriptions and an example
 * invocation to stderr. Does not exit; the caller decides what happens next.
 */
static void usage(void)
{
    fprintf(stderr,
        "Usage: "
        "margo-p2p-latency -i <iterations> -n <na>\n"
        "\t-i <iterations> - number of RPC iterations\n"
        "\t-n <na> - na transport\n"
        "\t\texample: mpiexec -n 2 ./margo-p2p-latency -i 10000 -n verbs://\n"
        "\t\t(must be run with exactly 2 processes)\n");
    return;
}
/*
 * Service a remote RPC for a no-op: send an empty response, release the
 * handle and count the request. Once g_opts.iterations requests have been
 * serviced, rpcs_serviced_eventual is set.
 */
static void noop_ult(hg_handle_t handle)
{
    margo_instance_id mid;
    const struct hg_info *hgi;
    int ret;

    hgi = HG_Get_info(handle);
    assert(hgi);
    mid = margo_hg_class_to_instance(hgi->hg_class);

    /* do not silently drop a failed response; report it with assert() like
     * the other margo/HG calls in this file
     */
    ret = margo_respond(mid, handle, NULL);
    assert(ret == 0);
    (void)ret; /* keeps NDEBUG builds free of an unused-variable warning */

    HG_Destroy(handle);

    rpcs_serviced++;
    if(rpcs_serviced == g_opts.iterations)
    {
        ABT_eventual_set(rpcs_serviced_eventual, NULL, 0);
    }

    return;
}
DEFINE_MARGO_RPC_HANDLER(noop_ult)
/*
 * Issue `iterations` back-to-back RPCs with id `id` to group member
 * `target` of group `gid`, reusing a single handle, and store the elapsed
 * ABT_get_wtime() time of each margo_forward() call in
 * measurement_array[0 .. iterations-1].
 *
 * Always returns 0; failures of the underlying calls trip an assert().
 */
static int run_benchmark(int iterations, hg_id_t id, ssg_member_id_t target,
    ssg_group_id_t gid, margo_instance_id mid, hg_context_t *hg_context,
    double *measurement_array)
{
    hg_addr_t dest;
    hg_handle_t rpc;
    int rc;
    int n = 0;

    dest = ssg_get_addr(gid, target);
    assert(dest != HG_ADDR_NULL);

    rc = HG_Create(hg_context, dest, id, &rpc);
    assert(rc == 0);

    /* TODO: have command line option to toggle whether we reuse one handle
     * or create/release on every cycle
     */
    while(n < iterations)
    {
        double start;
        double stop;

        /* only the forward call itself sits between the two timestamps */
        start = ABT_get_wtime();
        rc = margo_forward(mid, rpc, NULL);
        stop = ABT_get_wtime();
        assert(rc == 0);

        measurement_array[n] = stop - start;
        n++;
    }

    HG_Destroy(rpc);

    return 0;
}
/*
 * Return the value at fractional rank `fraction` (0.0 .. 1.0) of the
 * ascending array `sorted`, linearly interpolating between the two nearest
 * samples. `count` must be at least 1; every index used stays inside
 * [0, count-1].
 */
static double sorted_quantile(const double *sorted, int count, double fraction)
{
    double pos = fraction * (double)(count - 1);
    int lower = (int)pos;
    int upper = (lower + 1 < count) ? lower + 1 : lower;
    double weight = pos - (double)lower;

    return sorted[lower] + weight * (sorted[upper] - sorted[lower]);
}

/*
 * Sort measurement_array in place (ascending) and print one tab-separated
 * summary line to stdout:
 *
 *   <op> <iterations> <size> <min> <q1> <med> <avg> <q3> <max>
 *
 * op                 label printed in the first column
 * size               value printed in the third column
 * iterations         number of samples in measurement_array
 * measurement_array  the samples; reordered by this call
 *
 * Nothing is printed when there are no samples. The median is the middle
 * sample for an odd count and the mean of the two middle samples for an
 * even count; q1 and q3 use the same interpolation at ranks 0.25 and 0.75.
 */
static void bench_routine_print(const char* op, int size, int iterations, double* measurement_array)
{
    double min, max, q1, q3, med, avg, sum;
    int i;

    /* without samples there is nothing to index or divide by */
    if(!measurement_array || iterations < 1)
        return;

    qsort(measurement_array, iterations, sizeof(double), measurement_cmp);

    min = measurement_array[0];
    max = measurement_array[iterations-1];

    sum = 0;
    for(i=0; i<iterations; i++)
    {
        sum += measurement_array[i];
    }
    avg = sum/(double)iterations;

    q1 = sorted_quantile(measurement_array, iterations, 0.25);
    med = sorted_quantile(measurement_array, iterations, 0.5);
    q3 = sorted_quantile(measurement_array, iterations, 0.75);

    printf("%s\t%d\t%d\t%.9f\t%.9f\t%.9f\t%.9f\t%.9f\t%.9f\n", op, iterations, size, min, q1, med, avg, q3, max);
#if 0
    for(i=0; i<iterations; i++)
    {
        printf("\t%.9f", measurement_array[i]);
    }
    printf("\n");
#endif
    fflush(NULL);

    return;
}
/*
 * qsort() comparator for doubles in ascending order: returns -1 when *a is
 * smaller than *b, 1 when it is larger, and 0 otherwise.
 */
static int measurement_cmp(const void* a, const void *b)
{
    const double lhs = *(const double *)a;
    const double rhs = *(const double *)b;

    return (lhs > rhs) - (lhs < rhs);
}
These are example scripts for executing an automated regression test on the
Theta system at the ALCF. The entire process is handled by the
"run-regression.sh" script, which is suitable for execution within a cron job.
#!/bin/bash
#COBALT -n 2
#COBALT -t 10
#COBALT --mode script
#COBALT -A radix-io
#COBALT -q debug-flat-quad
#COBALT -M carns@mcs.anl.gov
# Cobalt job script: launches the margo-p2p-latency benchmark from the
# current directory with 100000 iterations (-i) over the
# ofi+gni://ipogif0:5000 transport (-n).
# NOTE(review): "aprun -n 2 -N 1" presumably starts two processes, one per
# node -- confirm against the aprun documentation.
aprun -n 2 -N 1 ./margo-p2p-latency -i 100000 -n ofi+gni://ipogif0:5000
#!/bin/bash
# This is a shell script to be run from a login node of the Theta system at
# the ALCF, that will download, compile, and execute the ssg performance
# regression tests, including any dependencies
# exit on any error
set -e
SANDBOX=/tmp/mochi-regression-$$
PREFIX=~/tmp/mochi-regression-install-$$
JOBDIR=~/tmp/mochi-regression-job-$$
# gcc
module swap PrgEnv-intel PrgEnv-gnu
module load boost/gnu
export CC=gcc
export CFLAGS="-O3 -I$BOOST_ROOT/include"
export PKG_CONFIG_PATH="$PREFIX/lib/pkgconfig:$PKG_CONFIG_PATH"
export CRAYPE_LINK_TYPE=dynamic
# scratch area to clone and build things
mkdir $SANDBOX
# scratch area for job submission
mkdir $JOBDIR
cp margo-p2p-latency.qsub $JOBDIR
cd $SANDBOX
git clone https://github.com/ofiwg/libfabric.git
git clone https://github.com/pmodels/argobots.git
git clone https://github.com/mercury-hpc/mercury.git
wget http://dist.schmorp.de/libev/libev-4.24.tar.gz
tar -xvzf libev-4.24.tar.gz
git clone https://xgitlab.cels.anl.gov/sds/abt-snoozer.git
git clone https://xgitlab.cels.anl.gov/sds/margo.git
git clone https://xgitlab.cels.anl.gov/sds/ssg.git
# argobots
echo "=== BUILDING ARGOBOTS ==="
cd $SANDBOX/argobots
./autogen.sh
mkdir build
cd build
../configure --prefix=$PREFIX --enable-perf-opt
make -j 3
make install
# libfabric
echo "=== BUILDING LIBFABRIC ==="
cd $SANDBOX/libfabric
./autogen.sh
mkdir build
cd build
../configure --prefix=$PREFIX --enable-ugni-static --disable-rxd --disable-rxm --disable-udp --disable-usnic --disable-verbs --disable-sockets