ssg.c 18 KB
Newer Older
1
2
3
4
5
6
/*
 * Copyright (c) 2016 UChicago Argonne, LLC
 *
 * See COPYRIGHT in top-level directory.
 */

7
#include "ssg-config.h"
Shane Snyder's avatar
Shane Snyder committed
8

Jonathan Jenkins's avatar
Jonathan Jenkins committed
9
10
11
12
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
13
#include <errno.h>
Jonathan Jenkins's avatar
Jonathan Jenkins committed
14
15
#include <stdlib.h>
#include <string.h>
Jonathan Jenkins's avatar
Jonathan Jenkins committed
16
#include <assert.h>
17
18
19
#ifdef HAVE_MPI
#include <mpi.h>
#endif
Jonathan Jenkins's avatar
Jonathan Jenkins committed
20

Shane Snyder's avatar
Shane Snyder committed
21
#include <mercury.h>
22
#include <abt.h>
Shane Snyder's avatar
Shane Snyder committed
23
#include <margo.h>
24

25
#include "ssg.h"
26
#include "ssg-internal.h"
27
28
29
#if USE_SWIM_FD
#include "swim-fd/swim-fd.h"
#endif
30
#include "uthash.h"
31

32
/* SSG helper routine prototypes */
33
34
static int ssg_group_destroy_internal(
    ssg_group_t *g);
35
36
static hg_return_t ssg_group_lookup(
    ssg_group_t * g, const char * const addr_strs[]);
37
38
39
static void ssg_generate_group_id(
    const char * name, const char * leader_addr_str,
    ssg_group_id_t *group_id);
40
41
static const char ** ssg_setup_addr_str_list(
    char * buf, int num_addrs);
42

43
44
/* XXX: i think we ultimately need per-mid ssg instances rather than 1 global? */
ssg_instance_t *ssg_inst = NULL;
Shane Snyder's avatar
Shane Snyder committed
45

Shane Snyder's avatar
Shane Snyder committed
46
47
48
49
DECLARE_MARGO_RPC_HANDLER(ssg_attach_recv_ult)

static hg_id_t ssg_attach_rpc_id;

50
51
52
/****************************************************
 *** SSG runtime initialization/shutdown routines ***
 ****************************************************/
53

54
55
/*
 * Initialize the single, process-wide SSG instance on top of the given margo
 * instance and register the SSG RPCs with its Mercury class.
 *
 * Returns SSG_SUCCESS on success. Returns SSG_FAILURE if SSG is already
 * initialized, if no Mercury class can be obtained from `mid`, or if the
 * instance cannot be allocated.
 */
int ssg_init(
    margo_instance_id mid)
{
    hg_class_t *hg_cls;

    /* only one SSG instance may exist at a time */
    if (ssg_inst)
        return SSG_FAILURE;

    /* the RPC registration below needs a valid class; bail out before
     * allocating anything so there is nothing to undo */
    hg_cls = margo_get_class(mid);
    if (!hg_cls)
        return SSG_FAILURE;

    /* initialize a zero-filled SSG instance for this margo instance */
    ssg_inst = calloc(1, sizeof(*ssg_inst));
    if (!ssg_inst)
        return SSG_FAILURE;
    ssg_inst->mid = mid;

    /* register HG RPCs for SSG */
    ssg_attach_rpc_id = MERCURY_REGISTER(hg_cls, "ssg_attach", void, void,
        ssg_attach_recv_ult_handler);

    return SSG_SUCCESS;
}
Jonathan Jenkins's avatar
Jonathan Jenkins committed
75

76
int ssg_finalize()
77
{
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
    ssg_group_t *g, *tmp;

    if (!ssg_inst)
        return SSG_FAILURE;

    /* destroy all active groups */
    HASH_ITER(hh, ssg_inst->group_table, g, tmp)
    {
        HASH_DELETE(hh, ssg_inst->group_table, g);
        ssg_group_destroy_internal(g);
    }

    free(ssg_inst);
    ssg_inst = NULL;

93
    return SSG_SUCCESS;
94
}
Jonathan Jenkins's avatar
Jonathan Jenkins committed
95

96
97
98
/*************************************
 *** SSG group management routines ***
 *************************************/
Jonathan Jenkins's avatar
Jonathan Jenkins committed
99

100
/*
 * Create a group from an explicit list of member address strings.
 *
 * group_name      - name of the group; hashed to form the group identifier
 * group_addr_strs - array of `group_size` address strings; entry 0 is recorded
 *                   in the group identifier as the leader address
 * group_size      - number of entries in `group_addr_strs` (must be > 0)
 * group_id        - output; filled in only on success
 *
 * The caller's own address must appear in `group_addr_strs`; its index becomes
 * the caller's member id. Returns SSG_SUCCESS on success and SSG_FAILURE on any
 * error (SSG not initialized, invalid arguments, a group with the same name
 * hash already exists, allocation or address resolution failure).
 */
int ssg_group_create(
    const char * group_name,
    const char * const group_addr_strs[],
    int group_size,
    ssg_group_id_t * group_id)
{
    hg_class_t *hgcl = NULL;
    hg_addr_t self_addr = HG_ADDR_NULL;
    char *self_addr_str = NULL;
    hg_size_t self_addr_str_size = 0;
    const char *self_addr_substr = NULL;
    const char *addr_substr = NULL;
    int self_found = 0;
    int i;
    ssg_group_t *existing = NULL;
    ssg_group_t *g = NULL;
    ssg_group_id_t new_gid;
    hg_return_t hret;
    int sret = SSG_FAILURE;

    if (!ssg_inst) goto fini;

    /* reject arguments that would be dereferenced blindly below */
    if (!group_name || !group_addr_strs || !group_id || group_size <= 0)
        goto fini;
    for (i = 0; i < group_size; i++)
    {
        if (!group_addr_strs[i]) goto fini;
    }

    hgcl = margo_get_class(ssg_inst->mid);
    if (!hgcl) goto fini;

    /* generate a unique ID for this group  */
    ssg_generate_group_id(group_name, group_addr_strs[0], &new_gid);

    /* make sure we aren't re-adding an existing group; the result goes in its
     * own variable so the cleanup path never frees a group that is still
     * linked in the table
     */
    HASH_FIND(hh, ssg_inst->group_table, &new_gid.name_hash, sizeof(uint64_t),
        existing);
    if (existing) goto fini;

    /* allocate an SSG group data structure and initialize some of it */
    g = calloc(1, sizeof(*g));
    if (!g) goto fini;
    g->group_name = strdup(group_name);
    if (!g->group_name) goto fini;
    memcpy(&g->group_id, &new_gid, sizeof(new_gid));
    g->group_view.size = group_size;
    g->group_view.member_states = calloc(
        group_size, sizeof(*g->group_view.member_states));
    if (!g->group_view.member_states) goto fini;
    /* mark every slot unresolved so cleanup can tell which ones hold an
     * address that needs freeing */
    for (i = 0; i < group_size; i++)
        g->group_view.member_states[i].addr = HG_ADDR_NULL;

    /* get my address */
    hret = HG_Addr_self(hgcl, &self_addr);
    if (hret != HG_SUCCESS) goto fini;
    hret = HG_Addr_to_string(hgcl, NULL, &self_addr_str_size, self_addr);
    if (hret != HG_SUCCESS) goto fini;
    self_addr_str = malloc(self_addr_str_size);
    if (self_addr_str == NULL) goto fini;
    hret = HG_Addr_to_string(hgcl, self_addr_str, &self_addr_str_size, self_addr);
    if (hret != HG_SUCCESS) goto fini;

    /* strstr is used here b/c there may be inconsistencies in whether the class
     * is included in the address or not (it should not be in HG_Addr_to_string,
     * but it's possible that it is in the list of group address strings)
     */
    self_addr_substr = strstr(self_addr_str, "://");
    if (self_addr_substr == NULL)
        self_addr_substr = self_addr_str;
    else
        self_addr_substr += 3;

    /* resolve my rank within the group */
    for (i = 0; i < group_size; i++)
    {
        addr_substr = strstr(group_addr_strs[i], "://");
        if (addr_substr == NULL)
            addr_substr = group_addr_strs[i];
        else
            addr_substr += 3;

        if (strcmp(self_addr_substr, addr_substr) == 0)
        {
            /* this is my address -- my rank is the offset in the address array */
            g->self_id = i;
            g->group_view.member_states[i].addr = self_addr;
            self_found = 1;
        }
        g->group_view.member_states[i].is_member = 1;
    }

    /* if unable to resolve my rank within the group, error out; otherwise the
     * zero-filled self id would silently claim rank 0 and self_addr would
     * never be released
     */
    if (!self_found)
    {
        fprintf(stderr, "Error: SSG unable to resolve rank in group %s\n",
            group_name);
        goto fini;
    }

    /* lookup hg address information for all group members */
    hret = ssg_group_lookup(g, group_addr_strs);
    if (hret != HG_SUCCESS)
    {
        fprintf(stderr, "Error: SSG unable to complete lookup for group %s\n",
            group_name);
        goto fini;
    }
    SSG_DEBUG(g, "group lookup successful (size=%d)\n", group_size);

#if USE_SWIM_FD
    int swim_active = 1;
#ifdef SWIM_FORCE_FAIL
    /* NOTE(review): the rest of this file uses g->self_id; confirm that
     * self_rank still exists in ssg_group_t */
    if (g->self_rank == 1)
        swim_active = 0;
#endif

    /* initialize swim failure detector */
    // TODO: we should probably barrier or sync somehow to avoid rpc failures
    // due to timing skew of different ranks initializing swim
    g->fd_ctx = (void *)swim_init(g, swim_active);
    if (g->fd_ctx == NULL) goto fini;
#endif

    /* add this group reference to our group table */
    HASH_ADD(hh, ssg_inst->group_table, group_id.name_hash, sizeof(uint64_t), g);

    /* everything successful -- set the output for this call */
    memcpy(group_id, &new_gid, sizeof(new_gid));
    sret = SSG_SUCCESS;

    /* don't free these pointers on success */
    self_addr = HG_ADDR_NULL;
    g = NULL;
fini:
    if (g)
    {
        /* failure path: release peer addresses resolved so far; the self slot
         * aliases self_addr, which is freed exactly once below */
        if (g->group_view.member_states)
        {
            for (i = 0; i < group_size; i++)
            {
                hg_addr_t peer = g->group_view.member_states[i].addr;
                if (peer != HG_ADDR_NULL && peer != self_addr)
                    HG_Addr_free(hgcl, peer);
            }
        }
        free(g->group_name);
        free(g->group_view.member_states);
        free(g);
    }
    if (hgcl && self_addr != HG_ADDR_NULL) HG_Addr_free(hgcl, self_addr);
    free(self_addr_str);

    return sret;
}

243
/*
 * Create a group whose member addresses are listed in a config file.
 *
 * group_name - name of the group to create
 * file_name  - path of a text file holding whitespace-separated (space, tab,
 *              CR, LF) address strings, one per group member
 * group_id   - output; filled in by ssg_group_create on success
 *
 * The whole file is read into memory, split into tokens, repacked as a
 * sequence of NUL-terminated strings and handed to ssg_group_create.
 * Returns SSG_SUCCESS on success, SSG_FAILURE otherwise (including an
 * unreadable or empty file).
 */
int ssg_group_create_config(
    const char * group_name,
    const char * file_name,
    ssg_group_id_t * group_id)
{
    int fd = -1;
    struct stat st;
    char *rd_buf = NULL;
    ssize_t rd_buf_sz;
    char *tok;
    char *addr_buf = NULL;
    size_t addr_buf_len = 0;
    int num_addrs = 0;
    int ret;
    const char **addr_strs = NULL;
    int sret = SSG_FAILURE;

    /* both strings are passed to open()/fprintf("%s") below */
    if (!group_name || !file_name) goto fini;

    /* open config file for reading */
    fd = open(file_name, O_RDONLY);
    if (fd == -1)
    {
        fprintf(stderr, "Error: SSG unable to open config file %s for group %s\n",
            file_name, group_name);
        goto fini;
    }

    /* get file size and allocate a buffer to store it */
    ret = fstat(fd, &st);
    if (ret == -1)
    {
        fprintf(stderr, "Error: SSG unable to stat config file %s for group %s\n",
            file_name, group_name);
        goto fini;
    }
    rd_buf = malloc(st.st_size+1);
    if (rd_buf == NULL) goto fini;

    /* load it all in one fell swoop */
    rd_buf_sz = read(fd, rd_buf, st.st_size);
    if (rd_buf_sz != st.st_size)
    {
        fprintf(stderr, "Error: SSG unable to read config file %s for group %s\n",
            file_name, group_name);
        goto fini;
    }
    rd_buf[rd_buf_sz]='\0';

    /* strtok the result - each space-delimited address is assumed to be
     * a unique mercury address
     */
    tok = strtok(rd_buf, "\r\n\t ");
    if (tok == NULL) goto fini;

    /* build up the address buffer: every token is stored with its own NUL.
     * Tokens are separated by at least one delimiter, so the packed size is
     * at most the file size plus one (reached when the file does not end in
     * a delimiter) -- hence the +1.
     */
    addr_buf = malloc((size_t)rd_buf_sz + 1);
    if (addr_buf == NULL) goto fini;
    do
    {
        size_t tok_sz = strlen(tok) + 1; /* include the terminator */
        memcpy(addr_buf + addr_buf_len, tok, tok_sz);
        addr_buf_len += tok_sz;
        num_addrs++;
        tok = strtok(NULL, "\r\n\t ");
    } while (tok != NULL);

    /* set up address string array for group members */
    addr_strs = ssg_setup_addr_str_list(addr_buf, num_addrs);
    if (!addr_strs) goto fini;

    /* invoke the generic group create routine using our list of addrs */
    sret = ssg_group_create(group_name, addr_strs, num_addrs, group_id);

fini:
    /* cleanup before returning */
    if (fd != -1) close(fd);
    free(rd_buf);
    free(addr_buf);
    free(addr_strs);

    return sret;
}

#ifdef HAVE_MPI
332
int ssg_group_create_mpi(
333
    const char * group_name,
334
335
    MPI_Comm comm,
    ssg_group_id_t * group_id)
Jonathan Jenkins's avatar
Jonathan Jenkins committed
336
{
337
    hg_class_t *hgcl = NULL;
Jonathan Jenkins's avatar
Jonathan Jenkins committed
338
    hg_addr_t self_addr = HG_ADDR_NULL;
339
    char *self_addr_str = NULL;
340
341
    hg_size_t self_addr_str_size = 0;
    int self_addr_str_size_int = 0; /* for mpi-friendly conversion */
342
343
344
345
346
    char *addr_buf = NULL;
    int *sizes = NULL;
    int *sizes_psum = NULL;
    int comm_size = 0, comm_rank = 0;
    const char **addr_strs = NULL;
347
    hg_return_t hret;
348
    int sret = SSG_FAILURE;
349

350
351
352
    if (!ssg_inst) goto fini;

    hgcl = margo_get_class(ssg_inst->mid);
353
    if (!hgcl) goto fini;
Jonathan Jenkins's avatar
Jonathan Jenkins committed
354

355
    /* get my address */
Jonathan Jenkins's avatar
Jonathan Jenkins committed
356
357
    hret = HG_Addr_self(hgcl, &self_addr);
    if (hret != HG_SUCCESS) goto fini;
358
    hret = HG_Addr_to_string(hgcl, NULL, &self_addr_str_size, self_addr);
Shane Snyder's avatar
Shane Snyder committed
359
    if (hret != HG_SUCCESS) goto fini;
360
    self_addr_str = malloc(self_addr_str_size);
Jonathan Jenkins's avatar
Jonathan Jenkins committed
361
    if (self_addr_str == NULL) goto fini;
362
    hret = HG_Addr_to_string(hgcl, self_addr_str, &self_addr_str_size, self_addr);
Jonathan Jenkins's avatar
Jonathan Jenkins committed
363
    if (hret != HG_SUCCESS) goto fini;
364
    self_addr_str_size_int = (int)self_addr_str_size; /* null char included in call */
Jonathan Jenkins's avatar
Jonathan Jenkins committed
365

366
    /* gather the buffer sizes */
Jonathan Jenkins's avatar
Jonathan Jenkins committed
367
368
369
370
    MPI_Comm_size(comm, &comm_size);
    MPI_Comm_rank(comm, &comm_rank);
    sizes = malloc(comm_size * sizeof(*sizes));
    if (sizes == NULL) goto fini;
371
    sizes[comm_rank] = self_addr_str_size_int;
Jonathan Jenkins's avatar
Jonathan Jenkins committed
372
373
    MPI_Allgather(MPI_IN_PLACE, 0, MPI_BYTE, sizes, 1, MPI_INT, comm);

374
375
376
    /* compute a exclusive prefix sum of the data sizes, including the
     * total at the end
     */
Jonathan Jenkins's avatar
Jonathan Jenkins committed
377
378
379
380
381
382
    sizes_psum = malloc((comm_size+1) * sizeof(*sizes_psum));
    if (sizes_psum == NULL) goto fini;
    sizes_psum[0] = 0;
    for (int i = 1; i < comm_size+1; i++)
        sizes_psum[i] = sizes_psum[i-1] + sizes[i-1];

383
    /* allgather the addresses */
384
385
    addr_buf = malloc(sizes_psum[comm_size]);
    if (addr_buf == NULL) goto fini;
386
    MPI_Allgatherv(self_addr_str, self_addr_str_size_int, MPI_BYTE,
387
            addr_buf, sizes, sizes_psum, MPI_BYTE, comm);
Jonathan Jenkins's avatar
Jonathan Jenkins committed
388

389
390
391
392
393
    /* set up address string array for group members */
    addr_strs = ssg_setup_addr_str_list(addr_buf, comm_size);
    if (!addr_strs) goto fini;

    /* invoke the generic group create routine using our list of addrs */
394
    sret = ssg_group_create(group_name, addr_strs, comm_size, group_id);
Shane Snyder's avatar
Shane Snyder committed
395
396

fini:
397
    /* cleanup before returning */
Shane Snyder's avatar
Shane Snyder committed
398
399
    free(sizes);
    free(sizes_psum);
400
    free(addr_buf);
401
402
    if (hgcl && self_addr != HG_ADDR_NULL) HG_Addr_free(hgcl, self_addr);
    free(self_addr_str);
403
404
    free(addr_strs);

405
    return sret;
Shane Snyder's avatar
Shane Snyder committed
406
407
408
}
#endif

409
410
/*
 * Destroy the group identified by `group_id`: remove it from the group table
 * and release its resources.
 *
 * Returns SSG_FAILURE if SSG is not initialized or no group with this id's
 * name hash is registered; otherwise returns the result of the internal
 * destroy routine.
 */
int ssg_group_destroy(
    ssg_group_id_t group_id)
{
    ssg_group_t *g = NULL;

    if (!ssg_inst)
        return SSG_FAILURE;

    /* find the group structure */
    HASH_FIND(hh, ssg_inst->group_table, &group_id.name_hash, sizeof(uint64_t), g);

    /* an unknown id must not reach HASH_DELETE or the destroy helper, both of
     * which dereference the group pointer */
    if (!g)
        return SSG_FAILURE;

    /* unlink it from the table and destroy it */
    HASH_DELETE(hh, ssg_inst->group_table, g);
    return ssg_group_destroy_internal(g);
}

426
427
428
/*
 * Attach to an existing group.
 *
 * Currently a stub: the implementation sketch below is compiled out, so the
 * call does nothing and always returns SSG_SUCCESS.
 */
int ssg_group_attach(
    ssg_group_id_t group_id)
{
#if 0
    /* NOTE(review): this disabled sketch refers to `ssg_mid`, which is not
     * declared anywhere in the visible part of this file */
    hg_class_t *hgcl = NULL;
    hg_addr_t srvr_addr = HG_ADDR_NULL;
    hg_handle_t handle = HG_HANDLE_NULL;
    hg_return_t hret;

    hgcl = margo_get_class(ssg_mid);
    if (!hgcl) goto fini;

    /* lookup the address of the given group's leader server */
    hret = margo_addr_lookup(ssg_mid, group_id.addr_str, &srvr_addr);
    if (hret != HG_SUCCESS) goto fini;

    hret = HG_Create(margo_get_context(ssg_mid), srvr_addr, ssg_attach_rpc_id,
        &handle);
    if (hret != HG_SUCCESS) goto fini;

    /* XXX: send a request to the leader addr to attach to the group */
    hret = margo_forward(ssg_mid, handle, NULL);
    if (hret != HG_SUCCESS) goto fini;

    /* XXX: store the obtained view locally to refer to */

    /* TODO: hold on to leader addr so we don't have to look it up again? */
fini:
    if (hgcl && srvr_addr != HG_ADDR_NULL) HG_Addr_free(hgcl, srvr_addr);
    if (handle != HG_HANDLE_NULL) HG_Destroy(handle);
#endif

    return SSG_SUCCESS;
}

/*
 * Detach from a group.
 *
 * Currently a stub: `group_id` is not used and the call always returns
 * SSG_SUCCESS.
 */
int ssg_group_detach(
    ssg_group_id_t group_id)
{
    return SSG_SUCCESS;
}

468
469
470
/*********************************
 *** SSG group access routines ***
 *********************************/
Shane Snyder's avatar
Shane Snyder committed
471

Shane Snyder's avatar
Shane Snyder committed
472
ssg_member_id_t ssg_get_group_self_id(
473
    ssg_group_id_t group_id)
Shane Snyder's avatar
Shane Snyder committed
474
{
Shane Snyder's avatar
Shane Snyder committed
475
476
477
478
479
480
481
482
483
484
    ssg_group_t *g;

    if (!ssg_inst)
        return SSG_MEMBER_ID_INVALID;

    HASH_FIND(hh, ssg_inst->group_table, &group_id.name_hash, sizeof(uint64_t), g);
    if (!g)
        return SSG_MEMBER_ID_INVALID;

    return g->self_id;
Shane Snyder's avatar
Shane Snyder committed
485
486
}

487
488
/*
 * Return the number of entries in the given group's view, or 0 if SSG is not
 * initialized or the group is unknown.
 */
int ssg_get_group_size(
    ssg_group_id_t group_id)
{
    ssg_group_t *grp = NULL;

    if (ssg_inst != NULL)
    {
        /* groups are keyed by the 64-bit hash of their name */
        HASH_FIND(hh, ssg_inst->group_table, &group_id.name_hash,
            sizeof(uint64_t), grp);
    }

    if (grp != NULL)
        return grp->group_view.size;

    return 0;
}

502
503
504
/*
 * Return the Mercury address stored for member `member_id` of the given group.
 *
 * Returns HG_ADDR_NULL if SSG is not initialized, the group is unknown, or
 * `member_id` does not index an entry of the group view. The returned address
 * is the one held in the group view, not a copy.
 */
hg_addr_t ssg_get_addr(
    ssg_group_id_t group_id,
    ssg_member_id_t member_id)
{
    ssg_group_t *g = NULL;

    if (!ssg_inst)
        return HG_ADDR_NULL;

    HASH_FIND(hh, ssg_inst->group_table, &group_id.name_hash, sizeof(uint64_t), g);
    if (!g)
        return HG_ADDR_NULL;

    /* compare as unsigned 64-bit values so that an invalid, negative or
     * oversized id is rejected whatever the signedness of the id type */
    if (g->group_view.size <= 0 ||
        (uint64_t)member_id >= (uint64_t)g->group_view.size)
        return HG_ADDR_NULL;

    /* index by the requested member, not by the caller's own id */
    return g->group_view.member_states[member_id].addr;
}

518
519
520
/************************************
 *** SSG internal helper routines ***
 ************************************/
521

522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
/*
 * Release everything owned by a group that has already been unlinked from the
 * group table: failure detector state (when built with SWIM), every non-NULL
 * member address, the member state array, the name and the group itself.
 *
 * `g` must be non-NULL. Always returns SSG_SUCCESS.
 */
static int ssg_group_destroy_internal(ssg_group_t *g)
{
    int member;

    /* TODO: send a leave message to the group ? */

#if USE_SWIM_FD
    /* free up failure detector state */
    if (g->fd_ctx)
        swim_finalize(g->fd_ctx);
#endif

    /* destroy group state, skipping slots that never got an address */
    for (member = 0; member < g->group_view.size; member++)
    {
        if (g->group_view.member_states[member].addr == HG_ADDR_NULL)
            continue;

        HG_Addr_free(margo_get_class(ssg_inst->mid),
            g->group_view.member_states[member].addr);
    }

    free(g->group_view.member_states);
    free(g->group_name);
    free(g);

    return SSG_SUCCESS;
}

550
static void ssg_lookup_ult(void * arg);
Jonathan Jenkins's avatar
Jonathan Jenkins committed
551
552
struct lookup_ult_args
{
553
    ssg_group_t *g;
554
    ssg_member_id_t member_id;
555
    const char *addr_str;
Jonathan Jenkins's avatar
Jonathan Jenkins committed
556
557
558
    hg_return_t out;
};

559
560
/*
 * Resolve the Mercury addresses of all group members other than the caller.
 *
 * One ULT per remote member is created on the margo handler pool; each runs
 * ssg_lookup_ult, which stores the resolved address directly in the group
 * view. Members are visited starting from the one after the caller's own id
 * and wrapping around, so the caller's own slot is never looked up.
 *
 * g         - group whose view is filled in (self_id and group_view.size must
 *             already be set)
 * addr_strs - address strings, indexed by member id
 *
 * Returns HG_SUCCESS if every lookup succeeded, HG_INVALID_PARAM for NULL
 * arguments, HG_NOMEM_ERROR on allocation failure, HG_OTHER_ERROR if a ULT
 * could not be created or joined, or the first failing lookup's error code.
 */
static hg_return_t ssg_group_lookup(
    ssg_group_t * g, const char * const addr_strs[])
{
    ABT_thread *ults;
    struct lookup_ult_args *args;
    int i, r;
    int aret;
    hg_return_t hret = HG_SUCCESS;

    if (g == NULL || addr_strs == NULL) return HG_INVALID_PARAM;

    /* initialize ULTs */
    ults = malloc(g->group_view.size * sizeof(*ults));
    if (ults == NULL) return HG_NOMEM_ERROR;
    args = malloc(g->group_view.size * sizeof(*args));
    if (args == NULL)
    {
        free(ults);
        return HG_NOMEM_ERROR;
    }
    for (i = 0; i < g->group_view.size; i++)
        ults[i] = ABT_THREAD_NULL;

    /* start at i = 1 so that r never equals self_id */
    for (i = 1; i < g->group_view.size; i++)
    {
        r = (g->self_id + i) % g->group_view.size;
        args[r].g = g;
        args[r].member_id = r;
        args[r].addr_str = addr_strs[r];
        aret = ABT_thread_create(*margo_get_handler_pool(ssg_inst->mid),
                &ssg_lookup_ult, &args[r], ABT_THREAD_ATTR_NULL, &ults[r]);
        if (aret != ABT_SUCCESS) {
            hret = HG_OTHER_ERROR;
            goto fini;
        }
    }

    /* wait on all */
    for (i = 1; i < g->group_view.size; i++)
    {
        r = (g->self_id + i) % g->group_view.size;
        aret = ABT_thread_join(ults[r]);
        ABT_thread_free(&ults[r]);
        ults[r] = ABT_THREAD_NULL; // in case of cascading failure from join
        if (aret != ABT_SUCCESS)
        {
            hret = HG_OTHER_ERROR;
            break;
        }
        else if (args[r].out != HG_SUCCESS)
        {
            fprintf(stderr, "Error: SSG unable to lookup HG address for rank %d"
                "(err=%d)\n", r, args[r].out);
            hret = args[r].out;
            break;
        }
    }

fini:
    /* cleanup: any ULT still outstanding here was left behind by an error */
    for (i = 0; i < g->group_view.size; i++)
    {
        if (ults[i] != ABT_THREAD_NULL)
        {
            ABT_thread_cancel(ults[i]);
            /* ABT_thread_free takes the address of the handle, as in the
             * join loop above */
            ABT_thread_free(&ults[i]);
        }
    }
    free(ults);
    free(args);

    return hret;
}
Jonathan Jenkins's avatar
Jonathan Jenkins committed
632

633
634
static void ssg_lookup_ult(
    void * arg)
Jonathan Jenkins's avatar
Jonathan Jenkins committed
635
{
636
637
    struct lookup_ult_args *l = arg;
    ssg_group_t *g = l->g;
Jonathan Jenkins's avatar
Jonathan Jenkins committed
638

639
    l->out = margo_addr_lookup(ssg_inst->mid, l->addr_str,
640
        &g->group_view.member_states[l->member_id].addr);
641
642
    return;
}
643

Shane Snyder's avatar
Shane Snyder committed
644
645
646
647
648
649
650
/*
 * Handler body for the "ssg_attach" RPC. Attach is not implemented yet, so
 * the handler only destroys the incoming handle.
 */
static void ssg_attach_recv_ult(hg_handle_t handle)
{
    HG_Destroy(handle);
}
DEFINE_MARGO_RPC_HANDLER(ssg_attach_recv_ult)

651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
/*
 * Fill in a group identifier from a group name and the leader's address.
 *
 * name            - group name; hashed into the 64-bit name_hash
 * leader_addr_str - address string stored in the id's addr_str field; it is
 *                   truncated if it does not fit
 * group_id        - output identifier
 */
static void ssg_generate_group_id(
    const char * name, const char * leader_addr_str,
    ssg_group_id_t *group_id)
{
    /* zero-initialized in case the hash routine also reads these as seed
     * values, as lookup3's hashlittle2 does -- NOTE(review): confirm against
     * ssg_hashlittle2 */
    uint32_t upper = 0, lower = 0;
    size_t addr_len;

    /* hash the group name to obtain an 64-bit unique ID */
    ssg_hashlittle2(name, strlen(name), &lower, &upper);

    group_id->magic_nr = SSG_MAGIC_NR;
    group_id->name_hash = lower + (((uint64_t)upper)<<32);

    /* bounded copy of the leader address; assumes addr_str is a fixed-size
     * char array inside ssg_group_id_t (TODO confirm in ssg.h). Clearing the
     * field first keeps the id free of stale bytes when it is later copied
     * around with memcpy.
     */
    memset(group_id->addr_str, 0, sizeof(group_id->addr_str));
    addr_len = strlen(leader_addr_str);
    if (addr_len >= sizeof(group_id->addr_str))
        addr_len = sizeof(group_id->addr_str) - 1;
    memcpy(group_id->addr_str, leader_addr_str, addr_len);

    return;
}

667
668
669
670
671
672
673
/*
 * Build an array of pointers into a buffer of back-to-back NUL-terminated
 * strings.
 *
 * buf       - buffer holding `num_addrs` consecutive NUL-terminated strings
 * num_addrs - number of strings in `buf`
 *
 * Returns a malloc'd array of `num_addrs` pointers (entry i points at the
 * i-th string inside `buf`; nothing is copied), or NULL if `buf` is NULL,
 * `num_addrs` is not positive, or allocation fails. The caller frees the
 * array; the strings remain owned by `buf`.
 */
static const char ** ssg_setup_addr_str_list(
    char * buf, int num_addrs)
{
    const char **ret;

    /* with no strings there is no valid first entry to point at */
    if (buf == NULL || num_addrs <= 0) return NULL;

    ret = malloc((size_t)num_addrs * sizeof(*ret));
    if (ret == NULL) return NULL;

    ret[0] = buf;
    for (int i = 1; i < num_addrs; i++)
    {
        /* the next string starts right after the previous terminator */
        const char * a = ret[i-1];
        ret[i] = a + strlen(a) + 1;
    }
    return ret;
}