Commit a9853488 authored by Shane Snyder's avatar Shane Snyder

implemented and tested pmix failure notify

parent 345820bf
......@@ -183,11 +183,11 @@ int ssg_group_detach(
/**
* Obtains the caller's member ID in the given SSG group.
*
* @param[in] group_id SSG group ID
* @param[in] mid Corresponding Margo instance identifier
* @returns caller's member ID on success, SSG_MEMBER_ID_INVALID otherwise
*/
ssg_member_id_t ssg_get_group_self_id(
ssg_group_id_t group_id);
ssg_member_id_t ssg_get_self_id(
margo_instance_id mid);
/**
* Obtains the size of a given SSG group.
......@@ -205,7 +205,7 @@ int ssg_get_group_size(
* @param[in] member_id SSG group member ID
* @returns HG address of given group member on success, HG_ADDR_NULL otherwise
*/
hg_addr_t ssg_get_addr(
hg_addr_t ssg_get_group_addr(
ssg_group_id_t group_id,
ssg_member_id_t member_id);
......
......@@ -6,6 +6,8 @@
#pragma once
#include "ssg-config.h"
#include <stdint.h>
#include <inttypes.h>
......@@ -25,34 +27,12 @@ extern "C" {
#define SSG_MAGIC_NR 17321588
/* SSG_GET_SELF_ADDR_STR(__mid, __addr_str)
 *
 * Sets __addr_str to a malloc()'d string form of the caller's own address
 * for Margo instance __mid, or to NULL if any step fails.
 *
 * Steps: obtain the self address handle, call margo_addr_to_string() with a
 * NULL buffer to learn the required size, allocate that many bytes, then
 * call margo_addr_to_string() again to fill the buffer. Each failure path
 * uses `break` to leave the do/while(0) early; once the self address handle
 * has been obtained it is released with margo_addr_free() on every path.
 *
 * On success the resulting string is heap-allocated by this macro, so the
 * code that invokes it is responsible for free()ing __addr_str.
 */
#define SSG_GET_SELF_ADDR_STR(__mid, __addr_str) do { \
hg_addr_t __self_addr; \
hg_size_t __size; \
__addr_str = NULL; \
if (margo_addr_self(__mid, &__self_addr) != HG_SUCCESS) break; \
if (margo_addr_to_string(__mid, NULL, &__size, __self_addr) != HG_SUCCESS) { \
margo_addr_free(__mid, __self_addr); \
break; \
} \
if ((__addr_str = malloc(__size)) == NULL) { \
margo_addr_free(__mid, __self_addr); \
break; \
} \
if (margo_addr_to_string(__mid, __addr_str, &__size, __self_addr) != HG_SUCCESS) { \
free(__addr_str); \
__addr_str = NULL; \
margo_addr_free(__mid, __self_addr); \
break; \
} \
margo_addr_free(__mid, __self_addr); \
} while(0)
/* debug printing macro for SSG */
#ifdef DEBUG
/* SSG_DEBUG(__g, __fmt, ...)
 *
 * Writes a debug message to the group's dbg_log stream and flushes it.
 * Each fprintf() prefixes the message with the ABT_get_wtime() timestamp,
 * a member ID printed via PRIu64, and the group name, followed by the
 * caller-supplied format and arguments.
 *
 * __fmt is pasted next to the prefix string literal, so it must itself be a
 * string literal. `## __VA_ARGS__` is the GNU extension that drops the
 * preceding comma when no variadic arguments are passed.
 */
#define SSG_DEBUG(__g, __fmt, ...) do { \
double __now = ABT_get_wtime(); \
fprintf(__g->dbg_log, "[%.6lf] %20"PRIu64" (%s): " __fmt, __now, \
__g->self_id, __g->name, ## __VA_ARGS__); \
fprintf(__g->dbg_log, "%.6lf %20"PRIu64" (%s): " __fmt, __now, \
__g->ssg_inst->self_id, __g->name, ## __VA_ARGS__); \
fflush(__g->dbg_log); \
} while(0)
#else
......@@ -62,6 +42,19 @@ extern "C" {
/* SSG internal datatypes */
/* Top-level SSG state. The library currently keeps a single global pointer
 * to one of these (ssg_inst), created by ssg_init() and released by
 * ssg_finalize().
 */
typedef struct ssg_instance
{
/* Margo instance this SSG instance was initialized with */
margo_instance_id mid;
/* heap-allocated string form of this process's own address; filled in by
 * ssg_init() via margo_addr_to_string() and freed in ssg_finalize() */
char *self_addr_str;
/* this process's member ID, generated from self_addr_str with
 * ssg_gen_member_id() in ssg_init() */
ssg_member_id_t self_id;
/* hash table of groups, looked up by the descriptor's name_hash;
 * NOTE(review): presumably only groups this process is a member of -- confirm */
struct ssg_group *group_table;
/* hash table of attached groups;
 * NOTE(review): presumably filled by ssg_group_attach() -- confirm */
struct ssg_attached_group *attached_group_table;
#ifdef SSG_HAVE_PMIX
/* reference for the PMIx process-failure event handler; a non-zero value is
 * treated as "registered" both before registering in ssg_group_create_pmix()
 * and before deregistering in ssg_finalize() */
size_t pmix_failure_evhdlr_ref;
#endif
/* reader/writer lock taken around accesses to the tables above */
ABT_rwlock lock;
} ssg_instance_t;
/* TODO: associate a version number with a descriptor? */
typedef struct ssg_group_descriptor
{
......@@ -72,6 +65,13 @@ typedef struct ssg_group_descriptor
int ref_count;
} ssg_group_descriptor_t;
/* Records how the holder of a group descriptor relates to the group the
 * descriptor names (stored in the descriptor's owner_status field).
 */
enum ssg_group_descriptor_owner_status
{
/* NOTE(review): presumably a descriptor not yet tied to a created/attached
 * group -- confirm against ssg_group_descriptor_create() callers */
SSG_OWNER_IS_UNASSOCIATED = 0,
/* used for descriptors created by ssg_group_create_internal() */
SSG_OWNER_IS_MEMBER,
/* set by ssg_group_attach() on the attached group's descriptor */
SSG_OWNER_IS_ATTACHER
};
typedef struct ssg_member_state
{
ssg_member_id_t id;
......@@ -81,16 +81,6 @@ typedef struct ssg_member_state
UT_hash_handle hh;
} ssg_member_state_t;
/* Describes a single group membership update. */
typedef struct ssg_member_update
{
/* kind of update being reported */
ssg_member_update_type_t type;
/* payload identifying the affected member, either by address string or by
 * member ID.
 * NOTE(review): presumably `type` selects which field is valid -- confirm */
union
{
char *member_addr_str;
ssg_member_id_t member_id;
} u;
} ssg_member_update_t;
typedef struct ssg_group_view
{
unsigned int size;
......@@ -100,7 +90,7 @@ typedef struct ssg_group_view
typedef struct ssg_group
{
char *name;
ssg_member_id_t self_id;
ssg_instance_t *ssg_inst;
ssg_group_view_t view;
ssg_member_state_t *dead_members;
ssg_group_descriptor_t *descriptor;
......@@ -117,26 +107,22 @@ typedef struct ssg_group
/* State kept for a group that this process has attached to (see
 * ssg_group_attach()), as opposed to a group it is a member of.
 */
typedef struct ssg_attached_group
{
/* group name string */
char *name;
/* back-pointer to the SSG instance state */
ssg_instance_t *ssg_inst;
/* view of the group's membership, built by ssg_group_view_create() */
ssg_group_view_t view;
/* group descriptor; ssg_group_attach() marks it SSG_OWNER_IS_ATTACHER */
ssg_group_descriptor_t *descriptor;
/* reader/writer lock; also handed to ssg_group_view_create() as the view lock */
ABT_rwlock lock;
/* uthash handle that lets this struct be stored in a hash table */
UT_hash_handle hh;
} ssg_attached_group_t;
/* Top-level SSG state: the Margo instance in use plus the tables of groups
 * known to this process.
 */
typedef struct ssg_instance
{
/* Margo instance this SSG instance was initialized with */
margo_instance_id mid;
/* hash table of groups, looked up by the descriptor's name_hash */
ssg_group_t *group_table;
/* hash table of attached groups;
 * NOTE(review): presumably filled by ssg_group_attach() -- confirm */
ssg_attached_group_t *attached_group_table;
/* reader/writer lock taken around accesses to the tables above */
ABT_rwlock lock;
} ssg_instance_t;
enum ssg_group_descriptor_owner_status
typedef struct ssg_member_update
{
SSG_OWNER_IS_UNASSOCIATED = 0,
SSG_OWNER_IS_MEMBER,
SSG_OWNER_IS_ATTACHER
};
ssg_member_update_type_t type;
union
{
char *member_addr_str;
ssg_member_id_t member_id;
} u;
} ssg_member_update_t;
/* SSG internal function prototypes */
......
......@@ -105,7 +105,6 @@ int ssg_group_join_send(
hg_bulk_t bulk_handle = HG_BULK_NULL;
void *tmp_view_buf = NULL, *b;
hg_size_t tmp_view_buf_size = SSG_VIEW_BUF_DEF_SIZE;
char *self_addr_str = NULL;
ssg_group_join_request_t join_req;
ssg_group_join_response_t join_resp;
hg_return_t hret;
......@@ -119,9 +118,6 @@ int ssg_group_join_send(
ssg_group_join_rpc_id, &handle);
if (hret != HG_SUCCESS) goto fini;
SSG_GET_SELF_ADDR_STR(ssg_inst->mid, self_addr_str);
if (!self_addr_str) goto fini;
/* allocate a buffer to try to store the group view in */
/* NOTE: We don't know if this buffer is big enough to store the complete
* view. If the buffer is not large enough, the group member we are
......@@ -137,7 +133,7 @@ int ssg_group_join_send(
/* send a join request to the given group member address */
/* XXX is the whole descriptor really needed? */
memcpy(&join_req.group_descriptor, group_descriptor, sizeof(*group_descriptor));
join_req.addr_str = self_addr_str;
join_req.addr_str = ssg_inst->self_addr_str;
join_req.bulk_handle = bulk_handle;
hret = margo_forward(handle, &join_req);
if (hret != HG_SUCCESS) goto fini;
......@@ -197,7 +193,6 @@ fini:
if (handle != HG_HANDLE_NULL) margo_destroy(handle);
if (bulk_handle != HG_BULK_NULL) margo_bulk_free(bulk_handle);
free(tmp_view_buf);
free(self_addr_str);
return sret;
}
......@@ -561,7 +556,6 @@ DEFINE_MARGO_RPC_HANDLER(ssg_group_attach_recv_ult)
static int ssg_group_serialize(
ssg_group_t *g, void **buf, hg_size_t *buf_size)
{
char *self_addr_str;
ssg_member_state_t *member_state, *tmp;
hg_size_t group_buf_size = 0;
void *group_buf;
......@@ -570,11 +564,8 @@ static int ssg_group_serialize(
*buf = NULL;
*buf_size = 0;
SSG_GET_SELF_ADDR_STR(ssg_inst->mid, self_addr_str);
if (!self_addr_str) return SSG_FAILURE;
/* first determine size */
group_buf_size = strlen(self_addr_str) + 1;
group_buf_size = strlen(ssg_inst->self_addr_str) + 1;
HASH_ITER(hh, g->view.member_map, member_state, tmp)
{
group_buf_size += strlen(member_state->addr_str) + 1;
......@@ -583,13 +574,12 @@ static int ssg_group_serialize(
group_buf = malloc(group_buf_size);
if(!group_buf)
{
free(self_addr_str);
return SSG_FAILURE;
}
buf_p = group_buf;
strcpy(buf_p, self_addr_str);
buf_p += strlen(self_addr_str) + 1;
strcpy(buf_p, ssg_inst->self_addr_str);
buf_p += strlen(ssg_inst->self_addr_str) + 1;
HASH_ITER(hh, g->view.member_map, member_state, tmp)
{
str_p = member_state->addr_str;
......@@ -599,7 +589,6 @@ static int ssg_group_serialize(
*buf = group_buf;
*buf_size = group_buf_size;
free(self_addr_str);
return SSG_SUCCESS;
}
......
......@@ -54,7 +54,7 @@ static ssg_group_t * ssg_group_create_internal(
static int ssg_group_view_create(
const char * const group_addr_strs[], int group_size,
const char * self_addr_str, ABT_rwlock view_lock,
ssg_group_view_t * view, ssg_member_id_t * self_id);
ssg_group_view_t * view);
static ssg_member_state_t * ssg_group_view_add_member(
const char * addr_str, hg_addr_t addr, ssg_member_id_t member_id,
ssg_group_view_t * view);
......@@ -83,7 +83,7 @@ void ssg_pmix_proc_failure_reg_cb(
pmix_status_t status, size_t evhdlr_ref, void *cbdata);
#endif
/* XXX: i think we ultimately need per-mid ssg instances rather than 1 global? */
/* XXX: we ultimately need per-mid ssg instances rather than 1 global */
ssg_instance_t *ssg_inst = NULL;
/***************************************************
......@@ -94,6 +94,8 @@ int ssg_init(
margo_instance_id mid)
{
struct timespec ts;
hg_addr_t self_addr;
hg_size_t self_addr_str_size;
if (ssg_inst)
return SSG_FAILURE;
......@@ -112,22 +114,35 @@ int ssg_init(
clock_gettime(CLOCK_MONOTONIC, &ts);
srand(ts.tv_nsec + getpid());
#ifdef SSG_HAVE_PMIX
if (PMIx_Initialized())
/* get my self address string and ID (which are constant per-mid) */
if (margo_addr_self(mid, &self_addr) != HG_SUCCESS)
{
/* use PMIx event registrations to inform us of terminated/aborted procs */
pmix_status_t err_codes[2] = {PMIX_PROC_TERMINATED, PMIX_ERR_PROC_ABORTED};
PMIx_Register_event_handler(err_codes, 2, NULL, 0,
ssg_pmix_proc_failure_notify_fn, ssg_pmix_proc_failure_reg_cb, NULL /* XXX */);
free(ssg_inst);
return SSG_FAILURE;
}
else
if (margo_addr_to_string(mid, NULL, &self_addr_str_size, self_addr) != HG_SUCCESS)
{
margo_addr_free(mid, self_addr);
free(ssg_inst);
return SSG_FAILURE;
}
if ((ssg_inst->self_addr_str = malloc(self_addr_str_size)) == NULL)
{
fprintf(stderr, "Warning: skipping PMIx event notification registration -- "\
"PMIx not initialized\n");
margo_addr_free(mid, self_addr);
free(ssg_inst);
return SSG_FAILURE;
}
if (margo_addr_to_string(mid, ssg_inst->self_addr_str, &self_addr_str_size, self_addr) != HG_SUCCESS)
{
free(ssg_inst->self_addr_str);
margo_addr_free(mid, self_addr);
free(ssg_inst);
return SSG_FAILURE;
}
#endif
ssg_inst->self_id = ssg_gen_member_id(ssg_inst->self_addr_str);
margo_addr_free(mid, self_addr);
return SSG_SUCCESS;
}
......@@ -141,6 +156,11 @@ int ssg_finalize()
ABT_rwlock_wrlock(ssg_inst->lock);
#ifdef SSG_HAVE_PMIX
if (ssg_inst->pmix_failure_evhdlr_ref)
PMIx_Deregister_event_handler(ssg_inst->pmix_failure_evhdlr_ref, NULL, NULL);
#endif
/* destroy all active groups */
HASH_ITER(hh, ssg_inst->group_table, g, g_tmp)
{
......@@ -159,6 +179,7 @@ int ssg_finalize()
ABT_rwlock_unlock(ssg_inst->lock);
ABT_rwlock_free(&ssg_inst->lock);
free(ssg_inst->self_addr_str);
free(ssg_inst);
ssg_inst = NULL;
......@@ -285,7 +306,6 @@ ssg_group_id_t ssg_group_create_mpi(
void * update_cb_dat)
{
int i;
char *self_addr_str = NULL;
int self_addr_str_size = 0;
char *addr_str_buf = NULL;
int *sizes = NULL;
......@@ -296,16 +316,12 @@ ssg_group_id_t ssg_group_create_mpi(
if (!ssg_inst) goto fini;
/* get my address */
SSG_GET_SELF_ADDR_STR(ssg_inst->mid, self_addr_str);
if (self_addr_str == NULL) goto fini;
self_addr_str_size = (int)strlen(self_addr_str) + 1;
/* gather the buffer sizes */
MPI_Comm_size(comm, &comm_size);
MPI_Comm_rank(comm, &comm_rank);
sizes = malloc(comm_size * sizeof(*sizes));
if (sizes == NULL) goto fini;
self_addr_str_size = (int)strlen(ssg_inst->self_addr_str) + 1;
sizes[comm_rank] = self_addr_str_size;
MPI_Allgather(MPI_IN_PLACE, 0, MPI_BYTE, sizes, 1, MPI_INT, comm);
......@@ -321,7 +337,7 @@ ssg_group_id_t ssg_group_create_mpi(
/* allgather the addresses */
addr_str_buf = malloc(sizes_psum[comm_size]);
if (addr_str_buf == NULL) goto fini;
MPI_Allgatherv(self_addr_str, self_addr_str_size, MPI_BYTE,
MPI_Allgatherv(ssg_inst->self_addr_str, self_addr_str_size, MPI_BYTE,
addr_str_buf, sizes, sizes_psum, MPI_BYTE, comm);
/* set up address string array for group members */
......@@ -334,7 +350,6 @@ ssg_group_id_t ssg_group_create_mpi(
fini:
/* cleanup before returning */
free(self_addr_str);
free(sizes);
free(sizes_psum);
free(addr_str_buf);
......@@ -351,13 +366,12 @@ ssg_group_id_t ssg_group_create_pmix(
ssg_membership_update_cb update_cb,
void * update_cb_dat)
{
char *self_addr_str = NULL;
pmix_proc_t tmp_proc;
pmix_value_t value;
pmix_value_t *val_p;
pmix_value_t *addr_vals = NULL;
unsigned int nprocs;
char key[128];
char key[512];
pmix_info_t *info;
bool flag;
const char **addr_strs = NULL;
......@@ -367,9 +381,49 @@ ssg_group_id_t ssg_group_create_pmix(
if (!ssg_inst || !PMIx_Initialized()) goto fini;
/* get my address */
SSG_GET_SELF_ADDR_STR(ssg_inst->mid, self_addr_str);
if (self_addr_str == NULL) goto fini;
/* XXX config switch for this functionality */
/* if not already done, register for PMIx process failure notifications */
if (!ssg_inst->pmix_failure_evhdlr_ref)
{
/* use PMIx event registrations to inform us of terminated/aborted procs */
pmix_status_t err_codes[2] = {PMIX_PROC_TERMINATED, PMIX_ERR_PROC_ABORTED};
PMIx_Register_event_handler(err_codes, 2, NULL, 0,
ssg_pmix_proc_failure_notify_fn, ssg_pmix_proc_failure_reg_cb,
&ssg_inst->pmix_failure_evhdlr_ref);
/* exchange information needed to map PMIx ranks to SSG member IDs */
snprintf(key, 512, "ssg-%s-%d-id", proc.nspace, proc.rank);
PMIX_VALUE_LOAD(&value, &ssg_inst->self_id, PMIX_UINT64);
ret = PMIx_Put(PMIX_GLOBAL, key, &value);
if (ret != PMIX_SUCCESS)
{
fprintf(stderr, "Warning: skipping PMIx event notification registration -- "\
"Unable to put PMIx rank mapping\n");
PMIx_Deregister_event_handler(ssg_inst->pmix_failure_evhdlr_ref, NULL, NULL);
}
/* commit the put data to the local pmix server */
ret = PMIx_Commit();
if (ret != PMIX_SUCCESS)
{
fprintf(stderr, "Warning: skipping PMIx event notification registration -- "\
"Unable to commit PMIx rank mapping\n");
PMIx_Deregister_event_handler(ssg_inst->pmix_failure_evhdlr_ref, NULL, NULL);
}
/* barrier, additionally requesting to collect relevant process data */
PMIX_INFO_CREATE(info, 1);
flag = true;
PMIX_INFO_LOAD(info, PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
ret = PMIx_Fence(&proc, 1, info, 1);
if (ret != PMIX_SUCCESS)
{
fprintf(stderr, "Warning: skipping PMIx event notification registration -- "\
"Unable to exchange PMIx rank mapping\n");
PMIx_Deregister_event_handler(ssg_inst->pmix_failure_evhdlr_ref, NULL, NULL);
}
PMIX_INFO_FREE(info, 1);
}
/* XXX note we are assuming every process in the job wants to join this group... */
/* get the total nprocs in the job */
......@@ -380,8 +434,8 @@ ssg_group_id_t ssg_group_create_pmix(
PMIX_VALUE_RELEASE(val_p);
/* put my address string using a well-known key */
if (snprintf(key, 128, "%s-%d-hg-addr", proc.nspace, proc.rank) >= 128) goto fini;
PMIX_VALUE_LOAD(&value, self_addr_str, PMIX_STRING);
snprintf(key, 512, "ssg-%s-%s-%d-hg-addr", group_name, proc.nspace, proc.rank);
PMIX_VALUE_LOAD(&value, ssg_inst->self_addr_str, PMIX_STRING);
ret = PMIx_Put(PMIX_GLOBAL, key, &value);
if (ret != PMIX_SUCCESS) goto fini;
......@@ -394,6 +448,7 @@ ssg_group_id_t ssg_group_create_pmix(
flag = true;
PMIX_INFO_LOAD(info, PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
ret = PMIx_Fence(&proc, 1, info, 1);
if (ret != PMIX_SUCCESS) goto fini;
PMIX_INFO_FREE(info, 1);
addr_strs = malloc(nprocs * sizeof(*addr_strs));
......@@ -406,11 +461,12 @@ ssg_group_id_t ssg_group_create_pmix(
/* skip ourselves */
if(n == proc.rank)
{
addr_strs[n] = self_addr_str;
addr_strs[n] = ssg_inst->self_addr_str;
continue;
}
if (snprintf(key, 128, "%s-%d-hg-addr", proc.nspace, n) >= 128) goto fini;
if (snprintf(key, 128, "ssg-%s-%s-%d-hg-addr", group_name,
proc.nspace, n) >= 128) goto fini;
tmp_proc.rank = n;
val_p = &addr_vals[n];
......@@ -426,7 +482,6 @@ ssg_group_id_t ssg_group_create_pmix(
fini:
/* cleanup before returning */
free(self_addr_str);
free(addr_strs);
PMIX_VALUE_FREE(addr_vals, nprocs);
......@@ -472,7 +527,6 @@ ssg_group_id_t ssg_group_join(
void * update_cb_dat)
{
ssg_group_descriptor_t *in_group_descriptor = (ssg_group_descriptor_t *)in_group_id;
char *self_addr_str = NULL;
hg_addr_t group_target_addr = HG_ADDR_NULL;
char *group_name = NULL;
int group_size;
......@@ -505,10 +559,6 @@ ssg_group_id_t ssg_group_join(
&group_name, &group_size, &view_buf);
if (sret != SSG_SUCCESS || !group_name || !view_buf) goto fini;
/* get my address string */
SSG_GET_SELF_ADDR_STR(ssg_inst->mid, self_addr_str);
if (self_addr_str == NULL) goto fini;
/* set up address string array for all group members */
addr_strs = ssg_addr_str_buf_to_list(view_buf, group_size);
if (!addr_strs) goto fini;
......@@ -516,7 +566,7 @@ ssg_group_id_t ssg_group_join(
/* append self address string to list of group member address strings */
addr_strs = realloc(addr_strs, (group_size+1)*sizeof(char *));
if(!addr_strs) goto fini;
addr_strs[group_size++] = self_addr_str;
addr_strs[group_size++] = ssg_inst->self_addr_str;
g = ssg_group_create_internal(group_name, addr_strs, group_size,
update_cb, update_cb_dat);
......@@ -534,7 +584,6 @@ fini:
free(addr_strs);
free(view_buf);
free(group_name);
free(self_addr_str);
return g_id;
}
......@@ -574,7 +623,7 @@ int ssg_group_leave(
}
ABT_rwlock_unlock(ssg_inst->lock);
sret = ssg_group_leave_send(group_descriptor, g->self_id, group_target_addr);
sret = ssg_group_leave_send(group_descriptor, ssg_inst->self_id, group_target_addr);
if (sret != SSG_SUCCESS) goto fini;
/* at least one group member knows of the leave request -- safe to
......@@ -650,7 +699,7 @@ int ssg_group_attach(
ag->descriptor->owner_status = SSG_OWNER_IS_ATTACHER;
/* create the view for the group */
sret = ssg_group_view_create(addr_strs, group_size, NULL, ag->lock, &ag->view, NULL);
sret = ssg_group_view_create(addr_strs, group_size, NULL, ag->lock, &ag->view);
if (sret != SSG_SUCCESS) goto fini;
/* add this group reference to our group table */
......@@ -701,36 +750,18 @@ int ssg_group_detach(
}
#endif
/*********************************
*** SSG group access routines ***
*********************************/
/*********************************************************
*** SSG routines for obtaining self/group information ***
*********************************************************/
ssg_member_id_t ssg_get_group_self_id(
ssg_group_id_t group_id)
ssg_member_id_t ssg_get_self_id(
margo_instance_id mid)
{
ssg_group_descriptor_t *group_descriptor = (ssg_group_descriptor_t *)group_id;
ssg_group_t *g;
ssg_member_id_t self_id;
if (!ssg_inst || group_id == SSG_GROUP_ID_NULL) return SSG_MEMBER_ID_INVALID;
/* XXX eventually mid needed to distinguish multiple ssg contexts */
if (group_descriptor->owner_status != SSG_OWNER_IS_MEMBER)
{
fprintf(stderr, "Error: SSG can only obtain a self ID from a group the" \
" caller is a member of\n");
return SSG_MEMBER_ID_INVALID;
}
ABT_rwlock_rdlock(ssg_inst->lock);
HASH_FIND(hh, ssg_inst->group_table, &group_descriptor->name_hash,
sizeof(uint64_t), g);
if (g)
self_id = g->self_id;
else
self_id = SSG_MEMBER_ID_INVALID;
ABT_rwlock_unlock(ssg_inst->lock);
if (!ssg_inst) return SSG_MEMBER_ID_INVALID;
return self_id;
return ssg_inst->self_id;
}
int ssg_get_group_size(
......@@ -781,7 +812,7 @@ int ssg_get_group_size(
return group_size;
}
hg_addr_t ssg_get_addr(
hg_addr_t ssg_get_group_addr(
ssg_group_id_t group_id,
ssg_member_id_t member_id)
{
......@@ -1061,6 +1092,7 @@ void ssg_group_dump(
if (group_descriptor->owner_status == SSG_OWNER_IS_MEMBER)
{
fprintf(stderr, "MEMBER DUMP\n");
ssg_group_t *g;
ABT_rwlock_rdlock(ssg_inst->lock);
......@@ -1073,7 +1105,7 @@ void ssg_group_dump(
group_size = g->view.size + 1;
group_name = g->name;
strcpy(group_role, "member");
sprintf(group_self_id, "%lu", g->self_id);
sprintf(group_self_id, "%lu", ssg_inst->self_id);
}
ABT_rwlock_unlock(ssg_inst->lock);
}
......@@ -1134,7 +1166,6 @@ static ssg_group_t * ssg_group_create_internal(
int group_size, ssg_membership_update_cb update_cb, void *update_cb_dat)
{
uint64_t name_hash;
char *self_addr_str = NULL;
int sret;
int success = 0;
ssg_group_t *g = NULL, *check_g;
......@@ -1143,36 +1174,26 @@ static ssg_group_t * ssg_group_create_internal(
name_hash = ssg_hash64_str(group_name);
/* get my address string */
SSG_GET_SELF_ADDR_STR(ssg_inst->mid, self_addr_str);
if (self_addr_str == NULL) goto fini;
/* allocate an SSG group data structure and initialize some of it */
g = malloc(sizeof(*g));
if (!g) goto fini;
memset(g, 0, sizeof(*g));
g->name = strdup(group_name);
if (!g->name) goto fini;
g->ssg_inst = ssg_inst;
g->update_cb = update_cb;
g->update_cb_dat = update_cb_dat;
ABT_rwlock_create(&g->lock);
/* generate unique descriptor for this group */
g->descriptor = ssg_group_descriptor_create(name_hash, self_addr_str,
g->descriptor = ssg_group_descriptor_create(name_hash, ssg_inst->self_addr_str,
SSG_OWNER_IS_MEMBER);
if (g->descriptor == NULL) goto fini;
/* initialize the group view */
sret = ssg_group_view_create(group_addr_strs, group_size, self_addr_str,
g->lock, &g->view, &g->self_id);
sret = ssg_group_view_create(group_addr_strs, group_size, ssg_inst->self_addr_str,
g->lock, &g->view);
if (sret != SSG_SUCCESS) goto fini;
if (g->self_id == SSG_MEMBER_ID_INVALID)
{
/* if unable to resolve my rank within the group, error out */
fprintf(stderr, "Error: SSG unable to resolve rank in group %s\n",
group_name);
goto fini;
}
#ifdef DEBUG
/* set debug output pointer */
......@@ -1181,7 +1202,7 @@ static ssg_group_t * ssg_group_create_internal(
{
char dbg_log_path[PATH_MAX];
snprintf(dbg_log_path, PATH_MAX, "%s/ssg-%s-%lu.log",
dbg_log_dir, g->name, g->self_id);
dbg_log_dir, g->name, g->ssg_inst->self_id);
g->dbg_log = fopen(dbg_log_path, "a");
if (!g->dbg_log) goto fini;
}
......@@ -1211,7 +1232,8 @@ static ssg_group_t * ssg_group_create_internal(
goto fini;
}
SSG_DEBUG(g, "group create successful (size=%d, self=%s)\n", group_size, self_addr_str);
SSG_DEBUG(g, "group create successful (size=%d, self=%s)\n",
group_size, ssg_inst->self_addr_str);
success = 1;
fini:
......@@ -1229,7 +1251,6 @@ fini:
free(g);
g = NULL;
}
free(self_addr_str);
return g;
}
......@@ -1237,21 +1258,17 @@ fini:
static int ssg_group_view_create(
const char * const group_addr_strs[], int group_size,
const char * self_addr_str, ABT_rwlock view_lock,
ssg_group_view_t * view, ssg_member_id_t * self_id)
ssg_group_view_t * view)
{
int i, j, r;
ABT_thread *lookup_ults = NULL;
struct ssg_group_lookup_ult_args *lookup_ult_args = NULL;
const char *self_addr_substr = NULL;
const char *addr_substr = NULL;
int self_found = 0;
int aret;
int sret = SSG_FAILURE;
if (self_id)
*self_id = SSG_MEMBER_ID_INVALID;
if ((self_id != NULL && self_addr_str == NULL) || !view) goto fini;
/* allocate lookup ULTs */
lookup_ults = malloc(group_size * sizeof(*lookup_ults));
if (lookup_ults == NULL) goto fini;
......@@ -1283,7 +1300,6 @@ static int ssg_group_view_create(
if (group_addr_strs[j] == NULL || strlen(group_addr_strs[j]) == 0) continue;
/* resolve self id in group if caller asked for it */
if (self_addr_substr)
{
addr_substr = strstr(group_addr_strs[j], "://");
......@@ -1294,10 +1310,8 @@ static int ssg_group_view_create(
if (strcmp(self_addr_substr, addr_substr) == 0)
{
if (self_id)
*self_id = ssg_gen_member_id(group_addr_strs[j]);
/* don't look up our own address, we already know it */
self_found = 1;
continue;
}
}
......@@ -1331,6 +1345,15 @@ static int ssg_group_view_create(
}
}
/* if we provided a self address string and didn't find ourselves,
* then we return with an error
*/
if (self_addr_str && !self_found)
{
fprintf(stderr, "Error: SSG unable to resolve self ID in group\n");
goto fini;
}
/* clean exit */
sret = SSG_SUCCESS;
......@@ -1535,6 +1558,67 @@ static const char ** ssg_addr_str_buf_to_list(
return ret;
}
#ifdef SSG_HAVE_PMIX
void ssg_pmix_proc_failure_notify_fn(
size_t evhdlr_registration_id, pmix_status_t status, const pmix_proc_t *source,
pmix_info_t info[], size_t ninfo, pmix_info_t results[], size_t nresults,
pmix_event_notification_cbfunc_fn_t cbfunc, void *cbdata)
{
char key[512];
pmix_value_t ssg_id_val;
pmix_value_t *val_p;
pmix_status_t ret;
ssg_group_t *g, *g_tmp;
ssg_member_update_t fail_update;
assert(status == PMIX_PROC_TERMINATED || status == PMIX_ERR_PROC_ABORTED);