Commit c2f6be3c authored by Jason Cope's avatar Jason Cope
Browse files

Updated the BMI portals layer with a year's worth of improvements and bug fixes

- runs stable on XT4 and XT5
- tested up to 200,000 cores with IOFSL
parent b7a189c3
......@@ -13,7 +13,7 @@ ifneq (,$(BUILD_PORTALS))
# Local definitions.
#
DIR := src/io/bmi/bmi_portals
cfiles := portals.c dlmalloc.c portals_conn.c portals_helpers.c portals_wrappers.c
cfiles := portals.c dlmalloc.c portals_comm.c portals_conn.c portals_helpers.c portals_wrappers.c portals_trace.c
#
# Export these to the top Makefile to tell it what to build.
......
......@@ -4,13 +4,16 @@
#include <signal.h>
#include "portals_conn.h"
#include "portals_comm.h"
#include "portals_helpers.h"
#include "portals_trace.h"
#include "src/common/quicklist/quicklist.h"
#include "src/common/gen-locks/gen-locks.h"
#include "src/common/id-generator/id-generator.h"
#include "src/io/bmi/bmi.h"
#include "src/io/bmi/bmi-method-support.h"
#include "src/io/bmi/bmi-method-callback.h"
/* bmi mode */
#define CLIENT 0
......@@ -88,6 +91,30 @@ static int bmip_is_clone = 0;
pthread_barrier_t bmip_comm_bar;
pthread_barrierattr_t bmi_comm_bar_attr;
/*
 * Allocate `size` bytes, treating an allocation failure as a fatal error.
 *
 * size       number of bytes requested
 * fl, fn, l  file, function and line of the call site (callers pass
 *            __FILE__, __func__, __LINE__); only used in the diagnostic
 *
 * Returns the pointer produced by malloc(). A NULL result for a zero-byte
 * request is handed back as-is, because malloc(0) may return NULL without
 * having failed. For any other size a NULL result is reported on stderr
 * together with the caller's location and then trips the assert.
 */
static void * bmiptl_safe_malloc(size_t size, char * fl, char * fn, int l)
{
    void * m = malloc(size);

    if(m == NULL && size != 0)
    {
        /* name the call site, not just this wrapper, so the failure can be traced */
        fprintf(stderr, "%s:%i malloc failed. size = %zu. source info %s:%s:%i\n",
                __func__, __LINE__, size, fl, fn, l);
        assert(m != NULL);
    }

    return m;
}
/*
 * Release a heap buffer, flagging a NULL pointer as a caller error.
 *
 * m          buffer to release
 * fl, fn, l  file, function and line of the call site (callers pass
 *            __FILE__, __func__, __LINE__); only used in the diagnostic
 *
 * A NULL buffer is reported on stderr together with the caller's location
 * and then trips the assert. When asserts are compiled out the NULL simply
 * reaches free(), which ignores it.
 */
static void bmiptl_safe_free(void * m, char * fl, char * fn, int l)
{
    if(m == NULL)
    {
        /* name the call site, not just this wrapper, so the bad free can be traced */
        fprintf(stderr, "%s:%i free failed. NULL buffer detected. source info %s:%s:%i\n",
                __func__, __LINE__, fl, fn, l);
        assert(m != NULL);
    }

    free(m);
}
int bmip_comm_barrier_init(void)
{
int ret = 0;
......@@ -196,6 +223,16 @@ BMI_portals_initialize(bmi_method_addr_p listen_addr, int method_id,
portals_node_type = (init_flags & BMI_INIT_SERVER) ? SERVER : CLIENT;
portals_method_id = method_id;
/* dynamically allocate the correct mem */
if(portals_node_type == SERVER)
{
bmip_allocate_server_mem();
}
else if(portals_node_type == CLIENT)
{
bmip_allocate_client_mem();
}
if(portals_node_type == SERVER || (!client_clone_mode && portals_node_type == CLIENT) )
{
/* if we have the addr */
......@@ -227,17 +264,11 @@ BMI_portals_initialize(bmi_method_addr_p listen_addr, int method_id,
{
bmip_comm_barrier_init();
#ifdef BMIP_CLONE
//client_clone_pid = clone(bmip_client_clone_init, clone_stack_top, CLONE_THREAD|CLONE_FILES|CLONE_SIGHAND|CLONE_VM, listen_addr);
client_clone_pid = clone(bmip_client_clone_init, clone_stack_top, CLONE_THREAD|CLONE_SIGHAND|CLONE_VM, listen_addr);
bmip_comm_barrier();
#endif
}
if(portals_node_type == SERVER || (!client_clone_mode && portals_node_type == CLIENT))
{
fprintf(stderr, "%s:%i nid = %i pid = %i\n", __func__, __LINE__, bmip_get_ptl_nid(), bmip_get_ptl_pid());
}
return 0;
}
......@@ -245,6 +276,7 @@ BMI_portals_initialize(bmi_method_addr_p listen_addr, int method_id,
static int
BMI_portals_finalize(void)
{
bmip_trace_dump_list();
if(portals_node_type == SERVER)
{
bmip_dest_eqs();
......@@ -287,16 +319,10 @@ BMI_portals_set_info(int option, void* inout_parameter)
bmi_method_addr_p addr = (bmi_method_addr_p)inout_parameter;
portals_addr_t * a = (portals_addr_t *)addr->method_data;
fprintf(stderr, "%s:%i drop portals addr = %s\n", __func__, __LINE__, a->hostname);
/* remove from the list */
qlist_del(&a->list);
gen_mutex_unlock(&addr_lock);
/* cleanup */
//free(a->hostname);
//free(a);
break;
}
case BMI_OPTIMISTIC_BUFFER_REG:
......@@ -344,7 +370,7 @@ BMI_portals_memalloc(bmi_size_t size, enum bmi_op_type send_recv)
}
else
{
return malloc(size);
return bmiptl_safe_malloc(size, __FILE__, __func__, __LINE__);
}
return NULL;
}
......@@ -360,7 +386,7 @@ BMI_portals_memfree(void* buffer, bmi_size_t size, enum bmi_op_type send_recv)
}
else
{
free(buffer);
bmiptl_safe_free(buffer, __FILE__, __func__, __LINE__);
}
return ret;
}
......@@ -369,7 +395,7 @@ BMI_portals_memfree(void* buffer, bmi_size_t size, enum bmi_op_type send_recv)
/*
 * Release a buffer handed to the caller for an unexpected message.
 *
 * buffer  the buffer to release
 *
 * Always returns 0.
 */
static int
BMI_portals_unexpected_free(void* buffer)
{
    /* release exactly once, through the checked wrapper; a second free()
     * on the same pointer would be a double free */
    bmiptl_safe_free(buffer, __FILE__, __func__, __LINE__);
    return 0;
}
......@@ -390,7 +416,7 @@ BMI_portals_post_send(bmi_op_id_t* id, bmi_method_addr_p dest,
mop->context_id = context_id;
*id = mop->op_id;
bmip_server_post_send(((portals_addr_t *)dest->method_data)->pid, (int64_t)tag, 1, &buffer, (size_t *)&size, BMIP_USE_CVTEST, user_ptr, *id);
bmip_server_post_send(((portals_addr_t *)dest->method_data)->pid, (int64_t)tag, 1, (void **)&buffer, (size_t *)&size, BMIP_USE_CVTEST, user_ptr, *id);
return 0;
}
else
......@@ -441,6 +467,18 @@ BMI_portals_post_sendunexpected(bmi_op_id_t* id, bmi_method_addr_p dest,
return bmip_delegate_ret;
}
else
{
method_op_p mop = bmi_alloc_method_op(0);
mop->addr = dest;
mop->method_data = NULL;
mop->user_ptr = user_ptr;
mop->context_id = context_id;
*id = mop->op_id;
return bmip_server_post_unex_send(((portals_addr_t *)dest->method_data)->pid,
1, buffer, size, tag, user_ptr, *id);
}
return -1;
}
......@@ -459,6 +497,7 @@ BMI_portals_post_recv(bmi_op_id_t* id, bmi_method_addr_p src, void* buffer,
void* user_ptr, bmi_context_id context_id, PVFS_hint hints)
{
int ret = 0;
if(portals_node_type == SERVER)
{
method_op_p mop = bmi_alloc_method_op(0);
......@@ -508,7 +547,7 @@ BMI_portals_test(bmi_op_id_t id, int* outcount, bmi_error_code_t* error_code,
if(portals_node_type == SERVER)
{
method_op_p op = (method_op_p)id_gen_fast_lookup(id);
*outcount = bmip_server_test_event_id(max_idle_time_ms, 1, user_ptr, actual_size, id);
*outcount = bmip_server_test_event_id(max_idle_time_ms, 1, user_ptr, (size_t *)actual_size, id);
*error_code = 0;
if(*outcount > 0)
......@@ -549,8 +588,8 @@ BMI_portals_testcontext(int incount, bmi_op_id_t* out_id_array, int* outcount,
{
if(portals_node_type == SERVER)
{
int64_t * opids = (int64_t *)malloc(sizeof(int64_t) * incount);
size_t * sizes = (size_t *)malloc(sizeof(size_t) * incount);
int64_t * opids = (int64_t *)bmiptl_safe_malloc(sizeof(int64_t) * incount, __FILE__, __func__, __LINE__);
size_t * sizes = (size_t *)bmiptl_safe_malloc(sizeof(size_t) * incount, __FILE__, __func__, __LINE__);
*outcount = bmip_server_test_events(max_idle_time_ms, incount, user_ptr_array, sizes, opids);
......@@ -571,8 +610,8 @@ BMI_portals_testcontext(int incount, bmi_op_id_t* out_id_array, int* outcount,
}
}
free(opids);
free(sizes);
bmiptl_safe_free(opids, __FILE__, __func__, __LINE__);
bmiptl_safe_free(sizes, __FILE__, __func__, __LINE__);
return 0;
}
......@@ -652,10 +691,10 @@ BMI_portals_testunexpected(int incount, int* outcount,
{
int i = 0;
/* TODO make these global per server... prevent alloc / dealloc */
void ** buffers = (void **)malloc(sizeof(void *) * incount);
size_t * sizes = (size_t *)malloc(sizeof(size_t) * incount);
int64_t * tags = (int64_t *)malloc(sizeof(int64_t) * incount);
ptl_process_id_t * addrs = (ptl_process_id_t *)malloc(sizeof(ptl_process_id_t) * incount);
void ** buffers = (void **)bmiptl_safe_malloc(sizeof(void *) * incount, __FILE__, __func__, __LINE__);
size_t * sizes = (size_t *)bmiptl_safe_malloc(sizeof(size_t) * incount, __FILE__, __func__, __LINE__);
int64_t * tags = (int64_t *)bmiptl_safe_malloc(sizeof(int64_t) * incount, __FILE__, __func__, __LINE__);
ptl_process_id_t * addrs = (ptl_process_id_t *)bmiptl_safe_malloc(sizeof(ptl_process_id_t) * incount, __FILE__, __func__, __LINE__);
*outcount = bmip_server_test_unex_events(max_idle_time_ms, incount, buffers, sizes, tags, addrs);
......@@ -671,10 +710,10 @@ BMI_portals_testunexpected(int incount, int* outcount,
/* TODO make these global per server... prevent alloc / dealloc */
/* cleanup */
free(buffers);
free(sizes);
free(tags);
free(addrs);
bmiptl_safe_free(buffers, __FILE__, __func__, __LINE__);
bmiptl_safe_free(sizes, __FILE__, __func__, __LINE__);
bmiptl_safe_free(tags, __FILE__, __func__, __LINE__);
bmiptl_safe_free(addrs, __FILE__, __func__, __LINE__);
return 0;
}
......@@ -710,6 +749,7 @@ BMI_portals_method_addr_lookup(const char * id)
if(strcmp(id, a->hostname) == 0)
{
found = 1;
addr = a->p_addr; /* store the address */
break;
}
}
......@@ -746,10 +786,6 @@ BMI_portals_method_addr_lookup(const char * id)
/* add it to the list */
qlist_add_tail(&((portals_addr_t *)addr->method_data)->list, &bmip_addr_list);
}
else
{
fprintf(stderr, "%s:%i not found\n", __func__, __LINE__);
}
gen_mutex_unlock(&addr_lock);
return addr;
......@@ -769,7 +805,7 @@ BMI_portals_post_send_list(bmi_op_id_t* id, bmi_method_addr_p dest,
{
method_op_p mop = bmi_alloc_method_op(0);
*id = mop->op_id;
bmip_server_post_send(((portals_addr_t *)dest->method_data)->pid, (int64_t)tag, list_count, (const void **)buffer_list, (size_t *)size_list, BMIP_USE_CVTEST, user_ptr, *id);
bmip_server_post_send(((portals_addr_t *)dest->method_data)->pid, (int64_t)tag, list_count, (void **)buffer_list, (size_t *)size_list, BMIP_USE_CVTEST, user_ptr, *id);
return 0;
}
else
......
#include "portals_comm.h"
#include "portals_wrappers.h"
#include <stdio.h>
/*
 * Map the type of a Portals event to its symbolic name.
 *
 * ev  event whose `type` field is inspected; must not be NULL
 *
 * Returns a string literal naming the event type, or "UNKNOWN" for any
 * type that is not listed in the switch below.
 */
const char * bmip_ptl_ev_type(ptl_event_t * ev)
{
    const char * name = "UNKNOWN";

    switch(ev->type)
    {
        case PTL_EVENT_SEND_START:
            name = "PTL_EVENT_SEND_START";
            break;
        case PTL_EVENT_SEND_END:
            name = "PTL_EVENT_SEND_END";
            break;
        case PTL_EVENT_PUT_START:
            name = "PTL_EVENT_PUT_START";
            break;
        case PTL_EVENT_PUT_END:
            name = "PTL_EVENT_PUT_END";
            break;
        case PTL_EVENT_ACK:
            name = "PTL_EVENT_ACK";
            break;
        case PTL_EVENT_GET_START:
            name = "PTL_EVENT_GET_START";
            break;
        case PTL_EVENT_GET_END:
            name = "PTL_EVENT_GET_END";
            break;
        case PTL_EVENT_REPLY_START:
            name = "PTL_EVENT_REPLY_START";
            break;
        case PTL_EVENT_REPLY_END:
            name = "PTL_EVENT_REPLY_END";
            break;
        case PTL_EVENT_UNLINK:
            name = "PTL_EVENT_UNLINK";
            break;
        default:
            /* keep the "UNKNOWN" default */
            break;
    }

    return name;
}
/*
 * Classify an event by its type.
 *
 * ev  event whose `type` field is inspected; must not be NULL
 *
 * Returns the event type (as an int) when it is one of the types listed
 * below, and -1 for any other type. No per-type work is done here.
 */
int bmip_unex_handler(ptl_event_t * ev)
{
    int result = ev->type;

    switch(ev->type)
    {
        /* every recognised type is handled the same way: report it back */
        case PTL_EVENT_SEND_START:
        case PTL_EVENT_SEND_END:
        case PTL_EVENT_PUT_START:
        case PTL_EVENT_PUT_END:
        case PTL_EVENT_ACK:
        case PTL_EVENT_GET_START:
        case PTL_EVENT_GET_END:
        case PTL_EVENT_REPLY_START:
        case PTL_EVENT_REPLY_END:
        case PTL_EVENT_UNLINK:
            break;
        default:
            result = -1;
            break;
    }

    return result;
}
/*
 * Fetch one event from an event queue and classify it.
 *
 * timeout  passed to bmip_ptl_eq_poll(); only used when BMIP_USE_TIMEOUT
 *          is defined
 * eq       event queue handle to read from
 * ev       where to store the event; may be NULL, in which case a local
 *          scratch event is used and the event data is discarded
 *
 * Returns -2 when the queue call reports PTL_EQ_EMPTY, -1 when the queue
 * call returns anything other than PTL_OK or when bmip_unex_handler()
 * rejects the event, and otherwise the value from bmip_unex_handler().
 */
int bmip_wait_event(int timeout, ptl_handle_eq_t * eq, ptl_event_t * ev)
{
    int rc = -1;
    int which = 0;
    const int numhandles = 1;
    ptl_event_t scratch;
    /* the caller decides whether it wants a copy of the event data */
    ptl_event_t * target = (ev != NULL) ? ev : &scratch;

    /* wait for an unexpected message */
#ifndef BMIP_USE_TIMEOUT
    rc = bmip_ptl_eq_wait(*eq, target);
#else
    rc = bmip_ptl_eq_poll(eq, numhandles, timeout, target, &which);
#endif

    /* nothing was available on the queue */
    if(rc == PTL_EQ_EMPTY)
    {
        return -2;
    }

    /* any other non-OK status is a failure */
    if(rc != PTL_OK)
    {
        fprintf(stderr, "eq wait failure\n");
        return -1;
    }

    rc = bmip_unex_handler(target);
    if(rc == -1)
    {
        fprintf(stderr, "ev handler failure\n");
    }

    return rc;
}
/* Event-wait interface of the BMI portals layer. */
#ifndef PORTALS_COMM_H
#define PORTALS_COMM_H
#include <portals/portals3.h>
#include <sys/utsname.h>
/*
 * Fetch one event from the event queue `eq` and classify it.
 *
 * timeout  only used when the implementation is built with BMIP_USE_TIMEOUT
 * eq       event queue handle to read from
 * ev       where to store the event; may be NULL if the event data is not
 *          wanted
 *
 * Returns -2 when the queue is reported empty, -1 on a queue or handler
 * failure, and otherwise the type of the event that was received.
 */
int bmip_wait_event(int timeout, ptl_handle_eq_t * eq, ptl_event_t * ev);
#endif /* PORTALS_COMM_H */
#include "portals_conn.h"
#include "portals_comm.h"
#include "portals_wrappers.h"
#include "portals_helpers.h"
#include "portals_trace.h"
#include <portals/portals3.h>
#include <sys/utsname.h>
......@@ -68,46 +70,20 @@ static uint64_t unex_send_size = 0;
static double unex_send_time = 0;
/* local unex msg space... 8KB */
#define BMIP_UNEX_SPACE (8 * (1<<10))
#define BMIP_CLIENT_UNEX_SPACE (8 * (1<<10))
#define BMIP_SERVER_UNEX_SPACE (8 * (1<<10))
#define BMIP_UNEX_MSG_SIZE (8 * (1<<10) + 1)
static char bmip_unex_space[BMIP_UNEX_SPACE];
static const size_t bmip_unex_space_length = BMIP_UNEX_SPACE;
static void * bmip_unex_space = NULL;
static void * bmip_unex_space_end = NULL;
static gen_mutex_t bmip_server_unex_space_mutex = GEN_MUTEX_INITIALIZER;
static size_t bmip_unex_space_length = 0;
/* ex msg space */
#if 0
#ifdef BMIPSERVERMEM
#warning server mem
//static const size_t BMIP_SERVER_EX_SPACE = (1ul * 1024ul * 1024ul * 1024ul);
static const size_t BMIP_SERVER_EX_SPACE = (2ul * 1024ul * 1024ul * 1024ul);
//static const size_t BMIP_SERVER_EX_SPACE = (4ul * 1024ul * 1024ul * 1024ul);
//static const size_t BMIP_SERVER_EX_SPACE = (8ul * 1024ul * 1024ul * 1024ul);
#else
#warning client mem
static const size_t BMIP_SERVER_EX_SPACE = (16 * (1<<20));
#endif
#else
#ifdef BMIPSERVERMEM
#warning server mem
#define BMIP_SERVER_EX_SPACE (512ul * 1024ul * 1024ul)
//#define BMIP_SERVER_EX_SPACE (1ul * 1024ul * 1024ul * 1024ul)
//#define BMIP_SERVER_EX_SPACE (2ul * 1024ul * 1024ul * 1024ul)
//#define BMIP_SERVER_EX_SPACE (4ul * 1024ul * 1024ul * 1024ul)
//#define BMIP_SERVER_EX_SPACE (8ul * 1024ul * 1024ul * 1024ul)
#else
#warning client mem
#define BMIP_SERVER_EX_SPACE (16 * (1<<20))
#endif
#endif
#define BMIP_SERVER_EX_SPACE (2ul * 1024ul * 1024ul * 1024ul)
#define BMIP_CLIENT_EX_SPACE (16 * (1<<20))
#define BMIP_EX_MSG_SIZE (8 * (1<<20) + 1)
#if 1
static char bmip_ex_space_buffer[BMIP_SERVER_EX_SPACE];
static void * bmip_ex_space = &bmip_ex_space_buffer[0];
static void * bmip_ex_space_end = &bmip_ex_space_buffer[BMIP_SERVER_EX_SPACE - 1];
#else
static void * bmip_ex_space = NULL;
static void * bmip_ex_space_end = NULL;
#endif
static size_t bmip_ex_space_length = 0;
......@@ -126,6 +102,7 @@ static const uint64_t ex_req_mb = (1ULL << 61);
static const uint64_t ex_rsp_mb = (1ULL << 60);
static const uint64_t ex_req_put_mb = (1ULL << 59);
static const uint64_t ex_req_get_mb = (1ULL << 58);
static const uint64_t unex_server_traffic_mb = (1ULL << 57);
/* counters for the done ops */
static int server_num_done = 0;
......@@ -133,13 +110,22 @@ static int server_num_unex_done = 0;
/* pending and done operation lists */
static QLIST_HEAD(bmip_unex_pending);
static QLIST_HEAD(bmip_unex_ss_pending);
static QLIST_HEAD(bmip_unex_done);
static QLIST_HEAD(bmip_cur_ops);
static QLIST_HEAD(bmip_cur_ss_ops);
static QLIST_HEAD(bmip_done_ops);
static QLIST_HEAD(bmip_server_pending_send_ops);
static QLIST_HEAD(bmip_server_pending_recv_ops);
static QLIST_HEAD(bmip_addr_seq_list);
static int bmip_s2s_exsend_active = 0;
static QLIST_HEAD(bmip_s2s_exsend);
static int bmip_s2s_unexsend_active = 0;
static QLIST_HEAD(bmip_s2s_unexsend);
static QLIST_HEAD(bmip_precvlocalop);
static QLIST_HEAD(bmip_psendlocalop);
/* lock for list accesses */
static gen_mutex_t list_mutex = GEN_MUTEX_INITIALIZER;
static gen_mutex_t sig_mutex = GEN_MUTEX_INITIALIZER;
......@@ -179,124 +165,105 @@ static unsigned int bmip_client_seq = 0;
/* server data allocation space */
static mspace portals_data_space = NULL;
const char * bmip_ptl_ev_type(ptl_event_t * ev)
{
switch(ev->type)
{
case PTL_EVENT_SEND_START:
return "PTL_EVENT_SEND_START";
case PTL_EVENT_SEND_END:
return "PTL_EVENT_SEND_END";
case PTL_EVENT_PUT_START:
return "PTL_EVENT_PUT_START";
case PTL_EVENT_PUT_END:
return "PTL_EVENT_PUT_END";
case PTL_EVENT_ACK:
return "PTL_EVENT_ACK";
case PTL_EVENT_GET_START:
return "PTL_EVENT_GET_START";
case PTL_EVENT_GET_END:
return "PTL_EVENT_GET_END";
case PTL_EVENT_REPLY_START:
return "PTL_EVENT_REPLY_START";
case PTL_EVENT_REPLY_END:
return "PTL_EVENT_REPLY_END";
case PTL_EVENT_UNLINK:
return "PTL_EVENT_UNLINK";
default:
return "UNKNOWN";
};
out:
return NULL;
/* bytes handed out so far by successful bmip_safe_malloc() calls */
static size_t bmip_total_alloc_space = 0;
/* allocation counter; incremented on every successful bmip_safe_malloc() */
static size_t bmip_total_alloc_count = 0;
/*
 * Allocate `size` bytes and account for the allocation.
 *
 * size       number of bytes requested
 * fl, fn, l  file, function and line of the call site (callers pass
 *            __FILE__, __func__, __LINE__); only used in diagnostics
 *
 * A zero-byte request is not forwarded to malloc(): a warning naming the
 * call site is printed and NULL is returned. An allocation failure is
 * reported on stderr with the call site and then trips the assert; when
 * asserts are compiled out, NULL is returned and the counters are left
 * untouched. On success the counters are updated and the buffer returned.
 */
static void * bmip_safe_malloc(size_t size, char * fl, char * fn, int l)
{
    void * m = NULL;

    if(size == 0)
    {
        fprintf(stderr, "%s:%i WARNING: 0 size detected. source info %s:%s:%i\n", __func__, __LINE__, fl, fn, l);
        fflush(stderr);
        return NULL;
    }

    m = malloc(size);
    if(m == NULL)
    {
        fprintf(stderr, "%s:%i malloc failed. size = %zu. source info %s:%s:%i\n",
                __func__, __LINE__, size, fl, fn, l);
        fflush(stderr);
        assert(m != NULL);
        /* only reached with NDEBUG: do not count memory that was never obtained */
        return NULL;
    }

    bmip_total_alloc_space += size;
    bmip_total_alloc_count++;
    return m;
}
int bmip_unex_handler(ptl_event_t * ev)
static void bmip_safe_free(void * m, char * fl, char * fn, int l)
{
int ret = ev->type;
if(!m)
{
fprintf(stderr, "%s:%i WARNING: NULL buffer detected. source info %s:%s:%i\n", __func__, __LINE__, fl, fn, l);
fflush(stderr);
return;
}
free(m);
//fprintf(stderr, "%s: event type %s, nid %i, pid %i, match_bits %llx, rlength %i, mlength %i, offset %i hdr data = %llu\n", __func__, bmip_ptl_ev_type(ev), ev->initiator.nid, ev->initiator.pid, ev->match_bits, ev->rlength, ev->mlength, ev->offset, ev->hdr_data);
switch(ev->type)
{
case PTL_EVENT_SEND_START:
break;
case PTL_EVENT_SEND_END:
break;
case PTL_EVENT_PUT_START:
break;
case PTL_EVENT_PUT_END:
break;
case PTL_EVENT_ACK:
break;
case PTL_EVENT_GET_START:
break;
case PTL_EVENT_GET_END:
break;
case PTL_EVENT_REPLY_START:
break;
case PTL_EVENT_REPLY_END:
break;
case PTL_EVENT_UNLINK:
break;
default:
ret = -1;
break;
};
out:
return ret;
bmip_total_alloc_count--;
}
int bmip_wait_event(int timeout, ptl_handle_eq_t * eq, ptl_event_t * ev)
void bmip_allocate_client_mem(void)
{
int ret = -1;
int i = 0;
const int numhandles = 1;
ptl_event_t sev;
ptl_event_t * lev;
bmip_ex_space_length = BMIP_CLIENT_EX_SPACE;
bmip_ex_space = (void *)bmip_safe_malloc(sizeof(char) * BMIP_CLIENT_EX_SPACE, __FILE__, __func__, __LINE__);
bmip_unex_space_length = BMIP_CLIENT_UNEX_SPACE;
bmip_unex_space = (void *)bmip_safe_malloc(sizeof(char) * BMIP_CLIENT_UNEX_SPACE, __FILE__, __func__, __LINE__);
}
/* detect if we want a copy of the event data or not */
if(ev == NULL)
{
lev = &sev;
}
else
{
lev = ev;
}
void bmip_allocate_server_mem(void)