Commit b3d39d59 authored by Pavan Balaji's avatar Pavan Balaji
Browse files

[svn-r6989] Initial draft of the load-leveler bootstrap server. This is a

restrictive model that functions under the following constraints
(mainly because of the inflexibility of POE):

1. The load-leveler scripts cannot specify more than one task per
node. The number of tasks per node needs to be specified to mpiexec
using the -ppn option.

2. The number of processes being launched needs to cover all of the
allocated nodes.
parent 3c89dfe2
......@@ -139,9 +139,9 @@ fi
#########################################################################
# Check what bootstrap server we should use
#########################################################################
AC_ARG_WITH(hydra-bss, [ --with-hydra-bss=name - Boot-strap Server (ssh,rsh,fork,slurm,lsf,persist)],
AC_ARG_WITH(hydra-bss, [ --with-hydra-bss=name - Boot-strap Server (ssh,rsh,fork,slurm,ll,lsf,persist)],
[ hydra_bss=$withval ],
[ hydra_bss="ssh,rsh,fork,slurm,lsf,persist" ])
[ hydra_bss="ssh,rsh,fork,slurm,ll,lsf,persist" ])
AC_MSG_CHECKING(boot-strap server)
AC_MSG_RESULT($hydra_bss)
hydra_bss_names="`echo $hydra_bss | sed -e 's/:/ /g' -e 's/,/ /g'`"
......@@ -166,6 +166,10 @@ for hydra_bss_name in ${hydra_bss_names}; do
hydra_bss_external=true
available_bss=`echo $available_bss slurm`
;;
ll)
hydra_bss_external=true
available_bss=`echo $available_bss ll`
;;
lsf)
hydra_bss_external=true
available_bss=`echo $available_bss lsf`
......
......@@ -12,4 +12,7 @@ libhydra_la_SOURCES += $(top_srcdir)/tools/bootstrap/external/external_init.c \
$(top_srcdir)/tools/bootstrap/external/slurm_launch.c \
$(top_srcdir)/tools/bootstrap/external/slurm_query_node_list.c \
$(top_srcdir)/tools/bootstrap/external/slurm_query_proxy_id.c \
$(top_srcdir)/tools/bootstrap/external/ll_launch.c \
$(top_srcdir)/tools/bootstrap/external/ll_query_node_list.c \
$(top_srcdir)/tools/bootstrap/external/ll_query_proxy_id.c \
$(top_srcdir)/tools/bootstrap/external/lsf_query_node_list.c
......@@ -10,6 +10,7 @@
#include "ssh.h"
#include "slurm.h"
#include "lsf.h"
#include "ll.h"
HYD_status HYDT_bscd_external_launch_procs(char **args, struct HYD_node *node_list,
int *control_fd, int enable_stdin,
......
......@@ -16,6 +16,8 @@ static HYD_status external_init(void)
if (!strcmp(HYDT_bsci_info.bootstrap, "slurm"))
HYDT_bsci_fns.launch_procs = HYDT_bscd_slurm_launch_procs;
else if (!strcmp(HYDT_bsci_info.bootstrap, "ll"))
HYDT_bsci_fns.launch_procs = HYDT_bscd_ll_launch_procs;
else
HYDT_bsci_fns.launch_procs = HYDT_bscd_external_launch_procs;
......@@ -27,6 +29,11 @@ static HYD_status external_init(void)
HYDT_bsci_fns.query_node_list = HYDT_bscd_slurm_query_node_list;
}
if (!strcmp(HYDT_bsci_info.bootstrap, "ll")) {
HYDT_bsci_fns.query_proxy_id = HYDT_bscd_ll_query_proxy_id;
HYDT_bsci_fns.query_node_list = HYDT_bscd_ll_query_node_list;
}
if (!strcmp(HYDT_bsci_info.bootstrap, "lsf"))
HYDT_bsci_fns.query_node_list = HYDT_bscd_lsf_query_node_list;
......@@ -57,6 +64,11 @@ HYD_status HYDT_bsci_slurm_init(void)
return external_init();
}
/* Initialization entry point for the "ll" bootstrap server.  All of the
 * work is delegated to external_init(); its status is returned as-is. */
HYD_status HYDT_bsci_ll_init(void)
{
    HYD_status status;

    status = external_init();

    return status;
}
HYD_status HYDT_bsci_lsf_init(void)
{
return external_init();
......
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* (C) 2008 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#ifndef LL_H_INCLUDED
#define LL_H_INCLUDED
#include "hydra_base.h"
/* Launch the processes described by 'args' through the load-leveler
 * launcher.  'stdout_cb' and 'stderr_cb' are registered as the handlers
 * for the launched process' stdout and stderr; stdin is forwarded only
 * when 'enable_stdin' is non-zero. */
HYD_status HYDT_bscd_ll_launch_procs(char **args, struct HYD_node *node_list,
int *control_fd, int enable_stdin,
HYD_status(*stdout_cb) (void *buf, int buflen),
HYD_status(*stderr_cb) (void *buf, int buflen));
/* Read this process' proxy ID from the MP_CHILD environment variable;
 * '*proxy_id' is set to -1 and an error is returned if it is not found. */
HYD_status HYDT_bscd_ll_query_proxy_id(int *proxy_id);
/* Build the node list from the file named by the LOADL_HOSTFILE
 * environment variable and return it in '*node_list'. */
HYD_status HYDT_bscd_ll_query_node_list(struct HYD_node **node_list);
/* Count the entries (lines) in the file named by LOADL_HOSTFILE and
 * return the count in '*count'. */
HYD_status HYDTI_bscd_ll_query_node_count(int *count);
/* Non-zero until HYDT_bscd_ll_query_node_list() has successfully supplied
 * the node list; the launcher refuses to run while this is non-zero. */
extern int HYDT_bscd_ll_user_node_list;
#endif /* LL_H_INCLUDED */
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* (C) 2008 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#include "hydra_utils.h"
#include "bsci.h"
#include "bscu.h"
#include "ll.h"
static int fd_stdin, fd_stdout, fd_stderr;
/*
 * Launch the proxies through the load-leveler launcher ("poe").
 *
 * args         - NULL-terminated argument list appended after the launcher
 *                path and any HYDRA_LAUNCH_EXTRA_ARGS tokens.
 * node_list    - nodes the launch must cover; its length has to match the
 *                number of entries reported by
 *                HYDTI_bscd_ll_query_node_count().
 * control_fd   - not used by this launcher.
 * enable_stdin - when non-zero, our stdin is forwarded to the launched
 *                process.
 * stdout_cb / stderr_cb - handlers registered for the launched process'
 *                stdout and stderr.
 *
 * Returns HYD_SUCCESS, or an error status if a user-defined host list is in
 * effect, the node counts do not match, too many arguments are supplied, or
 * any of the helper calls fail.
 */
HYD_status HYDT_bscd_ll_launch_procs(char **args, struct HYD_node *node_list,
                                     int *control_fd, int enable_stdin,
                                     HYD_status(*stdout_cb) (void *buf, int buflen),
                                     HYD_status(*stderr_cb) (void *buf, int buflen))
{
    int idx, i, fd, total_procs, node_count;
    int *pid, *fd_list;
    /* Zero-filled so that the list is NULL-terminated at every point: the
     * cleanup code at fn_exit walks it even when we bail out early. */
    char *targs[HYD_NUM_TMP_STRINGS] = { NULL };
    char *path = NULL, *extra_arg_list = NULL, *extra_args_copy = NULL, *extra_arg;
    struct HYD_node *node;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    /* We use the following priority order for the executable path:
     * (1) user-specified; (2) search in path; (3) Hard-coded
     * location */
    if (HYDT_bsci_info.bootstrap_exec)
        path = HYDU_strdup(HYDT_bsci_info.bootstrap_exec);
    if (!path)
        path = HYDU_find_full_path("poe");
    if (!path)
        path = HYDU_strdup("/usr/bin/poe");

    idx = 0;
    targs[idx++] = HYDU_strdup(path);

    if (HYDT_bscd_ll_user_node_list) {
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                            "ll does not support user-defined host lists\n");
    }

    /* Check how many nodes are being passed for the launch */
    status = HYDTI_bscd_ll_query_node_count(&total_procs);
    HYDU_ERR_POP(status, "unable to query for the node count\n");

    node_count = 0;
    for (node = node_list; node; node = node->next)
        node_count++;

    if (total_procs != node_count) {
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                            "processes to be launched have to cover all nodes\n");
    }

    MPL_env2str("HYDRA_LAUNCH_EXTRA_ARGS", (const char **) &extra_arg_list);
    if (extra_arg_list) {
        /* The value is handed back through a const pointer, so tokenize a
         * private copy instead of letting strtok() write into it. */
        extra_args_copy = HYDU_strdup(extra_arg_list);
        if (!extra_args_copy) {
            HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                                "unable to copy extra launcher arguments\n");
        }

        extra_arg = strtok(extra_args_copy, " ");
        while (extra_arg) {
            /* Keep two slots free: the proxy ID and the terminating NULL */
            if (idx >= HYD_NUM_TMP_STRINGS - 2) {
                HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                                    "too many launcher arguments\n");
            }
            targs[idx++] = HYDU_strdup(extra_arg);
            extra_arg = strtok(NULL, " ");
        }
    }

    /* Fill in the remaining arguments */
    for (i = 0; args[i]; i++) {
        if (idx >= HYD_NUM_TMP_STRINGS - 2) {
            HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                                "too many launcher arguments\n");
        }
        targs[idx++] = HYDU_strdup(args[i]);
    }

    /* Increase pid list to accommodate the new pid */
    HYDU_MALLOC(pid, int *, (HYD_bscu_pid_count + 1) * sizeof(int), status);
    for (i = 0; i < HYD_bscu_pid_count; i++)
        pid[i] = HYD_bscu_pid_list[i];
    HYDU_FREE(HYD_bscu_pid_list);
    HYD_bscu_pid_list = pid;

    /* Increase fd list to accommodate these new fds */
    HYDU_MALLOC(fd_list, int *, (HYD_bscu_fd_count + 3) * sizeof(int), status);
    for (i = 0; i < HYD_bscu_fd_count; i++)
        fd_list[i] = HYD_bscu_fd_list[i];
    HYDU_FREE(HYD_bscu_fd_list);
    HYD_bscu_fd_list = fd_list;

    /* append proxy ID as -1 */
    targs[idx++] = HYDU_int_to_str(-1);
    targs[idx++] = NULL;

    status = HYDU_create_process(targs, NULL,
                                 enable_stdin ? &fd_stdin : NULL, &fd_stdout,
                                 &fd_stderr, &HYD_bscu_pid_list[HYD_bscu_pid_count++], -1);
    HYDU_ERR_POP(status, "create process returned error\n");

    /* We don't wait for stdin to close */
    HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stdout;
    HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stderr;

    /* Register stdio callbacks for the spawned process */
    if (enable_stdin) {
        fd = STDIN_FILENO;
        status = HYDT_dmx_register_fd(1, &fd, HYD_POLLIN, &fd_stdin, HYDT_bscu_stdin_cb);
        HYDU_ERR_POP(status, "demux returned error registering fd\n");
    }

    status = HYDT_dmx_register_fd(1, &fd_stdout, HYD_POLLIN, stdout_cb, HYDT_bscu_inter_cb);
    HYDU_ERR_POP(status, "demux returned error registering fd\n");

    status = HYDT_dmx_register_fd(1, &fd_stderr, HYD_POLLIN, stderr_cb, HYDT_bscu_inter_cb);
    HYDU_ERR_POP(status, "demux returned error registering fd\n");

  fn_exit:
    if (extra_args_copy)
        HYDU_FREE(extra_args_copy);
    HYDU_free_strlist(targs);
    if (path)
        HYDU_FREE(path);
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* (C) 2008 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#include "hydra_utils.h"
#include "bsci.h"
#include "bscu.h"
#include "ll.h"
/* Starts out as 1 and is cleared by HYDT_bscd_ll_query_node_list() once the
 * node list has been obtained from the LOADL_HOSTFILE hostfile. */
int HYDT_bscd_ll_user_node_list = 1;
/* Node list accumulated by process_mfile_token() while parsing the hostfile */
static struct HYD_node *global_node_list = NULL;
/* Running line count maintained by process_mfile_count() */
static int total_node_count = 0;
/*
 * Hostfile-parser callback: add one hostfile entry to global_node_list.
 *
 * token   - the token to process; for the first token on a line it has the
 *           form "hostname" or "hostname:num_procs" (modified in place by
 *           strtok()).
 * newline - non-zero when 'token' is the first token on its line.
 *
 * Returns HYD_SUCCESS, or an error status if the token is not the first on
 * its line, carries no hostname, or cannot be added to the node list.
 */
static HYD_status process_mfile_token(char *token, int newline)
{
    int num_procs;
    char *hostname, *procs;
    HYD_status status = HYD_SUCCESS;

    if (newline) {      /* The first entry gives the hostname and processes */
        hostname = strtok(token, ":");
        /* strtok() yields NULL when the token holds nothing but ':'
         * characters; don't hand a NULL hostname to the node list. */
        if (hostname == NULL) {
            HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                                "missing hostname in hostfile entry\n");
        }

        procs = strtok(NULL, ":");
        /* A missing process count defaults to one process on the node */
        num_procs = procs ? atoi(procs) : 1;

        status = HYDU_add_to_node_list(hostname, num_procs, &global_node_list);
        HYDU_ERR_POP(status, "unable to initialize proxy\n");
    }
    else {      /* Not a new line */
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                            "token %s not supported at this time\n", token);
    }

  fn_exit:
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * Build the node list from the hostfile named by the LOADL_HOSTFILE
 * environment variable.
 *
 * On success '*node_list' points at the accumulated global node list and
 * HYDT_bscd_ll_user_node_list is cleared.  If LOADL_HOSTFILE is not
 * available, '*node_list' is set to NULL and an error is returned.
 */
HYD_status HYDT_bscd_ll_query_node_list(struct HYD_node **node_list)
{
    char *nodefile;
    int found;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    found = MPL_env2str("LOADL_HOSTFILE", (const char **) &nodefile);
    if (found == 0 || nodefile == NULL) {
        /* No hostfile: report an empty list along with the error */
        *node_list = NULL;
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "No LL nodefile found\n");
    }

    /* Each first-on-line token is appended to global_node_list */
    status = HYDU_parse_hostfile(nodefile, process_mfile_token);
    HYDU_ERR_POP(status, "error parsing hostfile\n");

    *node_list = global_node_list;

    /* node list is provided by the bootstrap server */
    HYDT_bscd_ll_user_node_list = 0;

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * Hostfile-parser callback that counts lines: total_node_count is bumped
 * once for every token that starts a new line.  'token' itself is not
 * inspected.  Always returns HYD_SUCCESS.
 */
static HYD_status process_mfile_count(char *token, int newline)
{
    if (newline != 0)
        total_node_count += 1;

    return HYD_SUCCESS;
}
/*
 * Count the entries in the hostfile named by the LOADL_HOSTFILE
 * environment variable.
 *
 * On success '*count' receives the number of lines seen by the parser.
 * '*count' is left untouched if LOADL_HOSTFILE is not available or the
 * file cannot be parsed, and an error status is returned.
 */
HYD_status HYDTI_bscd_ll_query_node_count(int *count)
{
    char *nodefile;
    int found;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    found = MPL_env2str("LOADL_HOSTFILE", (const char **) &nodefile);
    if (found == 0 || nodefile == NULL) {
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "No LL nodefile found\n");
    }

    /* Restart the tally; process_mfile_count() increments it per line */
    total_node_count = 0;
    status = HYDU_parse_hostfile(nodefile, process_mfile_count);
    HYDU_ERR_POP(status, "error parsing hostfile\n");

    *count = total_node_count;

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* (C) 2008 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#include "hydra_utils.h"
#include "bsci.h"
#include "bscu.h"
#include "ll.h"
/*
 * Fetch this process' proxy ID from the MP_CHILD environment variable.
 *
 * If MPL_env2int() reports 0 for MP_CHILD, '*proxy_id' is set to -1 and an
 * error status is returned; otherwise '*proxy_id' holds whatever
 * MPL_env2int() stored there.
 */
HYD_status HYDT_bscd_ll_query_proxy_id(int *proxy_id)
{
    int found;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    found = MPL_env2int("MP_CHILD", proxy_id);
    if (!found) {
        *proxy_id = -1;
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "cannot find ll proxy ID\n");
    }

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment