Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Rob Latham
MPICH-BlueGene
Commits
6902be2e
Commit
6902be2e
authored
Oct 29, 2010
by
Pavan Balaji
Browse files
[svn-r7397] Updates to the FTB support to include CKPTED, CKPT_FAIL, RESTARTED,
RESTART_FAIL, PROCS_DEAD.
parent
a58f5a65
Changes
1
Hide whitespace changes
Inline
Side-by-side
src/pm/hydra/pm/pmiserv/pmip_cb.c
View file @
6902be2e
...
...
@@ -258,6 +258,7 @@ static HYD_status pmi_cb(int fd, HYD_event_t events, void *userp)
struct
HYD_pmcd_pmi_hdr
hdr
;
enum
HYD_pmcd_pmi_cmd
cmd
;
struct
HYD_pmcd_pmip_pmi_handle
*
h
;
char
ftb_event_payload
[
HYDT_FTB_MAX_PAYLOAD_DATA
];
HYD_status
status
=
HYD_SUCCESS
;
HYDU_FUNC_ENTER
();
...
...
@@ -281,6 +282,12 @@ static HYD_status pmi_cb(int fd, HYD_event_t events, void *userp)
* applications, this is harder to identify, so we just let
* the user clean up the processes on a failure. */
if
(
using_pmi_port
||
HYD_pmcd_pmip
.
downstream
.
pmi_fd_active
[
i
])
{
MPL_snprintf
(
ftb_event_payload
,
HYDT_FTB_MAX_PAYLOAD_DATA
,
"pgid:%d rank:%d"
,
HYD_pmcd_pmip
.
local
.
pgid
,
HYD_pmcd_pmip
.
downstream
.
pmi_rank
[
i
]);
status
=
HYDT_ftb_publish
(
"FTB_MPI_PROCS_DEAD"
,
ftb_event_payload
);
HYDU_ERR_POP
(
status
,
"FTB publish failed
\n
"
);
if
(
HYD_pmcd_pmip
.
user_global
.
auto_cleanup
)
{
HYD_pmcd_pmip_killjob
();
...
...
@@ -528,6 +535,10 @@ static HYD_status launch_procs(void)
HYDU_ERR_POP
(
status
,
"unable to create env
\n
"
);
/* Restart the proxy. Specify stdin fd only if pmi_rank 0 is in this proxy. */
MPL_snprintf
(
ftb_event_payload
,
HYDT_FTB_MAX_PAYLOAD_DATA
,
"pgid:%d ranks:%d-%d"
,
HYD_pmcd_pmip
.
local
.
pgid
,
HYD_pmcd_pmip
.
downstream
.
pmi_rank
[
0
],
HYD_pmcd_pmip
.
downstream
.
pmi_rank
[
HYD_pmcd_pmip
.
local
.
proxy_process_count
-
1
]);
status
=
HYDT_ckpoint_restart
(
HYD_pmcd_pmip
.
local
.
pgid
,
HYD_pmcd_pmip
.
local
.
id
,
env
,
HYD_pmcd_pmip
.
local
.
proxy_process_count
,
pmi_ranks
,
...
...
@@ -537,14 +548,11 @@ static HYD_status launch_procs(void)
HYD_pmcd_pmip
.
downstream
.
out
,
HYD_pmcd_pmip
.
downstream
.
err
,
HYD_pmcd_pmip
.
downstream
.
pid
);
MPL_snprintf
(
ftb_event_payload
,
HYDT_FTB_MAX_PAYLOAD_DATA
,
"pgid:%d ranks:%d-%d"
,
HYD_pmcd_pmip
.
local
.
pgid
,
pmi_ranks
[
0
],
pmi_ranks
[
HYD_pmcd_pmip
.
local
.
proxy_process_count
-
1
]);
if
(
status
)
{
status
=
HYDT_ftb_publish
(
"FTB_MPI_CKPOINT_FAILED"
,
ftb_event_payload
);
HYDU_ERR_POP
(
status
,
"checkpoint restart FTB publishing failure
\n
"
);
}
status
=
HYDT_ftb_publish
(
"FTB_MPI_CKPOINTED"
,
ftb_event_payload
);
if
(
status
)
status
=
HYDT_ftb_publish
(
"FTB_MPI_PROCS_RESTART_FAIL"
,
ftb_event_payload
);
else
status
=
HYDT_ftb_publish
(
"FTB_MPI_PROCS_RESTARTED"
,
ftb_event_payload
);
HYDU_ERR_POP
(
status
,
"checkpoint restart FTB publishing failure
\n
"
);
goto
fn_spawn_complete
;
}
...
...
@@ -882,6 +890,7 @@ HYD_status HYD_pmcd_pmip_control_cmd_cb(int fd, HYD_event_t events, void *userp)
{
int
cmd_len
,
closed
;
enum
HYD_pmcd_pmi_cmd
cmd
;
char
ftb_event_payload
[
HYDT_FTB_MAX_PAYLOAD_DATA
];
HYD_status
status
=
HYD_SUCCESS
;
HYDU_FUNC_ENTER
();
...
...
@@ -908,7 +917,18 @@ HYD_status HYD_pmcd_pmip_control_cmd_cb(int fd, HYD_event_t events, void *userp)
}
else
if
(
cmd
==
CKPOINT
)
{
HYD_pmcd_pmi_proxy_dump
(
status
,
STDOUT_FILENO
,
"requesting checkpoint
\n
"
);
MPL_snprintf
(
ftb_event_payload
,
HYDT_FTB_MAX_PAYLOAD_DATA
,
"pgid:%d ranks:%d-%d"
,
HYD_pmcd_pmip
.
local
.
pgid
,
HYD_pmcd_pmip
.
downstream
.
pmi_rank
[
0
],
HYD_pmcd_pmip
.
downstream
.
pmi_rank
[
HYD_pmcd_pmip
.
local
.
proxy_process_count
-
1
]);
status
=
HYDT_ckpoint_suspend
(
HYD_pmcd_pmip
.
local
.
pgid
,
HYD_pmcd_pmip
.
local
.
id
);
if
(
status
)
status
=
HYDT_ftb_publish
(
"FTB_MPI_PROCS_CKPT_FAIL"
,
ftb_event_payload
);
else
status
=
HYDT_ftb_publish
(
"FTB_MPI_PROCS_CKPTED"
,
ftb_event_payload
);
HYDU_ERR_POP
(
status
,
"FTB publishing failure
\n
"
);
HYDU_ERR_POP
(
status
,
"checkpoint suspend failed
\n
"
);
HYD_pmcd_pmi_proxy_dump
(
status
,
STDOUT_FILENO
,
"checkpoint completed
\n
"
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment