ckpoint.c 5.71 KB
Newer Older
1
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2009 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

Pavan Balaji's avatar
Pavan Balaji committed
7
#include "hydra.h"
8
#include "ckpoint.h"
9
10
11
12
#include "hydt_ftb.h"
#ifdef HAVE_PTHREAD_H
#include "pthread.h"
#endif
13

14
15
16
17
#if defined HAVE_BLCR
#include "blcr/ckpoint_blcr.h"
#endif /* HAVE_BLCR */

18
19
20
/* Handle of the most recently created checkpoint thread.  It is filled in by
 * pthread_create() and later passed to pthread_join() in
 * HYDT_ckpoint_checkpoint(). */
#ifdef HAVE_PTHREADS
static pthread_t thread;
#endif
21
/* State of the background checkpoint:
 *   HYDT_CKPOINT_NONE     - set by HYDT_ckpoint_init(); nothing to join
 *   HYDT_CKPOINT_RUNNING  - set just before the checkpoint thread is created
 *   HYDT_CKPOINT_FINISHED - set by the checkpoint thread when it exits; the
 *                           thread still has to be joined */
static enum {
    HYDT_CKPOINT_NONE,
    HYDT_CKPOINT_RUNNING,
    HYDT_CKPOINT_FINISHED
} in_ckpt;
22
/* Global checkpoint settings used by the functions in this file: the selected
 * library name (ckpointlib), the current checkpoint number (ckpoint_num), and
 * the prefix/pgid/id handed to the checkpoint thread. */
struct HYDT_ckpoint_info HYDT_ckpoint_info;
23

24
HYD_status HYDT_ckpoint_init(const char *user_ckpointlib, int user_ckpoint_num)
25
{
26
    HYD_status status = HYD_SUCCESS;
27
28
29

    HYDU_FUNC_ENTER();

30
31
    if (user_ckpointlib)
        HYDT_ckpoint_info.ckpointlib = user_ckpointlib;
Pavan Balaji's avatar
Pavan Balaji committed
32
    else if (MPL_env2str("HYDRA_CKPOINTLIB", (const char **) &HYDT_ckpoint_info.ckpointlib) == 0)
33
#ifdef HYDRA_DEFAULT_CKPOINTLIB
34
        HYDT_ckpoint_info.ckpointlib = HYDRA_DEFAULT_CKPOINTLIB;
35
36
37
38
#else
    {
        /* If there is no default checkpointlib, we bail out */
        HYDT_ckpoint_info.ckpointlib = NULL;
39
        goto fn_exit;
40
41
42
    }
#endif

43

44
    HYDT_ckpoint_info.ckpoint_num = (user_ckpoint_num == -1) ? 0 : user_ckpoint_num;
45
    in_ckpt = HYDT_CKPOINT_NONE;
46

47
#if defined HAVE_BLCR
48
49
    if (!strcmp(HYDT_ckpoint_info.ckpointlib, "blcr")) {
        status = HYDT_ckpoint_blcr_init();
50
        HYDU_ERR_POP(status, "blcr checkpoint returned error\n");
51
        goto fn_exit;
52
    }
53
54
#endif /* HAVE_BLCR */

55
56
    HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "unrecognized ckpoint library\n");

57
  fn_exit:
58
59
60
    HYDU_FUNC_EXIT();
    return status;

61
  fn_fail:
62
63
64
    goto fn_exit;
}

65
66
67
68
69
#ifdef HAVE_PTHREADS
/*
 * Entry point of the checkpoint thread created by HYDT_ckpoint_checkpoint().
 *
 * Takes one checkpoint using the prefix, pgid, id and ckpoint_num stored in
 * HYDT_ckpoint_info, publishes an FTB event describing the outcome, and on
 * success advances ckpoint_num.  In all cases in_ckpt is set to
 * HYDT_CKPOINT_FINISHED before returning so the thread can be joined.
 *
 * arg is unused.  The return value is the HYD_status of the checkpoint cast
 * to a pointer.
 *
 * NOTE(review): in_ckpt and HYDT_ckpoint_info are shared with the thread that
 * launched this one without any synchronization visible in this file --
 * confirm that this is acceptable.
 */
static void *ckpoint_thread(void *arg)
{
    HYD_status status = HYD_SUCCESS;
    /* Start from an empty string: nothing in this function fills in the
     * payload, and it must not be published with indeterminate contents */
    char ftb_event_payload[HYDT_FTB_MAX_PAYLOAD_DATA] = "";

    (void) arg;

#if defined HAVE_BLCR
    /* HYDT_ckpoint_init() leaves ckpointlib NULL when no library was
     * configured; strcmp() on NULL would be undefined behavior */
    HYDU_ERR_CHKANDJUMP(status, HYDT_ckpoint_info.ckpointlib == NULL, HYD_FAILURE,
                        "no checkpoint library configured\n");

    if (!strcmp(HYDT_ckpoint_info.ckpointlib, "blcr")) {
        status =
            HYDT_ckpoint_blcr_checkpoint(HYDT_ckpoint_info.prefix, HYDT_ckpoint_info.pgid,
                                         HYDT_ckpoint_info.id, HYDT_ckpoint_info.ckpoint_num);
        HYDU_ERR_POP(status, "blcr checkpoint returned error\n");
    }
#endif /* HAVE_BLCR */

    HYDT_ftb_publish("FTB_MPI_PROCS_CKPTED", ftb_event_payload);

    ++HYDT_ckpoint_info.ckpoint_num;

  fn_exit:
    in_ckpt = HYDT_CKPOINT_FINISHED;
    return (void *) (long) status;

  fn_fail:
    HYDT_ftb_publish("FTB_MPI_PROCS_CKPT_FAIL", ftb_event_payload);
    goto fn_exit;
}
#endif
94
95


96
HYD_status HYDT_ckpoint_checkpoint(int pgid, int id, const char *user_ckpoint_prefix)
97
{
98
#ifdef HAVE_PTHREADS
99
    HYD_status status = HYD_SUCCESS;
100
101
    struct stat st;
    int ret;
102

103
104
    HYDU_FUNC_ENTER();

105
106
107
108
109
110
111
    HYDU_ASSERT(user_ckpoint_prefix, status);

    ret = stat(user_ckpoint_prefix, &st);
    HYDU_ERR_CHKANDJUMP(status, ret, HYD_FAILURE,
                        "Failed to stat checkpoint prefix \"%s\": %s\n",
                        user_ckpoint_prefix, strerror(errno));
    HYDU_ERR_CHKANDJUMP(status, !S_ISDIR(st.st_mode), HYD_FAILURE,
112
                        "checkpoint prefix \"%s\" is not a directory.\n", user_ckpoint_prefix);
113

114
115
    HYDU_ERR_CHKANDJUMP(status, in_ckpt == HYDT_CKPOINT_RUNNING, HYD_FAILURE,
                        "Previous checkpoint has not completed.");
116
117

    /* if another ckpoint thread had started and finished, we need to
118
     * join with it to free resources */
119
120
    if (in_ckpt == HYDT_CKPOINT_FINISHED) {
        ret = pthread_join(thread, NULL);
Pavan Balaji's avatar
Pavan Balaji committed
121
        HYDU_ERR_CHKANDJUMP(status, ret, HYD_FAILURE, "pthread_join failed: %s.", strerror(ret));
122
    }
123

124
125
126
127
128
129
130
    /* set state, and start the thread to do the checkpoint */
    in_ckpt = HYDT_CKPOINT_RUNNING;
    HYDT_ckpoint_info.prefix = user_ckpoint_prefix;
    HYDT_ckpoint_info.pgid = pgid;
    HYDT_ckpoint_info.id = id;
    ret = pthread_create(&thread, NULL, ckpoint_thread, NULL);
    HYDU_ERR_CHKANDJUMP(status, ret, HYD_FAILURE, "pthread_create failed: %s.", strerror(ret));
131

132
  fn_exit:
133
134
135
    HYDU_FUNC_EXIT();
    return status;

136
  fn_fail:
137
    goto fn_exit;
138
139
140
141
#else
    HYD_status status = HYD_SUCCESS;
    HYDU_FUNC_ENTER();
    HYDU_ERR_SETANDJUMP(status, HYD_FAILURE, "pthreads required for checkpointing");
142
143
  fn_exit:
  fn_fail:
144
145
146
    HYDU_FUNC_EXIT();
    return status;
#endif
147
148
}

149
HYD_status HYDT_ckpoint_restart(int pgid, int id, struct HYD_env * envlist, int num_ranks,
150
151
                                int ranks[], int *in, int *out, int *err, int *pid,
                                const char *user_ckpoint_prefix)
152
{
153
    HYD_status status = HYD_SUCCESS;
154
155
    struct stat st;
    int ret;
156
    char ftb_event_payload[HYDT_FTB_MAX_PAYLOAD_DATA];
157
158
159

    HYDU_FUNC_ENTER();

160
    HYDU_ASSERT(user_ckpoint_prefix, status);
161

162
163
164
165
166
    ret = stat(user_ckpoint_prefix, &st);
    HYDU_ERR_CHKANDJUMP(status, ret, HYD_FAILURE,
                        "Failed to stat checkpoint prefix \"%s\": %s\n",
                        user_ckpoint_prefix, strerror(errno));
    HYDU_ERR_CHKANDJUMP(status, !S_ISDIR(st.st_mode), HYD_FAILURE,
167
                        "checkpoint prefix \"%s\" is not a directory.\n", user_ckpoint_prefix);
168

169
#if defined HAVE_BLCR
170
    if (!strcmp(HYDT_ckpoint_info.ckpointlib, "blcr")) {
171
        status =
172
            HYDT_ckpoint_blcr_restart(user_ckpoint_prefix, pgid, id,
173
                                      HYDT_ckpoint_info.ckpoint_num, envlist, num_ranks, ranks,
174
                                      in, out, err, pid);
175
        HYDU_ERR_POP(status, "blcr restart returned error\n");
176
    }
177
178
#endif /* HAVE_BLCR */

179
180
    HYDT_ftb_publish("FTB_MPI_PROCS_RESTARTED", ftb_event_payload);

181
182
    /* next checkpoint number should be the one after the one we restarted from */
    ++HYDT_ckpoint_info.ckpoint_num;
183

184
  fn_exit:
185
186
187
    HYDU_FUNC_EXIT();
    return status;

188
  fn_fail:
189
    HYDT_ftb_publish("FTB_MPI_PROCS_RESTART_FAIL", ftb_event_payload);
190
191
    goto fn_exit;
}