mpid_comm_failure_ack.c 5.27 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2014 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidimpl.h"

#undef FUNCNAME
#define FUNCNAME MPID_Comm_failure_ack
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPID_Comm_failure_ack(MPID_Comm *comm_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_COMM_FAILURE_ACK);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_COMM_FAILURE_ACK);

    /* Update the list of failed processes that we know about locally.
     * This part could technically be turned off and be a correct
     * implementation, but it would be slower about propagating failure
     * information. Also, this is the failure case so speed isn't as
     * important. */
    MPIDI_CH3U_Check_for_failed_procs();

    /* Update the marker for the last known failed process in this
     * communciator. */
29
    comm_ptr->dev.last_ack_rank = MPIDI_last_known_failed;
30

31
32
33
    /* Mark the communicator as any source active */
    comm_ptr->dev.anysource_enabled = 1;

34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_COMM_FAILURE_ACK);
    return mpi_errno;
fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPID_Comm_failure_get_acked
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPID_Comm_failure_get_acked(MPID_Comm *comm_ptr, MPID_Group **group_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Group *failed_group, *comm_group;
    MPIDI_STATE_DECL(MPID_STATE_MPID_COMM_FAILURE_GET_ACKED);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_COMM_FAILURE_GET_ACKED);

    /* Get the group of all failed processes */
    MPIDI_CH3U_Check_for_failed_procs();
55
    MPIDI_CH3U_Get_failed_group(comm_ptr->dev.last_ack_rank, &failed_group);
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
    if (failed_group == MPID_Group_empty) {
        *group_ptr = MPID_Group_empty;
        goto fn_exit;
    }

    MPIR_Comm_group_impl(comm_ptr, &comm_group);

    /* Get the intersection of all falied processes in this communicator */
    MPIR_Group_intersection_impl(failed_group, comm_group, group_ptr);

    MPIR_Group_release(comm_group);
    MPIR_Group_release(failed_group);

fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_COMM_FAILURE_GET_ACKED);
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
75
76
77
78
79
80
81
82
83
84
85
86
87

#undef FUNCNAME
#define FUNCNAME MPID_Comm_failed_bitarray
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPID_Comm_failed_bitarray(MPID_Comm *comm_ptr, uint32_t **bitarray, int acked)
{
    int mpi_errno = MPI_SUCCESS;
    int size, i;
    uint32_t bit;
    int *failed_procs, *group_procs;
    MPID_Group *failed_group, *comm_group;
    MPIU_CHKLMEM_DECL(2);
88
    MPIDI_STATE_DECL(MPID_STATE_COMM_FAILED_BITARRAY);
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111

    MPIDI_FUNC_ENTER(MPID_STATE_COMM_FAILED_BITARRAY);

    /* TODO - Fix this for intercommunicators */
    size = comm_ptr->local_size;

    /* We can fit sizeof(uint32_t) * 8 ranks in one uint64_t so divide the
     * size by that */
    /* This buffer will be handed back to the calling function so we use a
     * "real" malloc here and expect the caller to free the buffer later. The
     * other buffers in this function are temporary and will be automatically
     * cleaned up at the end of the function. */
    *bitarray = (uint32_t *) MPIU_Malloc(sizeof(uint32_t) * (size / (sizeof(uint32_t) * 8)+1));
    if (!(*bitarray)) {
        fprintf(stderr, "Could not allocate space for bitarray\n");
        PMPI_Abort(MPI_COMM_WORLD, 1);
    }
    for (i = 0; i <= size/(sizeof(uint32_t)*8); i++) *bitarray[i] = 0;

    mpi_errno = MPIDI_CH3U_Check_for_failed_procs();
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    if (acked)
112
        MPIDI_CH3U_Get_failed_group(comm_ptr->dev.last_ack_rank, &failed_group);
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
    else
        MPIDI_CH3U_Get_failed_group(-2, &failed_group);

    if (failed_group == MPID_Group_empty) goto fn_exit;


    MPIU_CHKLMEM_MALLOC(group_procs, int *, sizeof(int)*failed_group->size, mpi_errno, "group_procs");
    for (i = 0; i < failed_group->size; i++) group_procs[i] = i;
    MPIU_CHKLMEM_MALLOC(failed_procs, int *, sizeof(int)*failed_group->size, mpi_errno, "failed_procs");

    MPIR_Comm_group_impl(comm_ptr, &comm_group);

    MPIR_Group_translate_ranks_impl(failed_group, failed_group->size, group_procs, comm_group, failed_procs);

    /* The bits will actually be ordered in decending order rather than
     * ascending. This is purely for readability since it makes no practical
     * difference. So if the bits look like this:
     *
     * 10001100 01001000 00000000 00000001
     *
     * Then processes 1, 5, 6, 9, 12, and 32 have failed. */
    for (i = 0; i < failed_group->size; i++) {
        bit = 0x80000000;
        bit >>= failed_procs[i] % (sizeof(uint32_t) * 8);

        *bitarray[failed_procs[i] / (sizeof(uint32_t) * 8)] |= bit;
    }

    MPIR_Group_free_impl(comm_group);

  fn_exit:
    MPIU_CHKLMEM_FREEALL();
Pavan Balaji's avatar
Pavan Balaji committed
145
    MPIDI_FUNC_EXIT(MPID_STATE_COMM_FAILED_BITARRAY);
146
147
148
149
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
150

151
152
153
154
155
156
157
158
#undef FUNCNAME
#define FUNCNAME MPID_Comm_AS_enabled
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPID_Comm_AS_enabled(MPID_Comm *comm_ptr) {
    return comm_ptr->dev.anysource_enabled;
}

159
160
161
162
163
164
165
166

#undef FUNCNAME
#define FUNCNAME MPID_Request_is_anysource
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPID_Request_is_anysource(MPID_Request *request_ptr) {
    return request_ptr->dev.match.parts.rank == MPI_ANY_SOURCE;
}