Commit 5e90e631 authored by Sudheer Chunduri
Browse files

add a counter for rank total sync time and variance

parent 1ee57973
......@@ -181,6 +181,7 @@ enum apmpi_f_mpiop_totalsync_indices
/*
 * X-macro list of the per-rank "global" floating-point counters. Each name
 * is used as an index into perf_record->fglobalcounters[]:
 *
 *   RANK_TOTAL_MPITIME     - this rank's total MPI time: the sum of the
 *                            per-operation total-time counters plus the
 *                            total sync time.
 *   RANK_TOTAL_MPISYNCTIME - this rank's total sync time: the sum of the
 *                            per-operation sync-time counters.
 *
 * Both values are later reduced across ranks to derive the variances stored
 * in the header record.
 *
 * Y() wraps each counter name and Z() wraps the trailing count sentinel
 * (APMPI_F_GLOBAL_NUM_INDICES); their definitions are not shown here --
 * presumably they are redefined at each expansion site (e.g. to build
 * enum apmpi_f_mpi_global_indices). New counters must be added before the
 * Z() entry so the sentinel remains last.
 */
#define APMPI_F_MPI_GLOBAL_COUNTERS \
Y(RANK_TOTAL_MPITIME) \
Y(RANK_TOTAL_MPISYNCTIME) \
Z(APMPI_F_GLOBAL_NUM_INDICES)
enum apmpi_f_mpi_global_indices
{
......@@ -209,6 +210,7 @@ struct darshan_apmpi_header_record
struct darshan_base_record base_rec;
int64_t magic;
double apmpi_f_variance_total_mpitime;
double apmpi_f_variance_total_mpisynctime;
uint64_t appid;
};
......
......@@ -419,7 +419,7 @@ static void apmpi_shared_record_variance(MPI_Comm mod_comm)
return;
}
/* get total i/o time variances for shared records */
/* get total mpi time variances across the ranks */
var_send_buf->n = 1;
var_send_buf->S = 0;
var_send_buf->T = apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPITIME];
......@@ -432,6 +432,19 @@ static void apmpi_shared_record_variance(MPI_Comm mod_comm)
apmpi_runtime->header_record->apmpi_f_variance_total_mpitime =
(var_recv_buf->S / var_recv_buf->n);
}
/* get total mpi sync time variances across the ranks */
var_send_buf->n = 1;
var_send_buf->S = 0;
var_send_buf->T = apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPISYNCTIME];
PMPI_Reduce(var_send_buf, var_recv_buf, 1,
var_dt, var_op, 0, mod_comm);
if(my_rank == 0)
{
apmpi_runtime->header_record->apmpi_f_variance_total_mpisynctime =
(var_recv_buf->S / var_recv_buf->n);
}
PMPI_Type_free(&var_dt);
PMPI_Op_free(&var_op);
......@@ -471,14 +484,16 @@ static void apmpi_mpi_redux(
APMPI_UNLOCK();
return;
}
double mpisync_time = 0.0;
/* Compute Total MPI time per rank: RANK_TOTAL_MPITIME */
for (i=MPI_SEND_TOTAL_TIME; i<APMPI_F_NUM_INDICES; i+=3){
apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPITIME] += apmpi_runtime->perf_record->fcounters[i];
}
for (i=MPI_BARRIER_TOTAL_SYNC_TIME; i<APMPI_F_SYNC_NUM_INDICES; i++){
apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPITIME] += apmpi_runtime->perf_record->fsynccounters[i];
mpisync_time += apmpi_runtime->perf_record->fsynccounters[i];
}
apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPITIME] += mpisync_time;
apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPISYNCTIME] = mpisync_time;
#if 0
red_send_buf = apmpi_runtime->perf_record;
......
......@@ -135,6 +135,7 @@ static int darshan_log_get_apmpi_rec(darshan_fd fd, void** buf_p)
DARSHAN_BSWAP64(&(hdr_rec->base_rec.id));
DARSHAN_BSWAP64(&(hdr_rec->base_rec.rank));
DARSHAN_BSWAP64(&(hdr_rec->apmpi_f_variance_total_mpitime));
DARSHAN_BSWAP64(&(hdr_rec->apmpi_f_variance_total_mpisynctime));
DARSHAN_BSWAP64(&(hdr_rec->appid));
}
else
......@@ -217,6 +218,10 @@ static void darshan_log_print_apmpi_rec(void *rec, char *file_name,
hdr_rec->base_rec.rank, hdr_rec->base_rec.id,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec->apmpi_f_variance_total_mpitime,
"", "", "");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[APMPI_MOD],
hdr_rec->base_rec.rank, hdr_rec->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec->apmpi_f_variance_total_mpisynctime,
"", "", "");
first_rec = 0;
}
else
......@@ -293,6 +298,10 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpitime,
"", "", "");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[APMPI_MOD],
hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpisynctime,
"", "", "");
}
else if (!hdr_rec1)
{
......@@ -305,6 +314,10 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpitime,
"", "", "");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[APMPI_MOD],
hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpisynctime,
"", "", "");
}
else
{
......@@ -333,6 +346,19 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpitime,
"", "", "");
}
if (hdr_rec1->apmpi_f_variance_total_mpisynctime != hdr_rec2->apmpi_f_variance_total_mpisynctime)
{
printf("- ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[APMPI_MOD],
hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpisynctime,
"", "", "");
printf("+ ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[APMPI_MOD],
hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpisynctime,
"", "", "");
}
}
}
else
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment