Commit 7be6f3a6 authored by Shane Snyder
Browse files

Merge branch 'master' into 'snyder-updates2'

# Conflicts:
#   apxc/util/darshan-apxc-logutils.c
parents d90b6875 702eeba7
......@@ -208,8 +208,8 @@ enum apmpi_f_mpiop_synctime_indices
/* aggregate (across all the ranks) per MPI op times */
#define APMPI_F_MPI_GLOBAL_COUNTERS \
Y(RANK_TOTAL_MPITIME) \
Y(RANK_TOTAL_MPISYNCTIME) \
Y(MPI_TOTAL_COMM_TIME) \
Y(MPI_TOTAL_COMM_SYNC_TIME) \
Z(APMPI_F_MPI_GLOBAL_NUM_INDICES)
enum apmpi_f_mpi_global_indices
{
......@@ -238,7 +238,6 @@ struct darshan_apmpi_header_record
{
struct darshan_base_record base_rec;
int64_t magic;
uint32_t version;
uint32_t sync_flag;
double apmpi_f_variance_total_mpitime;
double apmpi_f_variance_total_mpisynctime;
......
......@@ -404,7 +404,6 @@ static void apmpi_runtime_initialize()
#else
apmpi_runtime->header_record->sync_flag = 0;
#endif
apmpi_runtime->header_record->version = APMPI_VER;
}
apmpi_runtime->rec_id = darshan_core_gen_record_id("APMPI"); //record name
......@@ -478,7 +477,7 @@ static void apmpi_shared_record_variance(MPI_Comm mod_comm)
/* get total mpi time variances across the ranks */
var_send_buf->n = 1;
var_send_buf->S = 0;
var_send_buf->T = apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPITIME];
var_send_buf->T = apmpi_runtime->perf_record->fglobalcounters[MPI_TOTAL_COMM_TIME];
PMPI_Reduce(var_send_buf, var_recv_buf, 1,
var_dt, var_op, 0, mod_comm);
......@@ -491,7 +490,7 @@ static void apmpi_shared_record_variance(MPI_Comm mod_comm)
/* get total mpi sync time variances across the ranks */
var_send_buf->n = 1;
var_send_buf->S = 0;
var_send_buf->T = apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPISYNCTIME];
var_send_buf->T = apmpi_runtime->perf_record->fglobalcounters[MPI_TOTAL_COMM_SYNC_TIME];
PMPI_Reduce(var_send_buf, var_recv_buf, 1,
var_dt, var_op, 0, mod_comm);
......@@ -539,15 +538,15 @@ static void apmpi_mpi_redux(
return;
}
double mpisync_time = 0.0;
/* Compute Total MPI time per rank: RANK_TOTAL_MPITIME */
/* Compute Total MPI time per rank: MPI_TOTAL_COMM_TIME */
for (i=MPI_SEND_TOTAL_TIME; i<APMPI_F_MPIOP_TOTALTIME_NUM_INDICES; i+=3){ // times (total_time, max_time, min_time)
apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPITIME] += apmpi_runtime->perf_record->fcounters[i];
apmpi_runtime->perf_record->fglobalcounters[MPI_TOTAL_COMM_TIME] += apmpi_runtime->perf_record->fcounters[i];
}
for (i=MPI_BARRIER_TOTAL_SYNC_TIME; i<APMPI_F_MPIOP_SYNCTIME_NUM_INDICES; i++){
mpisync_time += apmpi_runtime->perf_record->fsynccounters[i];
}
apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPITIME] += mpisync_time;
apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPISYNCTIME] = mpisync_time;
apmpi_runtime->perf_record->fglobalcounters[MPI_TOTAL_COMM_TIME] += mpisync_time;
apmpi_runtime->perf_record->fglobalcounters[MPI_TOTAL_COMM_SYNC_TIME] = mpisync_time;
#if 0
red_send_buf = apmpi_runtime->perf_record;
......
......@@ -19,7 +19,6 @@ struct darshan_apmpi_header_record
{
struct darshan_base_record base_rec;
int64_t magic;
uint32_t version;
uint32_t sync_flag;
double apmpi_f_variance_total_mpitime;
double apmpi_f_variance_total_mpisynctime;
......@@ -56,7 +55,6 @@ def log_get_apmpi_record(log, mod_type, dtype='dict'):
rec['rank'] = hdr[0].base_rec.rank
rec['magic'] = hdr[0].magic
rec['sync_flag'] = hdr[0].sync_flag
rec['version'] = hdr[0].version
rec['variance_total_mpitime'] = hdr[0].apmpi_f_variance_total_mpitime
rec['variance_total_mpisynctime'] = hdr[0].apmpi_f_variance_total_mpisynctime
else:
......
......@@ -215,12 +215,12 @@ static void darshan_log_print_apmpi_rec(void *rec, char *file_name,
hdr_rec = rec;
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec->base_rec.rank, hdr_rec->base_rec.id,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec->apmpi_f_variance_total_mpitime,
"MPI_TOTAL_COMM_TIME_VARIANCE", hdr_rec->apmpi_f_variance_total_mpitime,
"", "", "");
if(hdr_rec->sync_flag)
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec->base_rec.rank, hdr_rec->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec->apmpi_f_variance_total_mpisynctime,
"MPI_TOTAL_COMM_SYNC_TIME_VARIANCE", hdr_rec->apmpi_f_variance_total_mpisynctime,
"", "", "");
first_rec = 0;
sync_flag = hdr_rec->sync_flag;
......@@ -231,7 +231,7 @@ static void darshan_log_print_apmpi_rec(void *rec, char *file_name,
DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
prf_rec->base_rec.rank, prf_rec->base_rec.id,
"nodeid", prf_rec->node_name,
"MPI_PROCESSOR_NAME", prf_rec->node_name,
"", "", "");
for(i = 0; i < APMPI_NUM_INDICES; i++)
......@@ -275,10 +275,30 @@ static void darshan_log_print_apmpi_rec(void *rec, char *file_name,
/*
 * Print a '#'-prefixed, human-readable legend of the APMPI counters to
 * stdout: first a heading line that echoes `ver`, then one line per
 * counter (or counter family) explaining what it reports.
 *
 * ver: integer echoed in the heading line.
 *      NOTE(review): presumably the APMPI log-format version -- confirm
 *      against the caller.
 *
 * Fixes relative to the previous text:
 *   - MIN_TIME / MAX_TIME descriptions were swapped (MIN_TIME was described
 *     as the maximum and MAX_TIME as the minimum).
 *   - "opeations" typo corrected to "operations".
 *   - stale commented-out router-counter lines removed.
 */
static void darshan_log_print_apmpi_description(int ver)
{
    /* Legend body; each entry already carries its trailing newline. */
    static const char *const legend[] = {
        "# global summary stats showing the variance across all the MPI processes.\n",
        "# MPI_TOTAL_COMM_TIME_VARIANCE: variance in total communication time across all the processes.\n",
        "# MPI_TOTAL_COMM_SYNC_TIME_VARIANCE: variance in total sync time across all the processes.\n",
        "# per-process summary stats based on the MPI op instrumented counters.\n",
        "# MPI_PROCESSOR_NAME: name of the processor used by the MPI process.\n",
        "# MPI_TOTAL_COMM_TIME: total communication (MPI) time of a process across all the MPI ops.\n",
        "# MPI_TOTAL_COMM_SYNC_TIME: total sync time of a process across all the MPI ops.\n",
        "# APMPI_*: MPI operation counts.\n",
        "# Blocking Point-to-point, Nonblocking Point-to-point, Misc MPI operations.\n",
        "# Blocking Collective, Nonblocking Collective and RMA operations are instrumented.\n",
        "# Total MPI operations instrumented in this release: 74.\n",
        "# The following counters (as applicable) are reported for each instrumented operation.\n",
        "# CALL_COUNT: total call count for an MPI operation.\n",
        "# TOTAL_BYTES: total bytes (cumulative across all calls of an op) used with an MPI op.\n",
        "# MSG_SIZE_AGG_0_256: total bytes for all the calls of an MPI op with message size range [0, 256B].\n",
        "# MSG_SIZE_AGG_256_1K: total bytes for all the calls of an MPI op with message size range (256B, 1KB].\n",
        "# MSG_SIZE_AGG_1K_8K: total bytes for all the calls of an MPI op with message size range (1KB, 8KB].\n",
        "# MSG_SIZE_AGG_8K_256K: total bytes for all the calls of an MPI op with message size range (8KB, 256KB].\n",
        "# MSG_SIZE_AGG_256K_1M: total bytes for all the calls of an MPI op with message size range (256KB, 1MB].\n",
        "# MSG_SIZE_AGG_1M_PLUS: total bytes for all the calls of an MPI op with message size greater than 1MB.\n",
        "# TOTAL_TIME: total time (cumulative across all calls of an op) of an MPI op.\n",
        "# MIN_TIME: minimum time across all calls of an MPI op.\n",
        "# MAX_TIME: maximum time across all calls of an MPI op.\n",
        "# TOTAL_SYNC_TIME: total sync time (cumulative across all calls of an op) of an MPI op.\n",
    };
    size_t idx;

    printf("\n# description of APMPI counters: %d\n", ver);

    /* fputs() rather than printf(): the legend text is not a format string. */
    for (idx = 0; idx < sizeof(legend) / sizeof(legend[0]); idx++)
        fputs(legend[idx], stdout);
}
......@@ -305,12 +325,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("- ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpitime,
"MPI_TOTAL_COMM_TIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpitime,
"", "", "");
if(sync_flag)
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpisynctime,
"MPI_TOTAL_COMM_SYNC_TIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpisynctime,
"", "", "");
}
else if (!hdr_rec1)
......@@ -318,12 +338,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("+ ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpitime,
"MPI_TOTAL_COMM_TIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpitime,
"", "", "");
if(sync_flag)
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpisynctime,
"MPI_TOTAL_COMM_SYNC_TIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpisynctime,
"", "", "");
}
else
......@@ -333,12 +353,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("- ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpitime,
"MPI_TOTAL_COMM_TIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpitime,
"", "", "");
printf("+ ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpitime,
"MPI_TOTAL_COMM_TIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpitime,
"", "", "");
}
if(sync_flag)
......@@ -348,12 +368,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("- ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpisynctime,
"MPI_TOTAL_COMM_SYNC_TIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpisynctime,
"", "", "");
printf("+ ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpisynctime,
"MPI_TOTAL_COMM_SYNC_TIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpisynctime,
"", "", "");
}
}
......@@ -366,7 +386,7 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("- ");
DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
prf_rec1->base_rec.rank, prf_rec1->base_rec.id,
"nodeid", prf_rec1->node_name,
"MPI_PROCESSOR_NAME", prf_rec1->node_name,
"", "", "");
}
else if (!prf_rec1)
......@@ -374,7 +394,7 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("+ ");
DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
prf_rec2->base_rec.rank, prf_rec2->base_rec.id,
"nodeid", prf_rec2->node_name,
"MPI_PROCESSOR_NAME", prf_rec2->node_name,
"", "", "");
}
else if (prf_rec1->node_name != prf_rec2->node_name)
......@@ -382,12 +402,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("- ");
DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
prf_rec1->base_rec.rank, prf_rec1->base_rec.id,
"nodeid", prf_rec1->node_name,
"MPI_PROCESSOR_NAME", prf_rec1->node_name,
"", "", "");
printf("+ ");
DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
prf_rec2->base_rec.rank, prf_rec2->base_rec.id,
"nodeid", prf_rec2->node_name,
"MPI_PROCESSOR_NAME", prf_rec2->node_name,
"", "", "");
}
int i;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment