Commit 7be6f3a6 authored by Shane Snyder
Browse files

Merge branch 'master' into 'snyder-updates2'

# Conflicts:
#   apxc/util/darshan-apxc-logutils.c
parents d90b6875 702eeba7
...@@ -208,8 +208,8 @@ enum apmpi_f_mpiop_synctime_indices ...@@ -208,8 +208,8 @@ enum apmpi_f_mpiop_synctime_indices
/* aggregate (across all the ranks) per MPI op times */ /* aggregate (across all the ranks) per MPI op times */
#define APMPI_F_MPI_GLOBAL_COUNTERS \ #define APMPI_F_MPI_GLOBAL_COUNTERS \
Y(RANK_TOTAL_MPITIME) \ Y(MPI_TOTAL_COMM_TIME) \
Y(RANK_TOTAL_MPISYNCTIME) \ Y(MPI_TOTAL_COMM_SYNC_TIME) \
Z(APMPI_F_MPI_GLOBAL_NUM_INDICES) Z(APMPI_F_MPI_GLOBAL_NUM_INDICES)
enum apmpi_f_mpi_global_indices enum apmpi_f_mpi_global_indices
{ {
...@@ -238,7 +238,6 @@ struct darshan_apmpi_header_record ...@@ -238,7 +238,6 @@ struct darshan_apmpi_header_record
{ {
struct darshan_base_record base_rec; struct darshan_base_record base_rec;
int64_t magic; int64_t magic;
uint32_t version;
uint32_t sync_flag; uint32_t sync_flag;
double apmpi_f_variance_total_mpitime; double apmpi_f_variance_total_mpitime;
double apmpi_f_variance_total_mpisynctime; double apmpi_f_variance_total_mpisynctime;
......
...@@ -404,7 +404,6 @@ static void apmpi_runtime_initialize() ...@@ -404,7 +404,6 @@ static void apmpi_runtime_initialize()
#else #else
apmpi_runtime->header_record->sync_flag = 0; apmpi_runtime->header_record->sync_flag = 0;
#endif #endif
apmpi_runtime->header_record->version = APMPI_VER;
} }
apmpi_runtime->rec_id = darshan_core_gen_record_id("APMPI"); //record name apmpi_runtime->rec_id = darshan_core_gen_record_id("APMPI"); //record name
...@@ -478,7 +477,7 @@ static void apmpi_shared_record_variance(MPI_Comm mod_comm) ...@@ -478,7 +477,7 @@ static void apmpi_shared_record_variance(MPI_Comm mod_comm)
/* get total mpi time variances across the ranks */ /* get total mpi time variances across the ranks */
var_send_buf->n = 1; var_send_buf->n = 1;
var_send_buf->S = 0; var_send_buf->S = 0;
var_send_buf->T = apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPITIME]; var_send_buf->T = apmpi_runtime->perf_record->fglobalcounters[MPI_TOTAL_COMM_TIME];
PMPI_Reduce(var_send_buf, var_recv_buf, 1, PMPI_Reduce(var_send_buf, var_recv_buf, 1,
var_dt, var_op, 0, mod_comm); var_dt, var_op, 0, mod_comm);
...@@ -491,7 +490,7 @@ static void apmpi_shared_record_variance(MPI_Comm mod_comm) ...@@ -491,7 +490,7 @@ static void apmpi_shared_record_variance(MPI_Comm mod_comm)
/* get total mpi sync time variances across the ranks */ /* get total mpi sync time variances across the ranks */
var_send_buf->n = 1; var_send_buf->n = 1;
var_send_buf->S = 0; var_send_buf->S = 0;
var_send_buf->T = apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPISYNCTIME]; var_send_buf->T = apmpi_runtime->perf_record->fglobalcounters[MPI_TOTAL_COMM_SYNC_TIME];
PMPI_Reduce(var_send_buf, var_recv_buf, 1, PMPI_Reduce(var_send_buf, var_recv_buf, 1,
var_dt, var_op, 0, mod_comm); var_dt, var_op, 0, mod_comm);
...@@ -539,15 +538,15 @@ static void apmpi_mpi_redux( ...@@ -539,15 +538,15 @@ static void apmpi_mpi_redux(
return; return;
} }
double mpisync_time = 0.0; double mpisync_time = 0.0;
/* Compute Total MPI time per rank: RANK_TOTAL_MPITIME */ /* Compute Total MPI time per rank: MPI_TOTAL_COMM_TIME */
for (i=MPI_SEND_TOTAL_TIME; i<APMPI_F_MPIOP_TOTALTIME_NUM_INDICES; i+=3){ // times (total_time, max_time, min_time) for (i=MPI_SEND_TOTAL_TIME; i<APMPI_F_MPIOP_TOTALTIME_NUM_INDICES; i+=3){ // times (total_time, max_time, min_time)
apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPITIME] += apmpi_runtime->perf_record->fcounters[i]; apmpi_runtime->perf_record->fglobalcounters[MPI_TOTAL_COMM_TIME] += apmpi_runtime->perf_record->fcounters[i];
} }
for (i=MPI_BARRIER_TOTAL_SYNC_TIME; i<APMPI_F_MPIOP_SYNCTIME_NUM_INDICES; i++){ for (i=MPI_BARRIER_TOTAL_SYNC_TIME; i<APMPI_F_MPIOP_SYNCTIME_NUM_INDICES; i++){
mpisync_time += apmpi_runtime->perf_record->fsynccounters[i]; mpisync_time += apmpi_runtime->perf_record->fsynccounters[i];
} }
apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPITIME] += mpisync_time; apmpi_runtime->perf_record->fglobalcounters[MPI_TOTAL_COMM_TIME] += mpisync_time;
apmpi_runtime->perf_record->fglobalcounters[RANK_TOTAL_MPISYNCTIME] = mpisync_time; apmpi_runtime->perf_record->fglobalcounters[MPI_TOTAL_COMM_SYNC_TIME] = mpisync_time;
#if 0 #if 0
red_send_buf = apmpi_runtime->perf_record; red_send_buf = apmpi_runtime->perf_record;
......
...@@ -19,7 +19,6 @@ struct darshan_apmpi_header_record ...@@ -19,7 +19,6 @@ struct darshan_apmpi_header_record
{ {
struct darshan_base_record base_rec; struct darshan_base_record base_rec;
int64_t magic; int64_t magic;
uint32_t version;
uint32_t sync_flag; uint32_t sync_flag;
double apmpi_f_variance_total_mpitime; double apmpi_f_variance_total_mpitime;
double apmpi_f_variance_total_mpisynctime; double apmpi_f_variance_total_mpisynctime;
...@@ -56,7 +55,6 @@ def log_get_apmpi_record(log, mod_type, dtype='dict'): ...@@ -56,7 +55,6 @@ def log_get_apmpi_record(log, mod_type, dtype='dict'):
rec['rank'] = hdr[0].base_rec.rank rec['rank'] = hdr[0].base_rec.rank
rec['magic'] = hdr[0].magic rec['magic'] = hdr[0].magic
rec['sync_flag'] = hdr[0].sync_flag rec['sync_flag'] = hdr[0].sync_flag
rec['version'] = hdr[0].version
rec['variance_total_mpitime'] = hdr[0].apmpi_f_variance_total_mpitime rec['variance_total_mpitime'] = hdr[0].apmpi_f_variance_total_mpitime
rec['variance_total_mpisynctime'] = hdr[0].apmpi_f_variance_total_mpisynctime rec['variance_total_mpisynctime'] = hdr[0].apmpi_f_variance_total_mpisynctime
else: else:
......
...@@ -215,12 +215,12 @@ static void darshan_log_print_apmpi_rec(void *rec, char *file_name, ...@@ -215,12 +215,12 @@ static void darshan_log_print_apmpi_rec(void *rec, char *file_name,
hdr_rec = rec; hdr_rec = rec;
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec->base_rec.rank, hdr_rec->base_rec.id, hdr_rec->base_rec.rank, hdr_rec->base_rec.id,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec->apmpi_f_variance_total_mpitime, "MPI_TOTAL_COMM_TIME_VARIANCE", hdr_rec->apmpi_f_variance_total_mpitime,
"", "", ""); "", "", "");
if(hdr_rec->sync_flag) if(hdr_rec->sync_flag)
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec->base_rec.rank, hdr_rec->base_rec.id, hdr_rec->base_rec.rank, hdr_rec->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec->apmpi_f_variance_total_mpisynctime, "MPI_TOTAL_COMM_SYNC_TIME_VARIANCE", hdr_rec->apmpi_f_variance_total_mpisynctime,
"", "", ""); "", "", "");
first_rec = 0; first_rec = 0;
sync_flag = hdr_rec->sync_flag; sync_flag = hdr_rec->sync_flag;
...@@ -231,7 +231,7 @@ static void darshan_log_print_apmpi_rec(void *rec, char *file_name, ...@@ -231,7 +231,7 @@ static void darshan_log_print_apmpi_rec(void *rec, char *file_name,
DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
prf_rec->base_rec.rank, prf_rec->base_rec.id, prf_rec->base_rec.rank, prf_rec->base_rec.id,
"nodeid", prf_rec->node_name, "MPI_PROCESSOR_NAME", prf_rec->node_name,
"", "", ""); "", "", "");
for(i = 0; i < APMPI_NUM_INDICES; i++) for(i = 0; i < APMPI_NUM_INDICES; i++)
...@@ -275,10 +275,30 @@ static void darshan_log_print_apmpi_rec(void *rec, char *file_name, ...@@ -275,10 +275,30 @@ static void darshan_log_print_apmpi_rec(void *rec, char *file_name,
/*
 * Print a human-readable legend for the APMPI counters to stdout.
 *
 * Every line is emitted as a '#'-prefixed comment: first the header line
 * carrying the module format version, then the global (cross-process)
 * variance stats, the per-process summary stats, and finally the counter
 * suffixes reported for each instrumented MPI operation.
 *
 * ver: APMPI module version number, echoed in the header line only; the
 *      legend text itself does not vary with it.
 */
static void darshan_log_print_apmpi_description(int ver)
{
    printf("\n# description of APMPI counters: %d\n", ver);

    /* stats reported once per log (header record) */
    printf("# global summary stats showing the variance across all the MPI processes.\n");
    printf("# MPI_TOTAL_COMM_TIME_VARIANCE: variance in total communication time across all the processes.\n");
    printf("# MPI_TOTAL_COMM_SYNC_TIME_VARIANCE: variance in total sync time across all the processes.\n");

    /* stats reported once per process (perf record) */
    printf("# per-process summary stats based on the MPI op instrumented counters.\n");
    printf("# MPI_PROCESSOR_NAME: name of the processor used by the MPI process.\n");
    printf("# MPI_TOTAL_COMM_TIME: total communication (MPI) time of a process across all the MPI ops.\n");
    printf("# MPI_TOTAL_COMM_SYNC_TIME: total sync time of a process across all the MPI ops.\n");

    /* overview of the instrumented operation classes */
    printf("# APMPI_*: MPI operation counts.\n");
    printf("# Blocking Point-to-point, Nonblocking Point-to-point, Misc MPI operations.\n");
    printf("# Blocking Collective, Nonblocking Collective and RMA operations are instrumented.\n");
    printf("# Total MPI operations instrumented in this release: 74.\n");

    /* counter suffixes reported per instrumented operation */
    printf("# The following counters (as applicable) are reported for each instrumented operation.\n");
    printf("# CALL_COUNT: total call count for an MPI operation.\n");
    printf("# TOTAL_BYTES: total bytes (cumulative across all calls of an op) used with an MPI op.\n");
    printf("# MSG_SIZE_AGG_0_256: total bytes for all the calls of an MPI op with message size range [0, 256B].\n");
    printf("# MSG_SIZE_AGG_256_1K: total bytes for all the calls of an MPI op with message size range (256B, 1KB].\n");
    printf("# MSG_SIZE_AGG_1K_8K: total bytes for all the calls of an MPI op with message size range (1KB, 8KB].\n");
    printf("# MSG_SIZE_AGG_8K_256K: total bytes for all the calls of an MPI op with message size range (8KB, 256KB].\n");
    printf("# MSG_SIZE_AGG_256K_1M: total bytes for all the calls of an MPI op with message size range (256KB, 1MB].\n");
    printf("# MSG_SIZE_AGG_1M_PLUS: total bytes for all the calls of an MPI op with message size greater than 1MB.\n");
    printf("# TOTAL_TIME: total time (cumulative across all calls of an op) of an MPI op.\n");
    /* MIN/MAX wording was previously swapped; each name now matches its text */
    printf("# MIN_TIME: minimum time across all calls of an MPI op.\n");
    printf("# MAX_TIME: maximum time across all calls of an MPI op.\n");
    printf("# TOTAL_SYNC_TIME: total sync time (cumulative across all calls of an op) of an MPI op.\n");

    return;
}
...@@ -305,12 +325,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1, ...@@ -305,12 +325,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("- "); printf("- ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id, hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpitime, "MPI_TOTAL_COMM_TIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpitime,
"", "", ""); "", "", "");
if(sync_flag) if(sync_flag)
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id, hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpisynctime, "MPI_TOTAL_COMM_SYNC_TIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpisynctime,
"", "", ""); "", "", "");
} }
else if (!hdr_rec1) else if (!hdr_rec1)
...@@ -318,12 +338,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1, ...@@ -318,12 +338,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("+ "); printf("+ ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id, hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpitime, "MPI_TOTAL_COMM_TIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpitime,
"", "", ""); "", "", "");
if(sync_flag) if(sync_flag)
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id, hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpisynctime, "MPI_TOTAL_COMM_SYNC_TIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpisynctime,
"", "", ""); "", "", "");
} }
else else
...@@ -333,12 +353,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1, ...@@ -333,12 +353,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("- "); printf("- ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id, hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpitime, "MPI_TOTAL_COMM_TIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpitime,
"", "", ""); "", "", "");
printf("+ "); printf("+ ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id, hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id,
"RANKS_TOTAL_MPITIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpitime, "MPI_TOTAL_COMM_TIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpitime,
"", "", ""); "", "", "");
} }
if(sync_flag) if(sync_flag)
...@@ -348,12 +368,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1, ...@@ -348,12 +368,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("- "); printf("- ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id, hdr_rec1->base_rec.rank, hdr_rec1->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpisynctime, "MPI_TOTAL_COMM_SYNC_TIME_VARIANCE", hdr_rec1->apmpi_f_variance_total_mpisynctime,
"", "", ""); "", "", "");
printf("+ "); printf("+ ");
DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_F_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id, hdr_rec2->base_rec.rank, hdr_rec2->base_rec.id,
"RANKS_TOTAL_MPISYNCTIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpisynctime, "MPI_TOTAL_COMM_SYNC_TIME_VARIANCE", hdr_rec2->apmpi_f_variance_total_mpisynctime,
"", "", ""); "", "", "");
} }
} }
...@@ -366,7 +386,7 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1, ...@@ -366,7 +386,7 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("- "); printf("- ");
DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
prf_rec1->base_rec.rank, prf_rec1->base_rec.id, prf_rec1->base_rec.rank, prf_rec1->base_rec.id,
"nodeid", prf_rec1->node_name, "MPI_PROCESSOR_NAME", prf_rec1->node_name,
"", "", ""); "", "", "");
} }
else if (!prf_rec1) else if (!prf_rec1)
...@@ -374,7 +394,7 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1, ...@@ -374,7 +394,7 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("+ "); printf("+ ");
DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
prf_rec2->base_rec.rank, prf_rec2->base_rec.id, prf_rec2->base_rec.rank, prf_rec2->base_rec.id,
"nodeid", prf_rec2->node_name, "MPI_PROCESSOR_NAME", prf_rec2->node_name,
"", "", ""); "", "", "");
} }
else if (prf_rec1->node_name != prf_rec2->node_name) else if (prf_rec1->node_name != prf_rec2->node_name)
...@@ -382,12 +402,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1, ...@@ -382,12 +402,12 @@ static void darshan_log_print_apmpi_rec_diff(void *file_rec1, char *file_name1,
printf("- "); printf("- ");
DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
prf_rec1->base_rec.rank, prf_rec1->base_rec.id, prf_rec1->base_rec.rank, prf_rec1->base_rec.id,
"nodeid", prf_rec1->node_name, "MPI_PROCESSOR_NAME", prf_rec1->node_name,
"", "", ""); "", "", "");
printf("+ "); printf("+ ");
DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD], DARSHAN_S_COUNTER_PRINT(darshan_module_names[DARSHAN_APMPI_MOD],
prf_rec2->base_rec.rank, prf_rec2->base_rec.id, prf_rec2->base_rec.rank, prf_rec2->base_rec.id,
"nodeid", prf_rec2->node_name, "MPI_PROCESSOR_NAME", prf_rec2->node_name,
"", "", ""); "", "", "");
} }
int i; int i;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment