GitLab maintenance scheduled from Friday, 2021-06-18 5:00pm to Saturday, 2021-06-19 10:00pm CT - Services will be unavailable during this time.

Commit cb17024a authored by Shane Snyder

add core shutdown instrumentation

parent e984a8b5
...@@ -227,6 +227,13 @@ static void darshan_core_shutdown() ...@@ -227,6 +227,13 @@ static void darshan_core_shutdown()
int global_mod_use_count[DARSHAN_MAX_MODS] = {0}; int global_mod_use_count[DARSHAN_MAX_MODS] = {0};
darshan_record_id shared_recs[DARSHAN_CORE_MAX_RECORDS] = {0}; darshan_record_id shared_recs[DARSHAN_CORE_MAX_RECORDS] = {0};
double start_log_time; double start_log_time;
double open1, open2;
double job1, job2;
double rec1, rec2;
double mod1[DARSHAN_MAX_MODS] = {0};
double mod2[DARSHAN_MAX_MODS] = {0};
double header1, header2;
double tm_end;
long offset; long offset;
struct darshan_header log_header; struct darshan_header log_header;
MPI_File log_fh; MPI_File log_fh;
...@@ -236,6 +243,8 @@ static void darshan_core_shutdown() ...@@ -236,6 +243,8 @@ static void darshan_core_shutdown()
if(getenv("DARSHAN_INTERNAL_TIMING")) if(getenv("DARSHAN_INTERNAL_TIMING"))
internal_timing_flag = 1; internal_timing_flag = 1;
start_log_time = DARSHAN_MPI_CALL(PMPI_Wtime)();
/* disable darshan-core while we shutdown */ /* disable darshan-core while we shutdown */
DARSHAN_CORE_LOCK(); DARSHAN_CORE_LOCK();
if(!darshan_core) if(!darshan_core)
...@@ -259,8 +268,6 @@ static void darshan_core_shutdown() ...@@ -259,8 +268,6 @@ static void darshan_core_shutdown()
} }
DARSHAN_CORE_UNLOCK(); DARSHAN_CORE_UNLOCK();
start_log_time = DARSHAN_MPI_CALL(PMPI_Wtime)();
logfile_name = malloc(PATH_MAX); logfile_name = malloc(PATH_MAX);
if(!logfile_name) if(!logfile_name)
{ {
...@@ -337,8 +344,12 @@ static void darshan_core_shutdown() ...@@ -337,8 +344,12 @@ static void darshan_core_shutdown()
/* get a list of records which are shared across all processes */ /* get a list of records which are shared across all processes */
darshan_get_shared_records(final_core, shared_recs); darshan_get_shared_records(final_core, shared_recs);
if(internal_timing_flag)
open1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
/* collectively open the darshan log file */ /* collectively open the darshan log file */
ret = darshan_log_coll_open(logfile_name, &log_fh); ret = darshan_log_coll_open(logfile_name, &log_fh);
if(internal_timing_flag)
open2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
/* error out if unable to open log file */ /* error out if unable to open log file */
DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT, DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
...@@ -356,6 +367,8 @@ static void darshan_core_shutdown() ...@@ -356,6 +367,8 @@ static void darshan_core_shutdown()
return; return;
} }
if(internal_timing_flag)
job1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
/* rank 0 is responsible for writing the compressed darshan job information */ /* rank 0 is responsible for writing the compressed darshan job information */
if(my_rank == 0) if(my_rank == 0)
{ {
...@@ -397,9 +410,15 @@ static void darshan_core_shutdown() ...@@ -397,9 +410,15 @@ static void darshan_core_shutdown()
darshan_core_cleanup(final_core); darshan_core_cleanup(final_core);
return; return;
} }
if(internal_timing_flag)
job2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
if(internal_timing_flag)
rec1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
/* write the record name->id hash to the log file */ /* write the record name->id hash to the log file */
ret = darshan_log_write_record_hash(log_fh, final_core, &log_header.rec_map); ret = darshan_log_write_record_hash(log_fh, final_core, &log_header.rec_map);
if(internal_timing_flag)
rec2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
/* error out if unable to write record hash */ /* error out if unable to write record hash */
DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT, DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
...@@ -417,6 +436,7 @@ static void darshan_core_shutdown() ...@@ -417,6 +436,7 @@ static void darshan_core_shutdown()
return; return;
} }
/* TODO: would be nice to factor this out somehow ... a lot to look at */
/* loop over globally used darshan modules and: /* loop over globally used darshan modules and:
* - perform shared file reductions, if possible * - perform shared file reductions, if possible
* - get final output buffer * - get final output buffer
...@@ -443,7 +463,11 @@ static void darshan_core_shutdown() ...@@ -443,7 +463,11 @@ static void darshan_core_shutdown()
continue; continue;
} }
else if(global_mod_use_count[j] == nprocs)
if(internal_timing_flag)
mod1[i] = DARSHAN_MPI_CALL(PMPI_Wtime)();
/* if all processes used this module, prepare to do a shared file reduction */
if(global_mod_use_count[j] == nprocs)
{ {
int shared_rec_count = 0; int shared_rec_count = 0;
int rec_sz = 0; int rec_sz = 0;
...@@ -451,8 +475,6 @@ static void darshan_core_shutdown() ...@@ -451,8 +475,6 @@ static void darshan_core_shutdown()
MPI_Datatype red_type; MPI_Datatype red_type;
MPI_Op red_op; MPI_Op red_op;
/* if all processes used this module, prepare to do a shared file reduction */
/* set the shared file list for this module */ /* set the shared file list for this module */
memset(mod_shared_recs, 0, DARSHAN_CORE_MAX_RECORDS * sizeof(darshan_record_id)); memset(mod_shared_recs, 0, DARSHAN_CORE_MAX_RECORDS * sizeof(darshan_record_id));
for(j = 0; j < DARSHAN_CORE_MAX_RECORDS && shared_recs[j] != 0; j++) for(j = 0; j < DARSHAN_CORE_MAX_RECORDS && shared_recs[j] != 0; j++)
...@@ -548,8 +570,12 @@ static void darshan_core_shutdown() ...@@ -548,8 +570,12 @@ static void darshan_core_shutdown()
{ {
this_mod->mod_funcs.shutdown(); this_mod->mod_funcs.shutdown();
} }
if(internal_timing_flag)
mod2[i] = DARSHAN_MPI_CALL(PMPI_Wtime)();
} }
if(internal_timing_flag)
header1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
/* rank 0 is responsible for writing the log header */ /* rank 0 is responsible for writing the log header */
if(my_rank == 0) if(my_rank == 0)
{ {
...@@ -576,6 +602,8 @@ static void darshan_core_shutdown() ...@@ -576,6 +602,8 @@ static void darshan_core_shutdown()
darshan_core_cleanup(final_core); darshan_core_cleanup(final_core);
return; return;
} }
if(internal_timing_flag)
header2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
DARSHAN_MPI_CALL(PMPI_File_close)(&log_fh); DARSHAN_MPI_CALL(PMPI_File_close)(&log_fh);
...@@ -614,7 +642,53 @@ static void darshan_core_shutdown() ...@@ -614,7 +642,53 @@ static void darshan_core_shutdown()
if(internal_timing_flag) if(internal_timing_flag)
{ {
/* TODO: what do we want to time in new darshan version? */ double open_tm, open_slowest;
double header_tm, header_slowest;
double job_tm, job_slowest;
double rec_tm, rec_slowest;
double mod_tm[DARSHAN_MAX_MODS], mod_slowest[DARSHAN_MAX_MODS];
double all_tm, all_slowest;
tm_end = DARSHAN_MPI_CALL(PMPI_Wtime)();
open_tm = open2 - open1;
header_tm = header2 - header1;
job_tm = job2 - job1;
rec_tm = rec2 - rec1;
all_tm = tm_end - start_log_time;
for(i = 0;i < DARSHAN_MAX_MODS; i++)
{
mod_tm[i] = mod2[i] - mod1[i];
}
DARSHAN_MPI_CALL(PMPI_Reduce)(&open_tm, &open_slowest, 1,
MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
DARSHAN_MPI_CALL(PMPI_Reduce)(&header_tm, &header_slowest, 1,
MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
DARSHAN_MPI_CALL(PMPI_Reduce)(&job_tm, &job_slowest, 1,
MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
DARSHAN_MPI_CALL(PMPI_Reduce)(&rec_tm, &rec_slowest, 1,
MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
DARSHAN_MPI_CALL(PMPI_Reduce)(&all_tm, &all_slowest, 1,
MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
DARSHAN_MPI_CALL(PMPI_Reduce)(mod_tm, mod_slowest, DARSHAN_MAX_MODS,
MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if(my_rank == 0)
{
printf("#darshan:<op>\t<nprocs>\t<time>\n");
printf("darshan:log_open\t%d\t%f\n", nprocs, open_slowest);
printf("darshan:job_write\t%d\t%f\n", nprocs, job_slowest);
printf("darshan:hash_write\t%d\t%f\n", nprocs, rec_slowest);
printf("darshan:header_write\t%d\t%f\n", nprocs, rec_slowest);
for(i = 0; i < DARSHAN_MAX_MODS; i++)
{
if(global_mod_use_count[i])
printf("darshan:%s_shutdown\t%d\t%f\n", darshan_module_names[i],
nprocs, rec_slowest);
}
printf("darshan:core_shutdown\t%d\t%f\n", nprocs, all_slowest);
}
} }
return; return;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment