Commit 01812211 authored by Shane Snyder
Browse files

revamped error handling runtime side

parent 5348600b
...@@ -30,25 +30,30 @@ ...@@ -30,25 +30,30 @@
/* max length of exe string within job record (not counting '\0') */ /* max length of exe string within job record (not counting '\0') */
#define CP_EXE_LEN (CP_JOB_RECORD_SIZE - sizeof(struct darshan_job) - 1) #define CP_EXE_LEN (CP_JOB_RECORD_SIZE - sizeof(struct darshan_job) - 1)
/* max length of module name string (not counting '\0') */
/* TODO */
#define DARSHAN_MOD_NAME_LEN 31
typedef uint64_t darshan_record_id; typedef uint64_t darshan_record_id;
/* unique identifiers to distinguish between available darshan modules */ /* unique identifiers to distinguish between available darshan modules */
/* NOTES: - valid ids range from [0...DARSHAN_MAX_MODS-1] /* NOTES: - valid ids range from [0...DARSHAN_MAX_MODS-1]
* - order of ids control module shutdown order (and consequently, order in log file) * - order of ids control module shutdown order (and consequently, order in log file)
*/ */
/* TODO: enforce maximum? */
#define DARSHAN_MAX_MODS 16 #define DARSHAN_MAX_MODS 16
typedef enum typedef enum
{ {
DARSHAN_POSIX_MOD, DARSHAN_POSIX_MOD = 0,
DARSHAN_MPIIO_MOD, DARSHAN_MPIIO_MOD,
DARSHAN_HDF5_MOD, DARSHAN_HDF5_MOD,
DARSHAN_PNETCDF_MOD, DARSHAN_PNETCDF_MOD,
} darshan_module_id; } darshan_module_id;
/* Printable names of the darshan modules, looked up by module id when
 * reporting errors. Entries are listed in the same order as the
 * darshan_module_id enum values, so a module id is used directly as the
 * index into this table.
 *
 * NOTE(review): only four names are defined while module ids may range up
 * to DARSHAN_MAX_MODS-1; indexing with an id that has no entry here would
 * read past the end of the array -- confirm callers only index ids that
 * have a name, and add a name whenever a new module id is added.
 */
static char *darshan_module_names[] =
{
"POSIX",
"MPI-IO",
"HDF5",
"PNETCDF"
};
enum darshan_comp_type enum darshan_comp_type
{ {
DARSHAN_GZ_COMP, DARSHAN_GZ_COMP,
......
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
struct darshan_core_module struct darshan_core_module
{ {
darshan_module_id id; darshan_module_id id;
char name[DARSHAN_MOD_NAME_LEN+1];
struct darshan_module_funcs mod_funcs; struct darshan_module_funcs mod_funcs;
}; };
......
...@@ -44,7 +44,6 @@ struct darshan_module_funcs ...@@ -44,7 +44,6 @@ struct darshan_module_funcs
void darshan_core_register_module( void darshan_core_register_module(
darshan_module_id id, darshan_module_id id,
char *name,
struct darshan_module_funcs *funcs, struct darshan_module_funcs *funcs,
int *runtime_mem_limit); int *runtime_mem_limit);
......
...@@ -45,9 +45,11 @@ static void darshan_get_logfile_name( ...@@ -45,9 +45,11 @@ static void darshan_get_logfile_name(
char* logfile_name, int jobid, struct tm* start_tm); char* logfile_name, int jobid, struct tm* start_tm);
static void darshan_log_record_hints_and_ver( static void darshan_log_record_hints_and_ver(
struct darshan_core_runtime* job); struct darshan_core_runtime* job);
static int darshan_get_shared_record_ids( static void darshan_get_shared_record_ids(
struct darshan_core_runtime *job, darshan_record_id *shared_recs); struct darshan_core_runtime *job, darshan_record_id *shared_recs);
static int darshan_log_write_record_map( static int darshan_log_coll_open(
char *logfile_name, MPI_File *log_fh);
static int darshan_log_write_record_hash(
MPI_File log_fh, struct darshan_core_record_ref *rec_hash, MPI_File log_fh, struct darshan_core_record_ref *rec_hash,
darshan_record_id *shared_recs, struct darshan_log_map *map); darshan_record_id *shared_recs, struct darshan_log_map *map);
static int darshan_log_coll_write( static int darshan_log_coll_write(
...@@ -197,13 +199,11 @@ static void darshan_core_shutdown() ...@@ -197,13 +199,11 @@ static void darshan_core_shutdown()
int local_mod_use[DARSHAN_MAX_MODS] = {0}; int local_mod_use[DARSHAN_MAX_MODS] = {0};
int global_mod_use_count[DARSHAN_MAX_MODS] = {0}; int global_mod_use_count[DARSHAN_MAX_MODS] = {0};
darshan_record_id shared_recs[DARSHAN_CORE_MAX_RECORDS] = {0}; darshan_record_id shared_recs[DARSHAN_CORE_MAX_RECORDS] = {0};
char *hints;
double start_log_time; double start_log_time;
long offset; long offset;
struct darshan_header log_header; struct darshan_header log_header;
MPI_File log_fh; MPI_File log_fh;
MPI_Offset tmp_off; MPI_Offset tmp_off;
MPI_Info info;
MPI_Status status; MPI_Status status;
if(getenv("DARSHAN_INTERNAL_TIMING")) if(getenv("DARSHAN_INTERNAL_TIMING"))
...@@ -297,10 +297,6 @@ static void darshan_core_shutdown() ...@@ -297,10 +297,6 @@ static void darshan_core_shutdown()
final_job->log_job.end_time = last_end_time; final_job->log_job.end_time = last_end_time;
} }
/* XXX */
/* TODO: ensuing error checking...does MPI ensure collective I/O functions return the same error
* globally, or do I always need to allreduce????? */
/* set which local modules were actually used */ /* set which local modules were actually used */
for(i = 0; i < DARSHAN_MAX_MODS; i++) for(i = 0; i < DARSHAN_MAX_MODS; i++)
{ {
...@@ -312,72 +308,10 @@ static void darshan_core_shutdown() ...@@ -312,72 +308,10 @@ static void darshan_core_shutdown()
DARSHAN_MPI_CALL(PMPI_Allreduce)(local_mod_use, global_mod_use_count, DARSHAN_MAX_MODS, MPI_INT, MPI_SUM, MPI_COMM_WORLD); DARSHAN_MPI_CALL(PMPI_Allreduce)(local_mod_use, global_mod_use_count, DARSHAN_MAX_MODS, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
/* get a list of records which are shared across all processes */ /* get a list of records which are shared across all processes */
ret = darshan_get_shared_record_ids(final_job, shared_recs); darshan_get_shared_record_ids(final_job, shared_recs);
/* error out if unable to determine shared file records */
DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
MPI_LOR, MPI_COMM_WORLD);
if(all_ret != 0)
{
if(my_rank == 0)
{
fprintf(stderr, "darshan library warning: unable to determine shared file records\n");
}
free(logfile_name);
darshan_core_cleanup(final_job);
return;
}
/* check environment variable to see if the default MPI file hints have
* been overridden
*/
MPI_Info_create(&info);
hints = getenv(CP_LOG_HINTS_OVERRIDE);
if(!hints)
{
hints = __CP_LOG_HINTS;
}
if(hints && strlen(hints) > 0)
{
char *tok_str;
char *orig_tok_str;
char *key;
char *value;
char *saveptr = NULL;
tok_str = strdup(hints);
if(tok_str)
{
orig_tok_str = tok_str;
do
{
/* split string on semicolon */
key = strtok_r(tok_str, ";", &saveptr);
if(key)
{
tok_str = NULL;
/* look for = sign splitting key/value pairs */
value = index(key, '=');
if(value)
{
/* break key and value into separate null terminated strings */
value[0] = '\0';
value++;
if(strlen(key) > 0)
MPI_Info_set(info, key, value);
}
}
}while(key != NULL);
free(orig_tok_str);
}
}
/* open the darshan log file for writing */ /* collectively open the darshan log file */
ret = DARSHAN_MPI_CALL(PMPI_File_open)(MPI_COMM_WORLD, logfile_name, ret = darshan_log_coll_open(logfile_name, &log_fh);
MPI_MODE_CREATE | MPI_MODE_WRONLY | MPI_MODE_EXCL, info, &log_fh);
MPI_Info_free(&info);
/* error out if unable to open log file */ /* error out if unable to open log file */
DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT, DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
...@@ -386,12 +320,8 @@ static void darshan_core_shutdown() ...@@ -386,12 +320,8 @@ static void darshan_core_shutdown()
{ {
if(my_rank == 0) if(my_rank == 0)
{ {
int msg_len; fprintf(stderr, "darshan library warning: unable to open log file %s\n",
char msg[MPI_MAX_ERROR_STRING] = {0}; logfile_name);
MPI_Error_string(ret, msg, &msg_len);
fprintf(stderr, "darshan library warning: unable to open log file %s: %s\n",
logfile_name, msg);
unlink(logfile_name); unlink(logfile_name);
} }
free(logfile_name); free(logfile_name);
...@@ -403,26 +333,33 @@ static void darshan_core_shutdown() ...@@ -403,26 +333,33 @@ static void darshan_core_shutdown()
if(my_rank == 0) if(my_rank == 0)
{ {
/* write the job information, making sure to prealloc space for the log header */ /* write the job information, making sure to prealloc space for the log header */
ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, sizeof(struct darshan_header), all_ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, sizeof(struct darshan_header),
&final_job->log_job, sizeof(struct darshan_job), MPI_BYTE, &status); &final_job->log_job, sizeof(struct darshan_job), MPI_BYTE, &status);
if(ret != MPI_SUCCESS) if(all_ret != MPI_SUCCESS)
{ {
int msg_len; fprintf(stderr, "darshan library warning: unable to write job data to log file %s\n",
char msg[MPI_MAX_ERROR_STRING] = {0}; logfile_name);
unlink(logfile_name);
MPI_Error_string(ret, msg, &msg_len);
fprintf(stderr, "darshan library warning: unable to write job data to log file %s: %s\n",
logfile_name, msg);
} }
/* TODO */ /* TODO: after compression is added, this should be fixed */
log_header.rec_map.off = sizeof(struct darshan_header) + sizeof(struct darshan_job); log_header.rec_map.off = sizeof(struct darshan_header) + sizeof(struct darshan_job);
} }
/* write the record name->id map to the log file */ /* error out if unable to write job information */
ret = darshan_log_write_record_map(log_fh, final_job->rec_hash, DARSHAN_MPI_CALL(PMPI_Bcast)(&all_ret, 1, MPI_INT, 0, MPI_COMM_WORLD);
if(all_ret != 0)
{
free(logfile_name);
darshan_core_cleanup(final_job);
return;
}
/* write the record name->id hash to the log file */
ret = darshan_log_write_record_hash(log_fh, final_job->rec_hash,
shared_recs, &log_header.rec_map); shared_recs, &log_header.rec_map);
/* error out if unable to write record hash */
DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT, DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
MPI_LOR, MPI_COMM_WORLD); MPI_LOR, MPI_COMM_WORLD);
if(all_ret != 0) if(all_ret != 0)
...@@ -431,6 +368,7 @@ static void darshan_core_shutdown() ...@@ -431,6 +368,7 @@ static void darshan_core_shutdown()
{ {
fprintf(stderr, "darshan library warning: unable to write record map to log file %s\n", fprintf(stderr, "darshan library warning: unable to write record map to log file %s\n",
logfile_name); logfile_name);
unlink(logfile_name);
} }
free(logfile_name); free(logfile_name);
darshan_core_cleanup(final_job); darshan_core_cleanup(final_job);
...@@ -441,7 +379,8 @@ static void darshan_core_shutdown() ...@@ -441,7 +379,8 @@ static void darshan_core_shutdown()
* - get final output buffer * - get final output buffer
* - compress (zlib) provided output buffer * - compress (zlib) provided output buffer
* - append compressed buffer to log file * - append compressed buffer to log file
* - shutdown the module TODO * - add module index info (file offset/length) to log header
* - shutdown the module
*/ */
for(i = 0; i < DARSHAN_MAX_MODS; i++) for(i = 0; i < DARSHAN_MAX_MODS; i++)
{ {
...@@ -483,9 +422,22 @@ static void darshan_core_shutdown() ...@@ -483,9 +422,22 @@ static void darshan_core_shutdown()
/* write module data buffer to the darshan log file */ /* write module data buffer to the darshan log file */
ret = darshan_log_coll_write(log_fh, mod_buf, mod_buf_size, &log_header.mod_map[i]); ret = darshan_log_coll_write(log_fh, mod_buf, mod_buf_size, &log_header.mod_map[i]);
if(ret < 0)
/* error out if unable to write this module's data */
DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
MPI_LOR, MPI_COMM_WORLD);
if(all_ret != 0)
{ {
/* TODO: */ if(my_rank == 0)
{
fprintf(stderr,
"darshan library warning: unable to write %s module data to log file %s\n",
darshan_module_names[i], logfile_name);
unlink(logfile_name);
}
free(logfile_name);
darshan_core_cleanup(final_job);
return;
} }
tmp_off += log_header.mod_map[i].len; tmp_off += log_header.mod_map[i].len;
...@@ -508,20 +460,32 @@ static void darshan_core_shutdown() ...@@ -508,20 +460,32 @@ static void darshan_core_shutdown()
log_header.magic_nr = CP_MAGIC_NR; log_header.magic_nr = CP_MAGIC_NR;
log_header.comp_type = DARSHAN_GZ_COMP; log_header.comp_type = DARSHAN_GZ_COMP;
ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, 0, &log_header, all_ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, 0, &log_header,
sizeof(struct darshan_header), MPI_BYTE, &status); sizeof(struct darshan_header), MPI_BYTE, &status);
if(ret != MPI_SUCCESS) if(all_ret != MPI_SUCCESS)
{ {
/* TODO */ fprintf(stderr, "darshan library warning: unable to write header to log file %s\n",
logfile_name);
unlink(logfile_name);
} }
} }
/* error out if unable to write log header */
DARSHAN_MPI_CALL(PMPI_Bcast)(&all_ret, 1, MPI_INT, 0, MPI_COMM_WORLD);
if(all_ret != 0)
{
free(logfile_name);
darshan_core_cleanup(final_job);
return;
}
DARSHAN_MPI_CALL(PMPI_File_close)(&log_fh); DARSHAN_MPI_CALL(PMPI_File_close)(&log_fh);
/* if we got this far, there are no errors, so rename from *.darshan_partial /* if we got this far, there are no errors, so rename from *.darshan_partial
* to *-<logwritetime>.darshan.gz, which indicates that this log file is * to *-<logwritetime>.darshan.gz, which indicates that this log file is
* complete and ready for analysis * complete and ready for analysis
*/ */
/* TODO: support user given logfile path/name */
if(my_rank == 0) if(my_rank == 0)
{ {
char* tmp_index; char* tmp_index;
...@@ -758,12 +722,11 @@ static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* job) ...@@ -758,12 +722,11 @@ static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* job)
return; return;
} }
static int darshan_get_shared_record_ids(struct darshan_core_runtime *job, static void darshan_get_shared_record_ids(struct darshan_core_runtime *job,
darshan_record_id *shared_recs) darshan_record_id *shared_recs)
{ {
int i; int i;
int ndx; int ndx;
int ret;
struct darshan_core_record_ref *ref, *tmp; struct darshan_core_record_ref *ref, *tmp;
darshan_record_id id_array[DARSHAN_CORE_MAX_RECORDS] = {0}; darshan_record_id id_array[DARSHAN_CORE_MAX_RECORDS] = {0};
darshan_record_id mask_array[DARSHAN_CORE_MAX_RECORDS] = {0}; darshan_record_id mask_array[DARSHAN_CORE_MAX_RECORDS] = {0};
...@@ -780,13 +743,9 @@ static int darshan_get_shared_record_ids(struct darshan_core_runtime *job, ...@@ -780,13 +743,9 @@ static int darshan_get_shared_record_ids(struct darshan_core_runtime *job,
} }
/* broadcast root's list of records to all other processes */ /* broadcast root's list of records to all other processes */
ret = DARSHAN_MPI_CALL(PMPI_Bcast)(id_array, DARSHAN_MPI_CALL(PMPI_Bcast)(id_array,
(DARSHAN_CORE_MAX_RECORDS * sizeof(darshan_record_id)), (DARSHAN_CORE_MAX_RECORDS * sizeof(darshan_record_id)),
MPI_BYTE, 0, MPI_COMM_WORLD); MPI_BYTE, 0, MPI_COMM_WORLD);
if(ret != 0)
{
return(-1);
}
/* everyone looks to see if they opened the same records as root */ /* everyone looks to see if they opened the same records as root */
for(i=0; (i<DARSHAN_CORE_MAX_RECORDS && id_array[i] != 0); i++) for(i=0; (i<DARSHAN_CORE_MAX_RECORDS && id_array[i] != 0); i++)
...@@ -803,12 +762,8 @@ static int darshan_get_shared_record_ids(struct darshan_core_runtime *job, ...@@ -803,12 +762,8 @@ static int darshan_get_shared_record_ids(struct darshan_core_runtime *job,
} }
/* now allreduce so everyone agrees which files are shared */ /* now allreduce so everyone agrees which files are shared */
ret = DARSHAN_MPI_CALL(PMPI_Allreduce)(mask_array, all_mask_array, DARSHAN_MPI_CALL(PMPI_Allreduce)(mask_array, all_mask_array,
DARSHAN_CORE_MAX_RECORDS, MPI_INT, MPI_LAND, MPI_COMM_WORLD); DARSHAN_CORE_MAX_RECORDS, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
if(ret != 0)
{
return(-1);
}
ndx = 0; ndx = 0;
for(i=0; (i<DARSHAN_CORE_MAX_RECORDS && id_array[i] != 0); i++) for(i=0; (i<DARSHAN_CORE_MAX_RECORDS && id_array[i] != 0); i++)
...@@ -819,13 +774,74 @@ static int darshan_get_shared_record_ids(struct darshan_core_runtime *job, ...@@ -819,13 +774,74 @@ static int darshan_get_shared_record_ids(struct darshan_core_runtime *job,
} }
} }
return;
}
/*
 * Collectively open (over MPI_COMM_WORLD) the darshan log file for writing.
 *
 * MPI-IO hints are read from the environment variable named by
 * CP_LOG_HINTS_OVERRIDE if it is set, otherwise from the __CP_LOG_HINTS
 * default. The hint string is a ';'-separated list of "key=value" pairs;
 * entries without an '=' or with an empty key are ignored.
 *
 * logfile_name: path of the log file to create; the open fails if the file
 *               already exists (MPI_MODE_CREATE | MPI_MODE_EXCL)
 * log_fh:       output parameter, receives the MPI file handle on success
 *
 * returns 0 on success, -1 if the log file could not be opened
 */
static int darshan_log_coll_open(char *logfile_name, MPI_File *log_fh)
{
    char *hints;
    char *tok_str;
    char *orig_tok_str;
    char *key;
    char *value;
    char *saveptr = NULL;
    int ret;
    MPI_Info info;

    /* check environment variable to see if the default MPI file hints have
     * been overridden
     */
    MPI_Info_create(&info);

    hints = getenv(CP_LOG_HINTS_OVERRIDE);
    if(!hints)
    {
        hints = __CP_LOG_HINTS;
    }

    if(hints && strlen(hints) > 0)
    {
        /* tokenize a private copy, since strtok_r modifies its input */
        tok_str = strdup(hints);
        if(tok_str)
        {
            orig_tok_str = tok_str;
            do
            {
                /* split string on semicolon */
                key = strtok_r(tok_str, ";", &saveptr);
                if(key)
                {
                    /* subsequent strtok_r calls continue from saveptr */
                    tok_str = NULL;
                    /* look for = sign splitting key/value pairs */
                    value = strchr(key, '=');
                    if(value)
                    {
                        /* break key and value into separate null terminated strings */
                        value[0] = '\0';
                        value++;
                        if(strlen(key) > 0)
                        {
                            MPI_Info_set(info, key, value);
                        }
                    }
                }
            }while(key != NULL);
            free(orig_tok_str);
        }
    }

    /* open the darshan log file for writing */
    ret = DARSHAN_MPI_CALL(PMPI_File_open)(MPI_COMM_WORLD, logfile_name,
        MPI_MODE_CREATE | MPI_MODE_WRONLY | MPI_MODE_EXCL, info, log_fh);

    /* the info object is no longer needed once the open has returned;
     * release it on both the success and the failure path
     */
    MPI_Info_free(&info);

    /* MPI reports failure with a code other than MPI_SUCCESS; error codes
     * are not negative, so a "ret < 0" test would never detect a failed open
     */
    if(ret != MPI_SUCCESS)
    {
        return(-1);
    }

    return(0);
}
/* NOTE: the map written to file may contain duplicate id->name entries if a /* NOTE: the map written to file may contain duplicate id->name entries if a
* record is opened by multiple ranks, but not all ranks * record is opened by multiple ranks, but not all ranks
*/ */
static int darshan_log_write_record_map(MPI_File log_fh, struct darshan_core_record_ref *rec_hash, static int darshan_log_write_record_hash(MPI_File log_fh, struct darshan_core_record_ref *rec_hash,
darshan_record_id *shared_recs, struct darshan_log_map *map) darshan_record_id *shared_recs, struct darshan_log_map *map)
{ {
int i; int i;
...@@ -983,7 +999,6 @@ static int darshan_log_coll_write(MPI_File log_fh, void *buf, int count, ...@@ -983,7 +999,6 @@ static int darshan_log_coll_write(MPI_File log_fh, void *buf, int count,
void darshan_core_register_module( void darshan_core_register_module(
darshan_module_id id, darshan_module_id id,
char *name,
struct darshan_module_funcs *funcs, struct darshan_module_funcs *funcs,
int *runtime_mem_limit) int *runtime_mem_limit)
{ {
...@@ -1017,7 +1032,6 @@ void darshan_core_register_module( ...@@ -1017,7 +1032,6 @@ void darshan_core_register_module(
memset(mod, 0, sizeof(*mod)); memset(mod, 0, sizeof(*mod));
mod->id = id; mod->id = id;
strncpy(mod->name, name, DARSHAN_MOD_NAME_LEN);
mod->mod_funcs = *funcs; mod->mod_funcs = *funcs;
/* register module with darshan */ /* register module with darshan */
......
...@@ -45,8 +45,6 @@ typedef int64_t off64_t; ...@@ -45,8 +45,6 @@ typedef int64_t off64_t;
#define MAP_OR_FAIL(func) #define MAP_OR_FAIL(func)
#define POSIX_MOD_NAME "POSIX"
struct posix_runtime_file struct posix_runtime_file
{ {
struct darshan_posix_file* file_record; struct darshan_posix_file* file_record;
...@@ -245,7 +243,6 @@ static void posix_runtime_initialize() ...@@ -245,7 +243,6 @@ static void posix_runtime_initialize()
/* register the posix module with darshan core */ /* register the posix module with darshan core */
darshan_core_register_module( darshan_core_register_module(
DARSHAN_POSIX_MOD, DARSHAN_POSIX_MOD,
POSIX_MOD_NAME,
&posix_mod_fns, &posix_mod_fns,
&mem_limit); &mem_limit);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment