diff --git a/darshan-log-format.h b/darshan-log-format.h index f1d2c49fcad84aa33d9a1ff0f1edad7f7dbf69e8..6f7d63d6eafe9f759bb4affaf92263be4f23c7ad 100644 --- a/darshan-log-format.h +++ b/darshan-log-format.h @@ -30,25 +30,30 @@ /* max length of exe string within job record (not counting '\0') */ #define CP_EXE_LEN (CP_JOB_RECORD_SIZE - sizeof(struct darshan_job) - 1) -/* max length of module name string (not counting '\0') */ -/* TODO */ -#define DARSHAN_MOD_NAME_LEN 31 - typedef uint64_t darshan_record_id; /* unique identifiers to distinguish between available darshan modules */ /* NOTES: - valid ids range from [0...DARSHAN_MAX_MODS-1] * - order of ids control module shutdown order (and consequently, order in log file) */ +/* TODO: enforce maximum? */ #define DARSHAN_MAX_MODS 16 typedef enum { - DARSHAN_POSIX_MOD, + DARSHAN_POSIX_MOD = 0, DARSHAN_MPIIO_MOD, DARSHAN_HDF5_MOD, DARSHAN_PNETCDF_MOD, } darshan_module_id; +static char *darshan_module_names[] = +{ + "POSIX", + "MPI-IO", + "HDF5", + "PNETCDF" +}; + enum darshan_comp_type { DARSHAN_GZ_COMP, diff --git a/darshan-runtime/darshan-core.h b/darshan-runtime/darshan-core.h index 835bb4232d37bad2f1560e528e361f80b352f3d5..91f68b515a9bf6e6551d5c5416829f6a4d70a83c 100644 --- a/darshan-runtime/darshan-core.h +++ b/darshan-runtime/darshan-core.h @@ -18,7 +18,6 @@ struct darshan_core_module { darshan_module_id id; - char name[DARSHAN_MOD_NAME_LEN+1]; struct darshan_module_funcs mod_funcs; }; diff --git a/darshan-runtime/darshan.h b/darshan-runtime/darshan.h index 87c68371748d93ab082968a677e43fd1f74658e5..7b57874c2748e7a70b82803d5d373e79d814adf5 100644 --- a/darshan-runtime/darshan.h +++ b/darshan-runtime/darshan.h @@ -44,7 +44,6 @@ struct darshan_module_funcs void darshan_core_register_module( darshan_module_id id, - char *name, struct darshan_module_funcs *funcs, int *runtime_mem_limit); diff --git a/darshan-runtime/lib/darshan-core.c b/darshan-runtime/lib/darshan-core.c index f33fded4ce7973e8af19db2eab3b09fe505a83df..ef03e4bfb8468d436db5301821fe263639679966 100644 --- a/darshan-runtime/lib/darshan-core.c +++ b/darshan-runtime/lib/darshan-core.c @@ -45,9 +45,11 @@ static void darshan_get_logfile_name( char* logfile_name, int jobid, struct tm* start_tm); static void darshan_log_record_hints_and_ver( struct darshan_core_runtime* job); -static int darshan_get_shared_record_ids( +static void darshan_get_shared_record_ids( struct darshan_core_runtime *job, darshan_record_id *shared_recs); -static int darshan_log_write_record_map( +static int darshan_log_coll_open( + char *logfile_name, MPI_File *log_fh); +static int darshan_log_write_record_hash( MPI_File log_fh, struct darshan_core_record_ref *rec_hash, darshan_record_id *shared_recs, struct darshan_log_map *map); static int darshan_log_coll_write( @@ -197,13 +199,11 @@ static void darshan_core_shutdown() int local_mod_use[DARSHAN_MAX_MODS] = {0}; int global_mod_use_count[DARSHAN_MAX_MODS] = {0}; darshan_record_id shared_recs[DARSHAN_CORE_MAX_RECORDS] = {0}; - char *hints; double start_log_time; long offset; struct darshan_header log_header; MPI_File log_fh; MPI_Offset tmp_off; - MPI_Info info; MPI_Status status; if(getenv("DARSHAN_INTERNAL_TIMING")) @@ -297,10 +297,6 @@ static void darshan_core_shutdown() final_job->log_job.end_time = last_end_time; } - /* XXX */ - /* TODO: ensuing error checking...does MPI ensure collective I/O functions return the same error - * globally, or do I always need to allreduce????? */ - /* set which local modules were actually used */ for(i = 0; i < DARSHAN_MAX_MODS; i++) { @@ -312,72 +308,10 @@ static void darshan_core_shutdown() DARSHAN_MPI_CALL(PMPI_Allreduce)(local_mod_use, global_mod_use_count, DARSHAN_MAX_MODS, MPI_INT, MPI_SUM, MPI_COMM_WORLD); /* get a list of records which are shared across all processes */ - ret = darshan_get_shared_record_ids(final_job, shared_recs); - - /* error out if unable to determine shared file records */ - DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT, - MPI_LOR, MPI_COMM_WORLD); - if(all_ret != 0) - { - if(my_rank == 0) - { - fprintf(stderr, "darshan library warning: unable to determine shared file records\n"); - } - free(logfile_name); - darshan_core_cleanup(final_job); - return; - } - - /* check environment variable to see if the default MPI file hints have - * been overridden - */ - MPI_Info_create(&info); - - hints = getenv(CP_LOG_HINTS_OVERRIDE); - if(!hints) - { - hints = __CP_LOG_HINTS; - } - - if(hints && strlen(hints) > 0) - { - char *tok_str; - char *orig_tok_str; - char *key; - char *value; - char *saveptr = NULL; - - tok_str = strdup(hints); - if(tok_str) - { - orig_tok_str = tok_str; - do - { - /* split string on semicolon */ - key = strtok_r(tok_str, ";", &saveptr); - if(key) - { - tok_str = NULL; - /* look for = sign splitting key/value pairs */ - value = index(key, '='); - if(value) - { - /* break key and value into separate null terminated strings */ - value[0] = '\0'; - value++; - if(strlen(key) > 0) - MPI_Info_set(info, key, value); - } - } - }while(key != NULL); - free(orig_tok_str); - } - } + darshan_get_shared_record_ids(final_job, shared_recs); - /* open the darshan log file for writing */ - ret = DARSHAN_MPI_CALL(PMPI_File_open)(MPI_COMM_WORLD, logfile_name, - MPI_MODE_CREATE | MPI_MODE_WRONLY | MPI_MODE_EXCL, info, &log_fh); - MPI_Info_free(&info); + /* collectively open the darshan log file */ + ret = darshan_log_coll_open(logfile_name, &log_fh); /* error out if unable to open log file */ DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT, @@ -386,12 +320,8 @@ static void darshan_core_shutdown() { if(my_rank == 0) { - int msg_len; - char msg[MPI_MAX_ERROR_STRING] = {0}; - - MPI_Error_string(ret, msg, &msg_len); - fprintf(stderr, "darshan library warning: unable to open log file %s: %s\n", - logfile_name, msg); + fprintf(stderr, "darshan library warning: unable to open log file %s\n", + logfile_name); unlink(logfile_name); } free(logfile_name); @@ -403,26 +333,33 @@ static void darshan_core_shutdown() if(my_rank == 0) { /* write the job information, making sure to prealloc space for the log header */ - ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, sizeof(struct darshan_header), + all_ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, sizeof(struct darshan_header), &final_job->log_job, sizeof(struct darshan_job), MPI_BYTE, &status); - if(ret != MPI_SUCCESS) + if(all_ret != MPI_SUCCESS) { - int msg_len; - char msg[MPI_MAX_ERROR_STRING] = {0}; - - MPI_Error_string(ret, msg, &msg_len); - fprintf(stderr, "darshan library warning: unable to write job data to log file %s: %s\n", - logfile_name, msg); + fprintf(stderr, "darshan library warning: unable to write job data to log file %s\n", + logfile_name); + unlink(logfile_name); } - /* TODO */ + /* TODO: after compression is added, this should be fixed */ log_header.rec_map.off = sizeof(struct darshan_header) + sizeof(struct darshan_job); } - /* write the record name->id map to the log file */ - ret = darshan_log_write_record_map(log_fh, final_job->rec_hash, + /* error out if unable to write job information */ + DARSHAN_MPI_CALL(PMPI_Bcast)(&all_ret, 1, MPI_INT, 0, MPI_COMM_WORLD); + if(all_ret != 0) + { + free(logfile_name); + darshan_core_cleanup(final_job); + return; + } + + /* write the record name->id hash to the log file */ + ret = darshan_log_write_record_hash(log_fh, final_job->rec_hash, shared_recs, &log_header.rec_map); + /* error out if unable to write record hash */ DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD); if(all_ret != 0) @@ -431,6 +368,7 @@ static void darshan_core_shutdown() { fprintf(stderr, "darshan library warning: unable to write record map to log file %s\n", logfile_name); + unlink(logfile_name); } free(logfile_name); darshan_core_cleanup(final_job); @@ -441,7 +379,8 @@ static void darshan_core_shutdown() * - get final output buffer * - compress (zlib) provided output buffer * - append compressed buffer to log file - * - shutdown the module TODO + * - add module index info (file offset/length) to log header + * - shutdown the module */ for(i = 0; i < DARSHAN_MAX_MODS; i++) { @@ -483,9 +422,22 @@ static void darshan_core_shutdown() /* write module data buffer to the darshan log file */ ret = darshan_log_coll_write(log_fh, mod_buf, mod_buf_size, &log_header.mod_map[i]); - if(ret < 0) + + /* error out if unable to write this module's data */ + DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT, + MPI_LOR, MPI_COMM_WORLD); + if(all_ret != 0) { - /* TODO: */ + if(my_rank == 0) + { + fprintf(stderr, + "darshan library warning: unable to write %s module data to log file %s\n", + darshan_module_names[i], logfile_name); + unlink(logfile_name); + } + free(logfile_name); + darshan_core_cleanup(final_job); + return; } tmp_off += log_header.mod_map[i].len; @@ -508,20 +460,32 @@ static void darshan_core_shutdown() log_header.magic_nr = CP_MAGIC_NR; log_header.comp_type = DARSHAN_GZ_COMP; - ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, 0, &log_header, + all_ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, 0, &log_header, sizeof(struct darshan_header), MPI_BYTE, &status); - if(ret != MPI_SUCCESS) + if(all_ret != MPI_SUCCESS) { - /* TODO */ + fprintf(stderr, "darshan library warning: unable to write header to log file %s\n", + logfile_name); + unlink(logfile_name); } } + /* error out if unable to write log header */ + DARSHAN_MPI_CALL(PMPI_Bcast)(&all_ret, 1, MPI_INT, 0, MPI_COMM_WORLD); + if(all_ret != 0) + { + free(logfile_name); + darshan_core_cleanup(final_job); + return; + } + DARSHAN_MPI_CALL(PMPI_File_close)(&log_fh); /* if we got this far, there are no errors, so rename from *.darshan_partial * to *-.darshan.gz, which indicates that this log file is * complete and ready for analysis */ + /* TODO: support user given logfile path/name */ if(my_rank == 0) { char* tmp_index; @@ -758,12 +722,11 @@ static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* job) return; } -static int darshan_get_shared_record_ids(struct darshan_core_runtime *job, +static void darshan_get_shared_record_ids(struct darshan_core_runtime *job, darshan_record_id *shared_recs) { int i; int ndx; - int ret; struct darshan_core_record_ref *ref, *tmp; darshan_record_id id_array[DARSHAN_CORE_MAX_RECORDS] = {0}; darshan_record_id mask_array[DARSHAN_CORE_MAX_RECORDS] = {0}; @@ -780,13 +743,9 @@ static int darshan_get_shared_record_ids(struct darshan_core_runtime *job, } /* broadcast root's list of records to all other processes */ - ret = DARSHAN_MPI_CALL(PMPI_Bcast)(id_array, + DARSHAN_MPI_CALL(PMPI_Bcast)(id_array, (DARSHAN_CORE_MAX_RECORDS * sizeof(darshan_record_id)), MPI_BYTE, 0, MPI_COMM_WORLD); - if(ret != 0) - { - return(-1); - } /* everyone looks to see if they opened the same records as root */ for(i=0; (i 0) + { + tok_str = strdup(hints); + if(tok_str) + { + orig_tok_str = tok_str; + do + { + /* split string on semicolon */ + key = strtok_r(tok_str, ";", &saveptr); + if(key) + { + tok_str = NULL; + /* look for = sign splitting key/value pairs */ + value = index(key, '='); + if(value) + { + /* break key and value into separate null terminated strings */ + value[0] = '\0'; + value++; + if(strlen(key) > 0) + MPI_Info_set(info, key, value); + } + } + }while(key != NULL); + free(orig_tok_str); + } + } + + /* open the darshan log file for writing */ + ret = DARSHAN_MPI_CALL(PMPI_File_open)(MPI_COMM_WORLD, logfile_name, + MPI_MODE_CREATE | MPI_MODE_WRONLY | MPI_MODE_EXCL, info, log_fh); + if(ret < 0) + return(-1); + + MPI_Info_free(&info); return(0); } /* NOTE: the map written to file may contain duplicate id->name entries if a * record is opened by multiple ranks, but not all ranks */ -static int darshan_log_write_record_map(MPI_File log_fh, struct darshan_core_record_ref *rec_hash, +static int darshan_log_write_record_hash(MPI_File log_fh, struct darshan_core_record_ref *rec_hash, darshan_record_id *shared_recs, struct darshan_log_map *map) { int i; @@ -983,7 +999,6 @@ static int darshan_log_coll_write(MPI_File log_fh, void *buf, int count, void darshan_core_register_module( darshan_module_id id, - char *name, struct darshan_module_funcs *funcs, int *runtime_mem_limit) { @@ -1017,7 +1032,6 @@ void darshan_core_register_module( memset(mod, 0, sizeof(*mod)); mod->id = id; - strncpy(mod->name, name, DARSHAN_MOD_NAME_LEN); mod->mod_funcs = *funcs; /* register module with darshan */ diff --git a/darshan-runtime/lib/darshan-posix.c b/darshan-runtime/lib/darshan-posix.c index 269b8ab7d7dfa8c71cf5b3c901943375983b3622..1578b929d0a8efb3a51f0aacd382b6b2ec257fa3 100644 --- a/darshan-runtime/lib/darshan-posix.c +++ b/darshan-runtime/lib/darshan-posix.c @@ -45,8 +45,6 @@ typedef int64_t off64_t; #define MAP_OR_FAIL(func) -#define POSIX_MOD_NAME "POSIX" - struct posix_runtime_file { struct darshan_posix_file* file_record; @@ -245,7 +243,6 @@ static void posix_runtime_initialize() /* register the posix module with darshan core */ darshan_core_register_module( DARSHAN_POSIX_MOD, - POSIX_MOD_NAME, &posix_mod_fns, &mem_limit);