diff --git a/darshan-log-format.h b/darshan-log-format.h index c556b4e7eeee9b05b5eacdda29273add97ff24e8..3432be8af9a91d3b172565cfeda63e86b787fe08 100644 --- a/darshan-log-format.h +++ b/darshan-log-format.h @@ -111,6 +111,7 @@ struct darshan_base_record #include "darshan-hdf5-log-format.h" #include "darshan-pnetcdf-log-format.h" #include "darshan-bgq-log-format.h" +#include "darshan-lustre-log-format.h" /* X-macro for keeping module ordering consistent */ /* NOTE: first val used to define module enum values, @@ -128,7 +129,8 @@ struct darshan_base_record X(DARSHAN_MPIIO_MOD, "MPI-IO", DARSHAN_MPIIO_VER, &mpiio_logutils) \ X(DARSHAN_HDF5_MOD, "HDF5", DARSHAN_HDF5_VER, &hdf5_logutils) \ X(DARSHAN_PNETCDF_MOD, "PNETCDF", DARSHAN_PNETCDF_VER, &pnetcdf_logutils) \ - X(DARSHAN_BGQ_MOD, "BG/Q", DARSHAN_BGQ_VER, &bgq_logutils) + X(DARSHAN_BGQ_MOD, "BG/Q", DARSHAN_BGQ_VER, &bgq_logutils) \ + X(DARSHAN_LUSTRE_MOD, "LUSTRE", DARSHAN_LUSTRE_VER, &lustre_logutils) /* unique identifiers to distinguish between available darshan modules */ /* NOTES: - valid ids range from [0...DARSHAN_MAX_MODS-1] diff --git a/darshan-lustre-log-format.h b/darshan-lustre-log-format.h new file mode 100644 index 0000000000000000000000000000000000000000..3c3f2cf789b362633ccc6e1dfe1eaa2ec83df1f1 --- /dev/null +++ b/darshan-lustre-log-format.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2015 University of Chicago. + * See COPYRIGHT notice in top-level directory. + * + */ + +#ifndef __DARSHAN_LUSTRE_LOG_FORMAT_H +#define __DARSHAN_LUSTRE_LOG_FORMAT_H + +/* NOTE -- redefining the size of OST_ID will require changing the DARSHAN_BSWAP + * macro used in darshan-util/darshan-lustre-logutils.c as well + */ +typedef int64_t OST_ID; + +/* current Lustre log format version */ +#define DARSHAN_LUSTRE_VER 1 + +#define LUSTRE_COUNTERS \ + /* number of OSTs for file system */\ + X(LUSTRE_OSTS) \ + /* number of MDTs for file system */\ + X(LUSTRE_MDTS) \ + /* index of first OST for file */\ + X(LUSTRE_STRIPE_OFFSET) \ + /* bytes per stripe for file */\ + X(LUSTRE_STRIPE_SIZE) \ + /* number of stripes (OSTs) for file */\ + X(LUSTRE_STRIPE_WIDTH) \ + /* end of counters */\ + X(LUSTRE_NUM_INDICES) + +#define X(a) a, +/* integer statistics for Lustre file records */ +enum darshan_lustre_indices +{ + LUSTRE_COUNTERS +}; +#undef X + +/* record structure for the Lustre module. a record is created and stored for + * every file opened that belongs to a Lustre file system. This record includes: + * - a corresponding record identifier (created by hashing the file path) + * - the rank of the process which opened the file (-1 for shared files) + * - integer file I/O statistics (stripe size, width, # of OSTs, etc.) + */ +struct darshan_lustre_record +{ + darshan_record_id rec_id; + int64_t rank; + int64_t counters[LUSTRE_NUM_INDICES]; + OST_ID ost_ids[1]; +}; + +/* + * helper function to calculate the size of a record + */ +#define LUSTRE_RECORD_SIZE( osts ) ( sizeof(struct darshan_lustre_record) + sizeof(OST_ID) * (osts - 1) ) + +#endif /* __DARSHAN_LUSTRE_LOG_FORMAT_H */ diff --git a/darshan-runtime/Makefile.in b/darshan-runtime/Makefile.in index 17b34f2377183b6a288ef57c127823276f6b4e88..ff6fcff90693f5e06f347329449f5fa66f3115fa 100644 --- a/darshan-runtime/Makefile.in +++ b/darshan-runtime/Makefile.in @@ -37,6 +37,10 @@ LIBS = -lz @LIBBZ2@ DARSHAN_STATIC_MOD_OBJS = lib/darshan-posix.o lib/darshan-mpiio.o lib/darshan-hdf5.o lib/darshan-pnetcdf.o DARSHAN_DYNAMIC_MOD_OBJS = lib/darshan-posix.po lib/darshan-mpiio.po lib/darshan-hdf5.po lib/darshan-pnetcdf.po +# TODO: make the lustre module enabled using config options +DARSHAN_STATIC_MOD_OBJS += lib/darshan-lustre.o +DARSHAN_DYNAMIC_MOD_OBJS += lib/darshan-lustre.po + ifdef DARSHAN_USE_BGQ DARSHAN_STATIC_MOD_OBJS += lib/darshan-bgq.o DARSHAN_DYNAMIC_MOD_OBJS += lib/darshan-bgq.po @@ -50,63 +54,69 @@ lib:: lib/darshan-core-init-finalize.o: lib/darshan-core-init-finalize.c darshan.h darshan-core.h $(DARSHAN_LOG_FORMAT) | lib $(CC) $(CFLAGS) -c $< -o $@ -lib/darshan-core-init-finalize.po: lib/darshan-core-init-finalize.c darshan.h darshan-core.h $(DARSHAN_LOG_FORMAT) | lib +lib/darshan-core-init-finalize.po: lib/darshan-core-init-finalize.c darshan.h darshan-dynamic.h darshan-core.h $(DARSHAN_LOG_FORMAT) | lib $(CC) $(CFLAGS_SHARED) -c $< -o $@ lib/darshan-core.o: lib/darshan-core.c darshan.h darshan-core.h $(DARSHAN_LOG_FORMAT) | lib $(CC) $(CFLAGS) -c $< -o $@ -lib/darshan-core.po: lib/darshan-core.c darshan.h darshan-core.h $(DARSHAN_LOG_FORMAT) | lib +lib/darshan-core.po: lib/darshan-core.c darshan.h darshan-dynamic.h darshan-core.h $(DARSHAN_LOG_FORMAT) | lib $(CC) $(CFLAGS_SHARED) -c $< -o $@ -lib/darshan-common.o: lib/darshan-common.c darshan.h $(DARSHAN_LOG_FORMAT) | lib +lib/darshan-common.o: lib/darshan-common.c darshan.h darshan-common.h $(DARSHAN_LOG_FORMAT) | lib $(CC) $(CFLAGS) -c $< -o $@ -lib/darshan-common.po: lib/darshan-common.c darshan.h $(DARSHAN_LOG_FORMAT) | lib +lib/darshan-common.po: lib/darshan-common.c darshan.h darshan-common.h $(DARSHAN_LOG_FORMAT) | lib $(CC) $(CFLAGS_SHARED) -c $< -o $@ -lib/darshan-null.o: lib/darshan-null.c darshan.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-null-log-format.h | lib +lib/darshan-null.o: lib/darshan-null.c darshan.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-null-log-format.h | lib $(CC) $(CFLAGS) -c $< -o $@ -lib/darshan-null.po: lib/darshan-null.c darshan.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-null-log-format.h | lib +lib/darshan-null.po: lib/darshan-null.c darshan.h darshan-dynamic.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-null-log-format.h | lib $(CC) $(CFLAGS_SHARED) -c $< -o $@ -lib/darshan-posix.o: lib/darshan-posix.c darshan.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-posix-log-format.h | lib +lib/darshan-posix.o: lib/darshan-posix.c darshan.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-posix-log-format.h | lib $(CC) $(CFLAGS) -c $< -o $@ -lib/darshan-posix.po: lib/darshan-posix.c darshan.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-posix-log-format.h | lib +lib/darshan-posix.po: lib/darshan-posix.c darshan.h darshan-dynamic.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-posix-log-format.h | lib $(CC) $(CFLAGS_SHARED) -c $< -o $@ -lib/darshan-mpiio.o: lib/darshan-mpiio.c darshan.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-mpiio-log-format.h | lib +lib/darshan-mpiio.o: lib/darshan-mpiio.c darshan.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-mpiio-log-format.h | lib $(CC) $(CFLAGS) -c $< -o $@ -lib/darshan-mpiio.po: lib/darshan-mpiio.c darshan.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-mpiio-log-format.h | lib +lib/darshan-mpiio.po: lib/darshan-mpiio.c darshan.h darshan-dynamic.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-mpiio-log-format.h | lib $(CC) $(CFLAGS_SHARED) -c $< -o $@ -lib/darshan-bgq.o: lib/darshan-bgq.c darshan.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-mpiio-log-format.h | lib +lib/darshan-bgq.o: lib/darshan-bgq.c darshan.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-mpiio-log-format.h | lib $(CC) $(CFLAGS) -c $< -o $@ -lib/darshan-bgq.po: lib/darshan-bgq.c darshan.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-mpiio-log-format.h | lib +lib/darshan-bgq.po: lib/darshan-bgq.c darshan.h darshan-dynamic.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-mpiio-log-format.h | lib $(CC) $(CFLAGS_SHARED) -c $< -o $@ -lib/darshan-hdf5.o: lib/darshan-hdf5.c darshan.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-hdf5-log-format.h | lib +lib/darshan-hdf5.o: lib/darshan-hdf5.c darshan.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-hdf5-log-format.h | lib $(CC) $(CFLAGS) -c $< -o $@ -lib/darshan-hdf5.po: lib/darshan-hdf5.c darshan.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-hdf5-log-format.h | lib +lib/darshan-hdf5.po: lib/darshan-hdf5.c darshan.h darshan-dynamic.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-hdf5-log-format.h | lib $(CC) $(CFLAGS_SHARED) -c $< -o $@ lib/darshan-hdf5-stubs.o: lib/darshan-hdf5-stubs.c darshan.h $(DARSHAN_LOG_FORMAT) | lib $(CC) $(CFLAGS) -c $< -o $@ -lib/darshan-pnetcdf.o: lib/darshan-pnetcdf.c darshan.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-pnetcdf-log-format.h | lib +lib/darshan-pnetcdf.o: lib/darshan-pnetcdf.c darshan.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-pnetcdf-log-format.h | lib $(CC) $(CFLAGS) -c $< -o $@ -lib/darshan-pnetcdf.po: lib/darshan-pnetcdf.c darshan.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-pnetcdf-log-format.h | lib +lib/darshan-pnetcdf.po: lib/darshan-pnetcdf.c darshan.h darshan-dynamic.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-pnetcdf-log-format.h | lib $(CC) $(CFLAGS_SHARED) -c $< -o $@ lib/darshan-pnetcdf-stubs.o: lib/darshan-pnetcdf-stubs.c darshan.h $(DARSHAN_LOG_FORMAT) | lib $(CC) $(CFLAGS) -c $< -o $@ +lib/darshan-lustre.o: lib/darshan-lustre.c darshan.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-lustre-log-format.h | lib + $(CC) $(CFLAGS) -c $< -o $@ + +lib/darshan-lustre.po: lib/darshan-lustre.c darshan.h darshan-dynamic.h darshan-common.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-lustre-log-format.h | lib + $(CC) $(CFLAGS_SHARED) -c $< -o $@ + lib/lookup3.o: lib/lookup3.c $(CC) $(CFLAGS) -c $< -o $@ diff --git a/darshan-runtime/darshan-lustre.h b/darshan-runtime/darshan-lustre.h new file mode 100644 index 0000000000000000000000000000000000000000..d84cbf349cb49152b6fd805367cba20656c23708 --- /dev/null +++ b/darshan-runtime/darshan-lustre.h @@ -0,0 +1,19 @@ +struct lustre_record_runtime +{ + struct darshan_lustre_record *record; + size_t record_size; + UT_hash_handle hlink; +}; + +struct lustre_runtime +{ + int record_count; /* number of records defined */ + size_t record_buffer_max; /* size of the allocated buffer pointed to by record_buffer */ + size_t record_buffer_used; /* size of the allocated buffer actually used */ + void *next_free_record; /* pointer to end of record_buffer */ + void *record_buffer; /* buffer in which records are created */ + struct lustre_record_runtime *record_runtime_array; + struct lustre_record_runtime *record_runtime_hash; +}; + + diff --git a/darshan-runtime/darshan.h b/darshan-runtime/darshan.h index b58c17d868af950dd6e3a03c64f10395ae992869..674e5fa7635bb6cd9b77c92e9776f15ac112b7cc 100644 --- a/darshan-runtime/darshan.h +++ b/darshan-runtime/darshan.h @@ -73,6 +73,15 @@ typedef void (*darshan_module_shutdown)( int *mod_buf_sz /* output parameter to save module buffer size */ ); +/* stores FS info from statfs calls for a given mount point */ +struct darshan_fs_info +{ + int fs_type; + int block_size; + int ost_count; + int mdt_count; +}; + /***************************************************** * darshan-core functions exported to darshan modules * *****************************************************/ @@ -122,17 +131,18 @@ darshan_record_id darshan_core_gen_record_id( * Darshan record (e.g., the full file path), which for now is just a * string. 'mod_id' is the identifier of the calling module. 'rec_len' * is the size of the record being registered with Darshan. If given, - * 'file_alignment' is a pointer to an integer which on return will - * contain the corresponding file system alignment of the file system - * path 'name' resides on. Returns a pointer to the address the record - * should be written to on success, NULL on failure. + * 'fs_info' is a pointer to a structure containing information on + * the underlying FS this record is associated with (determined by + * matching the file name prefix with Darshan's list of tracked mount + * points). Returns a pointer to the address the record should be + * written to on success, NULL on failure. */ void *darshan_core_register_record( darshan_record_id rec_id, const char *name, darshan_module_id mod_id, int rec_len, - int *file_alignment); + struct darshan_fs_info *fs_info); /* darshan_core_wtime() * diff --git a/darshan-runtime/lib/darshan-core-init-finalize.c b/darshan-runtime/lib/darshan-core-init-finalize.c index b3dcc47c51db2412d9f3c593f73071bdf968386d..26ef82f01583fd445ecdcf4e92872db51c0b902f 100644 --- a/darshan-runtime/lib/darshan-core-init-finalize.c +++ b/darshan-runtime/lib/darshan-core-init-finalize.c @@ -15,6 +15,7 @@ #include "darshan.h" #include "darshan-core.h" +#include "darshan-dynamic.h" #ifdef DARSHAN_PRELOAD diff --git a/darshan-runtime/lib/darshan-core.c b/darshan-runtime/lib/darshan-core.c index 657a1866645399b117ae7a7147539e8cb36a8138..38cf28c6fd43a1ea6026031d49b00e30e1e2a476 100644 --- a/darshan-runtime/lib/darshan-core.c +++ b/darshan-runtime/lib/darshan-core.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -32,6 +34,9 @@ #include "darshan-core.h" #include "darshan-dynamic.h" +/* XXX stick this into autoconf .h */ +#include + extern char* __progname; extern char* __progname_full; @@ -83,9 +88,9 @@ void (*mod_static_init_fns[])(void) = #define DARSHAN_MAX_MNT_TYPE 32 struct mnt_data { - int block_size; char path[DARSHAN_MAX_MNT_PATH]; char type[DARSHAN_MAX_MNT_TYPE]; + struct darshan_fs_info fs_info; }; static struct mnt_data mnt_data_array[DARSHAN_MAX_MNTS]; static int mnt_data_count = 0; @@ -99,11 +104,11 @@ static void darshan_log_record_hints_and_ver( struct darshan_core_runtime* core); static void darshan_get_exe_and_mounts( struct darshan_core_runtime *core, int argc, char **argv); +static void darshan_fs_info_from_path( + const char *path, struct darshan_fs_info *fs_info); static int darshan_add_name_record_ref( struct darshan_core_runtime *core, darshan_record_id rec_id, const char *name, darshan_module_id mod_id); -static int darshan_block_size_from_path( - const char *path); static void darshan_get_user_name( char *user); static void darshan_get_logfile_name( @@ -919,12 +924,42 @@ static void add_entry(char* buf, int* space_left, struct mntent* entry) #define LL_SUPER_MAGIC 0x0BD00BD0 #endif ret = statfs(entry->mnt_dir, &statfsbuf); + mnt_data_array[mnt_data_count].fs_info.fs_type = statfsbuf.f_type; if(ret == 0 && statfsbuf.f_type != LL_SUPER_MAGIC) - mnt_data_array[mnt_data_count].block_size = statfsbuf.f_bsize; + mnt_data_array[mnt_data_count].fs_info.block_size = statfsbuf.f_bsize; else if(ret == 0 && statfsbuf.f_type == LL_SUPER_MAGIC) - mnt_data_array[mnt_data_count].block_size = 1024*1024; + mnt_data_array[mnt_data_count].fs_info.block_size = 1024*1024; else - mnt_data_array[mnt_data_count].block_size = 4096; + mnt_data_array[mnt_data_count].fs_info.block_size = 4096; + + /* XXX */ + /* attempt to retrieve OST and MDS counts from Lustre */ + mnt_data_array[mnt_data_count].fs_info.ost_count = -1; + mnt_data_array[mnt_data_count].fs_info.mdt_count = -1; + if ( statfsbuf.f_type == LL_SUPER_MAGIC ) + { + int n_ost, n_mdt; + int ret_ost, ret_mdt; + DIR *mount_dir; + + mount_dir = opendir( entry->mnt_dir ); + if ( mount_dir ) + { + /* n_ost and n_mdt are used for both input and output to ioctl */ + n_ost = 0; + n_mdt = 1; + + ret_ost = ioctl( dirfd(mount_dir), LL_IOC_GETOBDCOUNT, &n_ost ); + ret_mdt = ioctl( dirfd(mount_dir), LL_IOC_GETOBDCOUNT, &n_mdt ); + + if ( !(ret_ost < 0 || ret_mdt < 0) ) + { + mnt_data_array[mnt_data_count].fs_info.ost_count = n_ost; + mnt_data_array[mnt_data_count].fs_info.mdt_count = n_mdt; + } + closedir( mount_dir ); + } + } /* store mount information with the job-level metadata in darshan log */ ret = snprintf(tmp_mnt, 256, "\n%s\t%s", @@ -1059,21 +1094,55 @@ static void darshan_get_exe_and_mounts(struct darshan_core_runtime *core, return; } -static int darshan_block_size_from_path(const char *path) +static void darshan_fs_info_from_path(const char *path, struct darshan_fs_info *fs_info) { int i; - int block_size = -1; + fs_info->fs_type = -1; + fs_info->block_size = -1; for(i=0; iname_mem_used) > DARSHAN_NAME_RECORD_BUF_SIZE) + return(0); + + ref = malloc(sizeof(*ref)); + if(!ref) + return(0); + memset(ref, 0, sizeof(*ref)); + + /* initialize the name record */ + ref->name_record = (struct darshan_name_record *) + ((char *)core->log_name_p + core->name_mem_used); + memset(ref->name_record, 0, record_size); + ref->name_record->id = rec_id; + strcpy(ref->name_record->name, name); + DARSHAN_MOD_FLAG_SET(ref->mod_flags, mod_id); + + /* add the record to the hash table */ + HASH_ADD(hlink, core->name_hash, name_record->id, + sizeof(darshan_record_id), ref); + core->name_mem_used += record_size; +#ifdef __DARSHAN_ENABLE_MMAP_LOGS + core->log_hdr_p->name_map.len += record_size; +#endif + + return(1); } static void darshan_get_user_name(char *cuser) @@ -1239,39 +1308,6 @@ static void darshan_get_logfile_name(char* logfile_name, int jobid, struct tm* s return; } -static int darshan_add_name_record_ref(struct darshan_core_runtime *core, - darshan_record_id rec_id, const char *name, darshan_module_id mod_id) -{ - struct darshan_core_name_record_ref *ref; - int record_size = sizeof(darshan_record_id) + strlen(name) + 1; - - if((record_size + core->name_mem_used) > DARSHAN_NAME_RECORD_BUF_SIZE) - return(0); - - ref = malloc(sizeof(*ref)); - if(!ref) - return(0); - memset(ref, 0, sizeof(*ref)); - - /* initialize the name record */ - ref->name_record = (struct darshan_name_record *) - ((char *)core->log_name_p + core->name_mem_used); - memset(ref->name_record, 0, record_size); - ref->name_record->id = rec_id; - strcpy(ref->name_record->name, name); - DARSHAN_MOD_FLAG_SET(ref->mod_flags, mod_id); - - /* add the record to the hash table */ - HASH_ADD(hlink, core->name_hash, name_record->id, - sizeof(darshan_record_id), ref); - core->name_mem_used += record_size; -#ifdef __DARSHAN_ENABLE_MMAP_LOGS - core->log_hdr_p->name_map.len += record_size; -#endif - - return(1); -} - static void darshan_get_shared_records(struct darshan_core_runtime *core, darshan_record_id **shared_recs, int *shared_rec_cnt) { @@ -1889,7 +1925,7 @@ void *darshan_core_register_record( const char *name, darshan_module_id mod_id, int rec_len, - int *file_alignment) + struct darshan_fs_info *fs_info) { struct darshan_core_name_record_ref *ref; void *rec_buf; @@ -1942,8 +1978,8 @@ void *darshan_core_register_record( #endif DARSHAN_CORE_UNLOCK(); - if(file_alignment) - *file_alignment = darshan_block_size_from_path(name); + if(fs_info) + darshan_fs_info_from_path(name, fs_info); return(rec_buf);; } diff --git a/darshan-runtime/lib/darshan-hdf5.c b/darshan-runtime/lib/darshan-hdf5.c index 6946151d38843fec85e865e5b7f0cb81b778e0ea..ae9bed26f5adbc12498c88a09a69bd18ac277bfc 100644 --- a/darshan-runtime/lib/darshan-hdf5.c +++ b/darshan-runtime/lib/darshan-hdf5.c @@ -4,6 +4,9 @@ * */ +#define _XOPEN_SOURCE 500 +#define _GNU_SOURCE + #include "darshan-runtime-config.h" #include #include @@ -16,7 +19,6 @@ #include #include #include -#define __USE_GNU #include #include "darshan.h" diff --git a/darshan-runtime/lib/darshan-lustre.c b/darshan-runtime/lib/darshan-lustre.c new file mode 100644 index 0000000000000000000000000000000000000000..c02c81742ac0822f5364e0a070da7a1702929032 --- /dev/null +++ b/darshan-runtime/lib/darshan-lustre.c @@ -0,0 +1,515 @@ +/* + * Copyright (C) 2015 University of Chicago. + * See COPYRIGHT notice in top-level directory. + * + */ + +#define _XOPEN_SOURCE 500 +#define _GNU_SOURCE + +#include "darshan-runtime-config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* XXX stick this into autoconf .h */ +#include + +#include "uthash.h" + +#include "darshan.h" +#include "darshan-dynamic.h" +#include "darshan-lustre.h" + +struct lustre_runtime *lustre_runtime = NULL; +static pthread_mutex_t lustre_runtime_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; +static int instrumentation_disabled = 0; +static int my_rank = -1; + +static void lustre_runtime_initialize(void); + +static void lustre_begin_shutdown(void); +static void lustre_get_output_data(MPI_Comm mod_comm, darshan_record_id *shared_recs, + int shared_rec_count, void **lustre_buf, int *lustre_buf_sz); +static void lustre_shutdown(void); +static int lustre_record_compare(const void* a_p, const void* b_p); +static void lustre_record_reduction_op(void* infile_v, void* inoutfile_v, + int *len, MPI_Datatype *datatype); + +#define LUSTRE_LOCK() pthread_mutex_lock(&lustre_runtime_mutex) +#define LUSTRE_UNLOCK() pthread_mutex_unlock(&lustre_runtime_mutex) + +void darshan_instrument_lustre_file(const char* filepath, int fd) +{ + struct lustre_record_runtime *rec_rt; + struct darshan_lustre_record *rec; + struct darshan_fs_info fs_info; + darshan_record_id rec_id; + int limit_flag; + int i; + struct lov_user_md *lum; + size_t lumsize = sizeof(struct lov_user_md) + + LOV_MAX_STRIPE_COUNT * sizeof(struct lov_user_ost_data); + size_t rec_size; + char *newname = NULL; + + LUSTRE_LOCK(); + if(instrumentation_disabled) + { + LUSTRE_UNLOCK(); + return; + } + + /* make sure the lustre module is already initialized */ + lustre_runtime_initialize(); + if(!lustre_runtime) + { + LUSTRE_UNLOCK(); + return; + } + + /* if we can't issue ioctl, we have no counter data at all */ + if ( (lum = calloc(1, lumsize)) == NULL ) + { + LUSTRE_UNLOCK(); + return; + } + + /* find out the OST count of this file so we can allocate memory */ + lum->lmm_magic = LOV_USER_MAGIC; + lum->lmm_stripe_count = LOV_MAX_STRIPE_COUNT; + + /* -1 means ioctl failed, likely because file isn't on Lustre */ + if ( ioctl( fd, LL_IOC_LOV_GETSTRIPE, (void *)lum ) == -1 ) + { + free(lum); + LUSTRE_UNLOCK(); + return; + } + + rec_size = LUSTRE_RECORD_SIZE( lum->lmm_stripe_count ); + + /* get fully qualified name for record */ + newname = darshan_clean_file_path(filepath); + if(!newname) + newname = (char*)filepath; + + { + /* broken out for clarity */ + void *end_of_new_record = (char*)lustre_runtime->next_free_record + rec_size; + void *end_of_rec_buffer = (char*)lustre_runtime->record_buffer + lustre_runtime->record_buffer_max; + limit_flag = ( end_of_new_record > end_of_rec_buffer ); + } + + /* register a Lustre file record with Darshan */ + fs_info.fs_type = -1; + darshan_core_register_record( + (void *)newname, + strlen(newname), + DARSHAN_LUSTRE_MOD, + 1, + limit_flag, + &rec_id, + &fs_info); + + /* if record id is 0, darshan has no more memory for instrumenting */ + if(rec_id == 0) + { + free(lum); + LUSTRE_UNLOCK(); + return; + } + + /* search the hash table for this file record, and initialize if not found */ + HASH_FIND(hlink, lustre_runtime->record_runtime_hash, &rec_id, sizeof(darshan_record_id), rec_rt ); + if ( !rec_rt ) { + /* allocate a new lustre record and append it to the array */ + rec_rt = &(lustre_runtime->record_runtime_array[lustre_runtime->record_count]); + rec_rt->record = lustre_runtime->next_free_record; + rec_rt->record_size = rec_size; + lustre_runtime->next_free_record = (char*)(lustre_runtime->next_free_record) + rec_size; + lustre_runtime->record_buffer_used += rec_size; + rec = rec_rt->record; + rec->rec_id = rec_id; + rec->rank = my_rank; + + /* implicit assumption here that none of these counters will change + * after the first time a file is opened. This may not always be + * true in the future */ + if ( fs_info.fs_type != -1 ) + { + rec->counters[LUSTRE_OSTS] = fs_info.ost_count; + rec->counters[LUSTRE_MDTS] = fs_info.mdt_count; + } + else + { + rec->counters[LUSTRE_OSTS] = -1; + rec->counters[LUSTRE_MDTS] = -1; + } + + rec->counters[LUSTRE_STRIPE_SIZE] = lum->lmm_stripe_size; + rec->counters[LUSTRE_STRIPE_WIDTH] = lum->lmm_stripe_count; + rec->counters[LUSTRE_STRIPE_OFFSET] = lum->lmm_stripe_offset; + for ( i = 0; i < lum->lmm_stripe_count; i++ ) + rec->ost_ids[i] = lum->lmm_objects[i].l_ost_idx; + free(lum); + + HASH_ADD(hlink, lustre_runtime->record_runtime_hash, record->rec_id, sizeof(darshan_record_id), rec_rt); + + lustre_runtime->record_count++; + } + + LUSTRE_UNLOCK(); + return; +} + +static void lustre_runtime_initialize() +{ + int mem_limit; + int max_records; + struct darshan_module_funcs lustre_mod_fns = + { + .begin_shutdown = &lustre_begin_shutdown, + .get_output_data = &lustre_get_output_data, + .shutdown = &lustre_shutdown + }; + + /* don't do anything if already initialized or instrumenation is disabled */ + if(lustre_runtime || instrumentation_disabled) + return; + + /* register the lustre module with darshan-core */ + darshan_core_register_module( + DARSHAN_LUSTRE_MOD, + &lustre_mod_fns, + &my_rank, + &mem_limit, + NULL); + + /* return if no memory assigned by darshan core */ + if(mem_limit == 0) + return; + + lustre_runtime = malloc(sizeof(*lustre_runtime)); + if(!lustre_runtime) + return; + memset(lustre_runtime, 0, sizeof(*lustre_runtime)); + + /* allocate the full size of the memory limit we are given */ + lustre_runtime->record_buffer= malloc(mem_limit); + if(!lustre_runtime->record_buffer) + { + lustre_runtime->record_buffer_max = 0; + return; + } + lustre_runtime->record_buffer_max = mem_limit; + lustre_runtime->next_free_record = lustre_runtime->record_buffer; + memset(lustre_runtime->record_buffer, 0, lustre_runtime->record_buffer_max); + + /* Allocate array of Lustre runtime data. We calculate the maximum possible + * number of records that will fit into mem_limit by assuming that each + * record has the minimum possible OST count, then allocate that many + * runtime records. record_buffer will always run out of memory before + * we overflow record_runtime_array. + */ + max_records = mem_limit / sizeof(struct darshan_lustre_record); + lustre_runtime->record_runtime_array = + malloc( max_records * sizeof(struct lustre_record_runtime)); + if(!lustre_runtime->record_runtime_array) + { + lustre_runtime->record_buffer_max = 0; + free( lustre_runtime->record_buffer ); + return; + } + memset(lustre_runtime->record_runtime_array, 0, + max_records * sizeof(struct lustre_record_runtime)); + + return; +} + +/************************************************************************** + * Functions exported by Lustre module for coordinating with darshan-core * + **************************************************************************/ + +static void lustre_begin_shutdown(void) +{ + assert(lustre_runtime); + + LUSTRE_LOCK(); + /* disable further instrumentation while Darshan shuts down */ + instrumentation_disabled = 1; + LUSTRE_UNLOCK(); + + return; +} + +static void lustre_get_output_data( + MPI_Comm mod_comm, + darshan_record_id *shared_recs, + int shared_rec_count, + void **lustre_buf, + int *lustre_buf_sz) +{ + struct lustre_record_runtime *file; + int i; + + assert(lustre_runtime); + + /* if there are globally shared files, do a shared file reduction */ + /* NOTE: the shared file reduction is also skipped if the + * DARSHAN_DISABLE_SHARED_REDUCTION environment variable is set. + */ + if (shared_rec_count && !getenv("DARSHAN_DISABLE_SHARED_REDUCTION")) + { + /* necessary initialization of shared records */ + for(i = 0; i < shared_rec_count; i++) + { + HASH_FIND(hlink, lustre_runtime->record_runtime_hash, &shared_recs[i], + sizeof(darshan_record_id), file); + assert(file); + + file->record->rank = -1; + } + + /* sort the array of files descending by rank so that we get all of the + * shared files (marked by rank -1) in a contiguous portion at end + * of the array + */ + sort_lustre_records(); + + /* simply drop all shared records from non-root ranks by truncating + * the record array and recalculating the size of the used buffer + */ + if (my_rank != 0) + { + lustre_runtime->record_count -= shared_rec_count; + lustre_runtime->record_buffer_used = 0; + for ( i = 0; i < lustre_runtime->record_count; i++ ) + lustre_runtime->record_buffer_used += + LUSTRE_RECORD_SIZE( (lustre_runtime->record_runtime_array[i]).record->counters[LUSTRE_STRIPE_WIDTH] ); + } + } + + *lustre_buf = (void *)(lustre_runtime->record_buffer); + *lustre_buf_sz = lustre_runtime->record_buffer_used; + + return; +} + +static void lustre_shutdown(void) +{ + assert(lustre_runtime); + + HASH_CLEAR(hlink, lustre_runtime->record_runtime_hash); + free(lustre_runtime->record_runtime_array); + free(lustre_runtime->record_buffer); + free(lustre_runtime); + lustre_runtime = NULL; + + return; +} + +/* compare function for sorting file records by descending rank */ +static int lustre_record_compare(const void* a_p, const void* b_p) +{ + const struct lustre_record_runtime* a = a_p; + const struct lustre_record_runtime* b = b_p; + + if (a->record->rank < b->record->rank) + return 1; + if (a->record->rank > b->record->rank) + return -1; + + /* if ( a->record->rank == b->record->rank ) we MUST do a secondary + * sort so that the order of qsort is fully deterministic and consistent + * across all MPI ranks. Without a secondary sort, the sort order can + * be affected by rank-specific variations (e.g., the order in which + * files are first opened). + */ + + return 0; +} + +/* + * Sort the record_runtimes and records by MPI rank to facilitate shared redux. + * This requires craftiness and additional heap utilization because the records + * (but not record_runtimes) have variable size. Currently has to temporarily + * duplicate the entire record_buffer; there is room for more memory-efficient + * optimization if this becomes a scalability issue. + */ +int sort_lustre_records() +{ + int i; + struct darshan_lustre_record *rec; + struct lustre_record_runtime *rec_rt, *tmp_rec_rt; + char *new_buf, *p; + + /* Create a new buffer to store an entire replica of record_buffer. Since + * we know the exact size of record_buffer's useful data at this point, we + * can allocate the exact amount we need instead of record_buffer_max */ + new_buf = malloc(lustre_runtime->record_buffer_used); + p = new_buf; + if ( !new_buf ) + return 1; + + /* qsort breaks the hash table, so delete it now to free its memory buffers + * and prevent later confusion */ + HASH_ITER( hlink, lustre_runtime->record_runtime_hash, rec_rt, tmp_rec_rt ) + HASH_DELETE( hlink, lustre_runtime->record_runtime_hash, rec_rt ); + + /* sort the runtime records, which is has fixed-length elements */ + qsort( + lustre_runtime->record_runtime_array, + lustre_runtime->record_count, + sizeof(struct lustre_record_runtime), + lustre_record_compare + ); + + /* rebuild the hash and array with the qsorted runtime records */ + for ( i = 0; i < lustre_runtime->record_count; i++ ) + { + rec_rt = &(lustre_runtime->record_runtime_array[i]); + HASH_ADD(hlink, lustre_runtime->record_runtime_hash, record->rec_id, sizeof(darshan_record_id), rec_rt ); + } + + /* create reordered record buffer, then copy it back in place */ + for ( i = 0; i < lustre_runtime->record_count; i++ ) + { + rec_rt = &(lustre_runtime->record_runtime_array[i]); + memcpy( p, rec_rt->record, rec_rt->record_size ); + /* fix record pointers within each runtime record too - pre-emptively + * point them at where they will live in record_buffer after we memcpy + * below */ + rec_rt->record = (struct darshan_lustre_record *)((char*)(lustre_runtime->record_buffer) + (p - new_buf)); + + p += rec_rt->record_size; + } + memcpy( + lustre_runtime->record_buffer, + new_buf, + lustre_runtime->record_buffer_used ); + + free(new_buf); + return 0; +} + +/* this is just boilerplate reduction code that isn't currently used */ +static void lustre_record_reduction_op(void* infile_v, void* inoutfile_v, + int *len, MPI_Datatype *datatype) +{ + struct darshan_lustre_record tmp_record; + struct darshan_lustre_record *infile = infile_v; + struct darshan_lustre_record *inoutfile = inoutfile_v; + int i, j; + + assert(lustre_runtime); + + for( i=0; i<*len; i++ ) + { + memset(&tmp_record, 0, sizeof(struct darshan_lustre_record)); + tmp_record.rec_id = infile->rec_id; + tmp_record.rank = -1; + + /* preserve only rank 0's value */ + for( j = LUSTRE_OSTS; j < LUSTRE_NUM_INDICES; j++) + { + if ( my_rank == 0 ) + { + tmp_record.counters[j] = infile->counters[j]; + } + else + { + tmp_record.counters[j] = inoutfile->counters[j]; + } + } + + /* update pointers */ + *inoutfile = tmp_record; + inoutfile++; + infile++; + } + + return; +} + +/* + * Dump the memory structure of our records and runtime records + */ +void print_lustre_runtime( void ) +{ + int i, j; + struct darshan_lustre_record *rec; + + /* print what we just loaded */ + for ( i = 0; i < lustre_runtime->record_count; i++ ) + { + rec = (lustre_runtime->record_runtime_array[i]).record; + printf( "File %2d\n", i ); + for ( j = 0; j < LUSTRE_NUM_INDICES; j++ ) + { + printf( " Counter %-2d: %10ld, addr %ld\n", + j, + rec->counters[j], + (char*)(&(rec->counters[j])) - (char*)(lustre_runtime->record_buffer) ); + } + for ( j = 0; j < rec->counters[LUSTRE_STRIPE_WIDTH]; j++ ) + { + if ( j > 0 && j % 2 == 0 ) printf("\n"); + printf( " Stripe %-2d: %10ld, addr %-9d", + j, + rec->ost_ids[j], + (char*)(&(rec->ost_ids[j])) - (char*)(lustre_runtime->record_buffer) ); + } + printf( "\n" ); + } + return; +} + +/* + * Dump the order in which records appear in memory + */ +void print_array( void ) +{ + int i; + struct lustre_record_runtime *rec_rt; + printf("*** DUMPING RECORD LIST BY ARRAY SEQUENCE\n"); + for ( i = 0; i < lustre_runtime->record_count; i++ ) + { + rec_rt = &(lustre_runtime->record_runtime_array[i]); + printf( "*** record %d rank %d osts %d\n", + rec_rt->record->rec_id, + rec_rt->record->rank, + rec_rt->record->counters[LUSTRE_STRIPE_WIDTH]); + } +} +void print_hash( void ) +{ + struct lustre_record_runtime *rec_rt, *tmp_rec_rt; + printf("*** DUMPING RECORD LIST BY HASH SEQUENCE\n"); + HASH_ITER( hlink, lustre_runtime->record_runtime_hash, rec_rt, tmp_rec_rt ) + { + printf( "*** record %d rank %d osts %d\n", + rec_rt->record->rec_id, + rec_rt->record->rank, + rec_rt->record->counters[LUSTRE_STRIPE_WIDTH]); + } + return; +} + + + + +/* + * Local variables: + * c-indent-level: 4 + * c-basic-offset: 4 + * End: + * + * vim: ts=8 sts=4 sw=4 expandtab + */ diff --git a/darshan-runtime/lib/darshan-lustre_old.c b/darshan-runtime/lib/darshan-lustre_old.c new file mode 100644 index 0000000000000000000000000000000000000000..c846a70f5e896f9aeb2ff392b8d3a8461d8758e8 --- /dev/null +++ b/darshan-runtime/lib/darshan-lustre_old.c @@ -0,0 +1,543 @@ +/* + * Copyright (C) 2015 University of Chicago. + * See COPYRIGHT notice in top-level directory. + * + */ + +#define _XOPEN_SOURCE 500 +#define _GNU_SOURCE + +#include "darshan-runtime-config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* XXX stick this into autoconf .h */ +#include + +#include "uthash.h" + +#include "darshan.h" +#include "darshan-dynamic.h" +#include "darshan-lustre.h" + +struct lustre_runtime *lustre_runtime = NULL; +static pthread_mutex_t lustre_runtime_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; +static int instrumentation_disabled = 0; +static int my_rank = -1; + +static void lustre_runtime_initialize(void); + +static void lustre_begin_shutdown(void); +static void lustre_get_output_data(MPI_Comm mod_comm, darshan_record_id *shared_recs, + int shared_rec_count, void **lustre_buf, int *lustre_buf_sz); +static void lustre_shutdown(void); +static int lustre_record_compare(const void* a_p, const void* b_p); +static void lustre_record_reduction_op(void* infile_v, void* inoutfile_v, + int *len, MPI_Datatype *datatype); + +#define LUSTRE_LOCK() pthread_mutex_lock(&lustre_runtime_mutex) +#define LUSTRE_UNLOCK() pthread_mutex_unlock(&lustre_runtime_mutex) +#define LUSTRE_RECORD_SIZE( osts ) ( sizeof(struct darshan_lustre_record) + sizeof(int64_t) * (osts - 1) ) + +void darshan_instrument_lustre_file(const char* filepath, int fd) +{ + struct lustre_record_runtime *rec_rt; + struct darshan_lustre_record *rec; + struct darshan_fs_info fs_info; + darshan_record_id rec_id; + int limit_flag; + int i; + struct lov_user_md *lum; + size_t lumsize = sizeof(struct lov_user_md) + + LOV_MAX_STRIPE_COUNT * sizeof(struct lov_user_ost_data); + size_t rec_size; + + LUSTRE_LOCK(); + /* make sure the lustre module is already initialized */ + lustre_runtime_initialize(); + + /* if we can't issue ioctl, we have no counter data at all */ + if ( (lum = calloc(1, lumsize)) == NULL ) + return; + + /* find out the OST count of this file so we can allocate memory */ + lum->lmm_magic = LOV_USER_MAGIC; + lum->lmm_stripe_count = LOV_MAX_STRIPE_COUNT; + + /* -1 means ioctl failed, likely because file isn't on Lustre */ + if ( ioctl( fd, LL_IOC_LOV_GETSTRIPE, (void *)lum ) == -1 ) + { + free(lum); + return; + } + + rec_size = LUSTRE_RECORD_SIZE( lum->lmm_stripe_count ); + + { + /* broken out for clarity */ + void *end_of_new_record = (char*)lustre_runtime->next_free_record + rec_size; + void *end_of_rec_buffer = (char*)lustre_runtime->record_buffer + lustre_runtime->record_buffer_max; + limit_flag = ( end_of_new_record > end_of_rec_buffer ); + } + + /* register a Lustre file record with Darshan */ + fs_info.fs_type = -1; + darshan_core_register_record( + (void *)filepath, + strlen(filepath), + DARSHAN_LUSTRE_MOD, + 1, + limit_flag, + &rec_id, + &fs_info); + + /* if record id is 0, darshan has no more memory for instrumenting */ + if(rec_id == 0) + { + free(lum); + LUSTRE_UNLOCK(); + return; + } + + /* search the hash table for this file record, and initialize if not found */ + HASH_FIND(hlink, lustre_runtime->record_runtime_hash, &rec_id, sizeof(darshan_record_id), rec_rt ); + if ( !rec_rt ) { + /* allocate a new lustre record and append it to the array */ + rec_rt = &(lustre_runtime->record_runtime_array[lustre_runtime->record_count]); + rec_rt->record = lustre_runtime->next_free_record; + rec_rt->record_size = rec_size; + lustre_runtime->next_free_record = (char*)(lustre_runtime->next_free_record) + rec_size; + lustre_runtime->record_buffer_used += rec_size; + rec = rec_rt->record; + rec->rec_id = rec_id; + rec->rank = my_rank; + + /* implicit assumption here that none of these counters will change + * after the first time a file is opened. This may not always be + * true in the future */ + if ( fs_info.fs_type != -1 ) + { + rec->counters[LUSTRE_OSTS] = fs_info.ost_count; + rec->counters[LUSTRE_MDTS] = fs_info.mdt_count; + } + else + { + rec->counters[LUSTRE_OSTS] = -1; + rec->counters[LUSTRE_MDTS] = -1; + } + + rec->counters[LUSTRE_STRIPE_SIZE] = lum->lmm_stripe_size; + rec->counters[LUSTRE_STRIPE_WIDTH] = lum->lmm_stripe_count; + rec->counters[LUSTRE_STRIPE_OFFSET] = lum->lmm_stripe_offset; + for ( i = 0; i < lum->lmm_stripe_count; i++ ) + rec->ost_ids[i] = lum->lmm_objects[i].l_ost_idx; + free(lum); + + HASH_ADD(hlink, lustre_runtime->record_runtime_hash, record->rec_id, sizeof(darshan_record_id), rec_rt); + + lustre_runtime->record_count++; + } + + LUSTRE_UNLOCK(); + return; +} + +static void lustre_runtime_initialize() +{ + int mem_limit; + int max_records; + struct darshan_module_funcs lustre_mod_fns = + { + .begin_shutdown = &lustre_begin_shutdown, + .get_output_data = &lustre_get_output_data, + .shutdown = &lustre_shutdown + }; + + /* don't do anything if already initialized or instrumenation is disabled */ + if(lustre_runtime || instrumentation_disabled) + return; + + /* register the lustre module with darshan-core */ + darshan_core_register_module( + DARSHAN_LUSTRE_MOD, + &lustre_mod_fns, + &my_rank, + &mem_limit, + NULL); + + /* return if no memory assigned by darshan core */ + if(mem_limit == 0) + return; + + lustre_runtime = malloc(sizeof(*lustre_runtime)); + if(!lustre_runtime) + return; + memset(lustre_runtime, 0, sizeof(*lustre_runtime)); + + /* allocate the full size of the memory limit we are given */ + lustre_runtime->record_buffer= malloc(mem_limit); + if(!lustre_runtime->record_buffer) + { + lustre_runtime->record_buffer_max = 0; + return; + } + lustre_runtime->record_buffer_max = mem_limit; + lustre_runtime->next_free_record = lustre_runtime->record_buffer; + memset(lustre_runtime->record_buffer, 0, lustre_runtime->record_buffer_max); + + /* Allocate array of Lustre runtime data. We calculate the maximum possible + * number of records that will fit into mem_limit by assuming that each + * record has the minimum possible OST count, then allocate that many + * runtime records. record_buffer will always run out of memory before + * we overflow record_runtime_array. + */ + max_records = mem_limit / sizeof(struct darshan_lustre_record); + lustre_runtime->record_runtime_array = + malloc( max_records * sizeof(struct lustre_record_runtime)); + if(!lustre_runtime->record_runtime_array) + { + lustre_runtime->record_buffer_max = 0; + free( lustre_runtime->record_buffer ); + return; + } + memset(lustre_runtime->record_runtime_array, 0, + max_records * sizeof(struct lustre_record_runtime)); + + return; +} + +/************************************************************************** + * Functions exported by Lustre module for coordinating with darshan-core * + **************************************************************************/ + +static void lustre_begin_shutdown(void) +{ + assert(lustre_runtime); + + LUSTRE_LOCK(); + /* disable further instrumentation while Darshan shuts down */ + instrumentation_disabled = 1; + LUSTRE_UNLOCK(); + + return; +} + +static void lustre_get_output_data( + MPI_Comm mod_comm, + darshan_record_id *shared_recs, + int shared_rec_count, + void **lustre_buf, + int *lustre_buf_sz) +{ + struct lustre_record_runtime *file; + int i, ishared; + int *rec_lengths; + size_t shared_rec_size; + struct darshan_lustre_record *red_send_buf = NULL; + struct darshan_lustre_record *red_recv_buf = NULL; + MPI_Datatype red_type; + MPI_Aint *rec_offsets; + MPI_Op red_op; + + assert(lustre_runtime); + + /* if there are globally shared files, do a shared file reduction */ + /* NOTE: the shared file reduction is also skipped if the + * DARSHAN_DISABLE_SHARED_REDUCTION environment variable is set. + */ + if (shared_rec_count && !getenv("DARSHAN_DISABLE_SHARED_REDUCTION")) + { + /* necessary initialization of shared records */ + for(i = 0; i < shared_rec_count; i++) + { + HASH_FIND(hlink, lustre_runtime->record_runtime_hash, &shared_recs[i], + sizeof(darshan_record_id), file); + assert(file); + + file->record->rank = -1; + } + + /* sort the array of files descending by rank so that we get all of the + * shared files (marked by rank -1) in a contiguous portion at end + * of the array + */ + sort_lustre_records(); + + /* make red_send_buf point to the first shared-file record */ + ishared = lustre_runtime->record_count - shared_rec_count; + red_send_buf = + (lustre_runtime->record_runtime_array[ishared]).record; + + /* allocate memory for the reduction output on rank 0 */ + if (my_rank == 0) + { + shared_rec_size = lustre_runtime->record_buffer_used - ((char*)red_send_buf - (char*)lustre_runtime->record_buffer); + red_recv_buf = malloc(shared_rec_size); + if (!red_recv_buf) + return; + } + + /* need to build rec_lengths (array of ints) and rec_offsets (array of ints) */ + rec_lengths = malloc(sizeof(*rec_lengths) * shared_rec_count); + rec_offsets = malloc(sizeof(*rec_offsets) * shared_rec_count); + for ( i = ishared; i < shared_rec_count; i ++ ) + { + rec_lengths[i] = (lustre_runtime->record_runtime_array[i]).record_size; + rec_offsets[i] = (char*)((lustre_runtime->record_runtime_array[i]).record) - + (char*)((lustre_runtime->record_runtime_array[ishared]).record); + } + + /* ... */ + DARSHAN_MPI_CALL(PMPI_Type_hindexed)( + shared_rec_count, + rec_lengths, + rec_offsets, + MPI_BYTE, + &red_type + ); + DARSHAN_MPI_CALL(PMPI_Type_commit)(&red_type); + DARSHAN_MPI_CALL(PMPI_Op_create)(lustre_record_reduction_op, 1, &red_op); + DARSHAN_MPI_CALL(PMPI_Reduce)(red_send_buf, red_recv_buf, + shared_rec_count, red_type, red_op, 0, mod_comm); + + /* clean up reduction state */ + if (my_rank == 0) + { + memcpy(&(lustre_runtime->record_buffer[ishared]), red_recv_buf, + shared_rec_size); + free(red_recv_buf); + } + else + { + lustre_runtime->record_count -= shared_rec_count; + } + free(rec_lengths); + free(rec_offsets); + DARSHAN_MPI_CALL(PMPI_Type_free)(&red_type); + DARSHAN_MPI_CALL(PMPI_Op_free)(&red_op); + } + + *lustre_buf = (void *)(lustre_runtime->record_buffer); + *lustre_buf_sz = lustre_runtime->record_buffer_used; + + return; +} + +static void lustre_shutdown(void) +{ + assert(lustre_runtime); + + HASH_CLEAR(hlink, lustre_runtime->record_runtime_hash); + free(lustre_runtime->record_runtime_array); + free(lustre_runtime->record_buffer); + free(lustre_runtime); + lustre_runtime = NULL; + + return; +} + +/* compare function for sorting file records by descending rank */ +static int lustre_record_compare(const void* a_p, const void* b_p) +{ + const struct lustre_record_runtime* a = a_p; + const struct lustre_record_runtime* b = b_p; + + if (a->record->rank < b->record->rank) + return 1; + if (a->record->rank > b->record->rank) + return -1; + + /* if ( a->record->rank == b->record->rank ) we MUST do a secondary + * sort so that the order of qsort is fully deterministic and consistent + * across all MPI ranks. Without a secondary sort, the sort order can + * be affected by rank-specific variations (e.g., the order in which + * files are first opened). + */ + + return 0; +} + +/* + * Sort the record_runtimes and records by MPI rank to facilitate shared redux. + * This requires craftiness and additional heap utilization because the records + * (but not record_runtimes) have variable size. Currently has to temporarily + * duplicate the entire record_buffer; there is room for more memory-efficient + * optimization if this becomes a scalability issue. + */ +int sort_lustre_records() +{ + int i; + struct darshan_lustre_record *rec; + struct lustre_record_runtime *rec_rt, *tmp_rec_rt; + char *new_buf, *p; + + /* Create a new buffer to store an entire replica of record_buffer. Since + * we know the exact size of record_buffer's useful data at this point, we + * can allocate the exact amount we need instead of record_buffer_max */ + new_buf = malloc(lustre_runtime->record_buffer_used); + p = new_buf; + if ( !new_buf ) + return 1; + + /* qsort breaks the hash table, so delete it now to free its memory buffers + * and prevent later confusion */ + HASH_ITER( hlink, lustre_runtime->record_runtime_hash, rec_rt, tmp_rec_rt ) + HASH_DELETE( hlink, lustre_runtime->record_runtime_hash, rec_rt ); + + /* sort the runtime records, which is has fixed-length elements */ + qsort( + lustre_runtime->record_runtime_array, + lustre_runtime->record_count, + sizeof(struct lustre_record_runtime), + lustre_record_compare + ); + + /* rebuild the hash and array with the qsorted runtime records */ + for ( i = 0; i < lustre_runtime->record_count; i++ ) + { + rec_rt = &(lustre_runtime->record_runtime_array[i]); + HASH_ADD(hlink, lustre_runtime->record_runtime_hash, record->rec_id, sizeof(darshan_record_id), rec_rt ); + } + + /* create reordered record buffer, then copy it back in place */ + for ( i = 0; i < lustre_runtime->record_count; i++ ) + { + rec_rt = &(lustre_runtime->record_runtime_array[i]); + memcpy( p, rec_rt->record, rec_rt->record_size ); + /* fix record pointers within each runtime record too - pre-emptively + * point them at where they will live in record_buffer after we memcpy + * below */ + rec_rt->record = (struct darshan_lustre_record *)((char*)(lustre_runtime->record_buffer) + (p - new_buf)); + + p += rec_rt->record_size; + } + memcpy( + lustre_runtime->record_buffer, + new_buf, + lustre_runtime->record_buffer_used ); + + free(new_buf); + return 0; +} + +/* this is just boilerplate reduction code that isn't currently used */ +static void lustre_record_reduction_op(void* infile_v, void* inoutfile_v, + int *len, MPI_Datatype *datatype) +{ + struct darshan_lustre_record tmp_record; + struct darshan_lustre_record *infile = infile_v; + struct darshan_lustre_record *inoutfile = inoutfile_v; + int i, j; + + assert(lustre_runtime); + + for( i=0; i<*len; i++ ) + { + memset(&tmp_record, 0, sizeof(struct darshan_lustre_record)); + tmp_record.rec_id = infile->rec_id; + tmp_record.rank = -1; + + /* preserve only rank 0's value */ + for( j = LUSTRE_OSTS; j < LUSTRE_NUM_INDICES; j++) + { + if ( my_rank == 0 ) + { + tmp_record.counters[j] = infile->counters[j]; + } + else + { + tmp_record.counters[j] = inoutfile->counters[j]; + } + } + + /* update pointers */ + *inoutfile = tmp_record; + inoutfile++; + infile++; + } + + return; +} + +/* + * Dump the memory structure of our records and runtime records + */ +void print_lustre_runtime( void ) +{ + int i, j; + struct darshan_lustre_record *rec; + + /* print what we just loaded */ + for ( i = 0; i < lustre_runtime->record_count; i++ ) + { + rec = (lustre_runtime->record_runtime_array[i]).record; + printf( "File %2d\n", i ); + for ( j = 0; j < LUSTRE_NUM_INDICES; j++ ) + { + printf( " Counter %-2d: %10ld, addr %ld\n", + j, + rec->counters[j], + (char*)(&(rec->counters[j])) - (char*)(lustre_runtime->record_buffer) ); + } + for ( j = 0; j < rec->counters[LUSTRE_STRIPE_WIDTH]; j++ ) + { + if ( j > 0 && j % 2 == 0 ) printf("\n"); + printf( " Stripe %-2d: %10ld, addr %-9d", + j, + rec->ost_ids[j], + (char*)(&(rec->ost_ids[j])) - (char*)(lustre_runtime->record_buffer) ); + } + printf( "\n" ); + } + return; +} + +/* + * Dump the order in which records appear in memory + */ +void print_array( void ) +{ + int i; + struct lustre_record_runtime *rec_rt; + printf("*** DUMPING RECORD LIST BY ARRAY SEQUENCE\n"); + for ( i = 0; i < lustre_runtime->record_count; i++ ) + { + rec_rt = &(lustre_runtime->record_runtime_array[i]); + printf( "*** record %d rank %d osts %d\n", + rec_rt->record->rec_id, + rec_rt->record->rank, + rec_rt->record->counters[LUSTRE_STRIPE_WIDTH]); + } +} +void print_hash( void ) +{ + struct lustre_record_runtime *rec_rt, *tmp_rec_rt; + printf("*** DUMPING RECORD LIST BY HASH SEQUENCE\n"); + HASH_ITER( hlink, lustre_runtime->record_runtime_hash, rec_rt, tmp_rec_rt ) + { + printf( "*** record %d rank %d osts %d\n", + rec_rt->record->rec_id, + rec_rt->record->rank, + rec_rt->record->counters[LUSTRE_STRIPE_WIDTH]); + } + return; +} + + + + +/* + * Local variables: + * c-indent-level: 4 + * c-basic-offset: 4 + * End: + * + * vim: ts=8 sts=4 sw=4 expandtab + */ diff --git a/darshan-runtime/lib/darshan-pnetcdf.c b/darshan-runtime/lib/darshan-pnetcdf.c index 1b71a89633a2d06030da109980f4d86d3b7f9d4e..baf62fa816d4871a94639e7fc586fdab851239f2 100644 --- a/darshan-runtime/lib/darshan-pnetcdf.c +++ b/darshan-runtime/lib/darshan-pnetcdf.c @@ -4,6 +4,9 @@ * */ +#define _XOPEN_SOURCE 500 +#define _GNU_SOURCE + #include "darshan-runtime-config.h" #include #include @@ -16,7 +19,6 @@ #include #include #include -#define __USE_GNU #include #include "darshan.h" diff --git a/darshan-runtime/lib/darshan-posix.c b/darshan-runtime/lib/darshan-posix.c index f9bc22e342702e8441e9a47a87eb0def46fa033e..91ac2faf585ff51365d1d35977810d471d864e6b 100644 --- a/darshan-runtime/lib/darshan-posix.c +++ b/darshan-runtime/lib/darshan-posix.c @@ -37,6 +37,10 @@ typedef int64_t off64_t; #define aiocb64 aiocb #endif +#ifndef LL_SUPER_MAGIC +#define LL_SUPER_MAGIC 0x0BD00BD0 +#endif + DARSHAN_FORWARD_DECL(open, int, (const char *path, int flags, ...)); DARSHAN_FORWARD_DECL(open64, int, (const char *path, int flags, ...)); DARSHAN_FORWARD_DECL(creat, int, (const char* path, mode_t mode)); @@ -117,6 +121,7 @@ struct posix_file_record_ref int access_count; void *stride_root; int stride_count; + int fs_type; /* same as darshan_fs_info->fs_type */ struct posix_aio_tracker* aio_list; }; @@ -161,6 +166,11 @@ static void posix_shutdown( MPI_Comm mod_comm, darshan_record_id *shared_recs, int shared_rec_count, void **posix_buf, int *posix_buf_sz); +/* XXX modules don't expose an API for other modules, so use extern to get + * Lustre instrumentation function + */ +extern void darshan_instrument_lustre_file(const char *filepath, int fd); + static struct posix_runtime *posix_runtime = NULL; static pthread_mutex_t posix_runtime_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; static int instrumentation_disabled = 0; @@ -216,6 +226,8 @@ static int darshan_mem_alignment = 1; DARSHAN_TIMER_INC_NO_OVERLAP(rec_ref->file_rec->fcounters[POSIX_F_META_TIME], \ __tm1, __tm2, rec_ref->last_meta_end); \ darshan_add_record_ref(&(posix_runtime->fd_hash), &__ret, sizeof(int), rec_ref); \ + if(rec_ref->fs_type == LL_SUPER_MAGIC) \ + darshan_instrument_lustre_file(__path, __ret); \ if(newpath != __path) free(newpath); \ } while(0) @@ -1444,7 +1456,7 @@ static struct posix_file_record_ref *posix_track_new_file_record( { struct darshan_posix_file *file_rec = NULL; struct posix_file_record_ref *rec_ref = NULL; - int file_alignment; + struct darshan_fs_info fs_info; int ret; rec_ref = malloc(sizeof(*rec_ref)); @@ -1469,7 +1481,7 @@ static struct posix_file_record_ref *posix_track_new_file_record( path, DARSHAN_POSIX_MOD, sizeof(struct darshan_posix_file), - &file_alignment); + &fs_info); if(!file_rec) { @@ -1483,8 +1495,9 @@ static struct posix_file_record_ref *posix_track_new_file_record( file_rec->base_rec.id = rec_id; file_rec->base_rec.rank = my_rank; file_rec->counters[POSIX_MEM_ALIGNMENT] = darshan_mem_alignment; - file_rec->counters[POSIX_FILE_ALIGNMENT] = file_alignment; + file_rec->counters[POSIX_FILE_ALIGNMENT] = fs_info.block_size; rec_ref->file_rec = file_rec; + rec_ref->fs_type = fs_info.fs_type; posix_runtime->file_rec_count++; return(rec_ref); diff --git a/darshan-test/2.x/llapi-perf.c b/darshan-test/2.x/llapi-perf.c new file mode 100644 index 0000000000000000000000000000000000000000..be90d0e3441b4c5c2d889c7a0f7e07fc2eead16b --- /dev/null +++ b/darshan-test/2.x/llapi-perf.c @@ -0,0 +1,314 @@ +/* + * (C) 2012 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +/* llapi-perf.c + * Time how long it takes to extract various file data from Lustre via + * ioctl and llapi calls from every process. -i uses ioctl, -a uses the + * Lustre API. This also retains the features of stat-perf.c, which + * times how long it takes to issue a stat64() call to the designated file + * from every process. -f causes it to use fstat64() rather than stat64(). + * -l causes it to use lseek(SEEK_END) instead of stat64(). + * -c causes it to create the file from scratch rather than operating on an + * existing file. -r issues a realpath() call on the file. + */ + +#define _LARGEFILE64_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef NO_LUSTRE +#include +#endif + +static char* opt_file = NULL; +static int opt_create = 0; +static int opt_fstat = 0; +static int opt_lseek = 0; +static int opt_realpath = 0; +static int opt_ioctl = 0; +static int opt_llapi = 0; +static int opt_fpp = 0; +static int rank = -1; + +static int parse_args(int argc, char **argv); +static void usage(void); + +int main(int argc, char **argv) +{ + int fd; + int ret; + double stime, etime, elapsed, slowest; + struct stat64 statbuf; + int nprocs; + off64_t offset, orig_offset; + char* new_path; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + /* parse the command line arguments */ + parse_args(argc, argv); + + MPI_Barrier(MPI_COMM_WORLD); + + /* open specified file */ + if(!opt_create) + { + fd = open(opt_file, O_RDWR); + if(fd < 0) + { + perror("open"); + exit(1); + } + } + else + { + /* rank 0 create, everyone else open */ + if(rank == 0 || opt_fpp) + { + fd = open(opt_file, O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR); + if(fd < 0) + { + perror("open"); + exit(1); + } + MPI_Barrier(MPI_COMM_WORLD); + } + else + { + MPI_Barrier(MPI_COMM_WORLD); + fd = open(opt_file, O_RDWR); + if(fd < 0) + { + perror("open"); + exit(1); + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); + stime = MPI_Wtime(); + + ret = 0; + if(opt_fstat) + ret = fstat64(fd, &statbuf); + else if(opt_lseek) + { + /* find current position */ + orig_offset = lseek64(fd, 0, SEEK_CUR); + if(orig_offset < 0) + ret = -1; + else + { + /* find end of file; this is the size */ + offset = lseek64(fd, 0, SEEK_END); + if(offset < 0) + ret = -1; + else + { + /* go back to original position */ + offset = lseek64(fd, orig_offset, SEEK_SET); + if(offset < 0) + ret = -1; + } + } + } + else if(opt_realpath) + { + new_path = realpath(opt_file, NULL); + if(!new_path) + ret = -1; + else + free(new_path); + } + else if ( opt_llapi || opt_ioctl ) + { +#ifdef NO_LUSTRE + fprintf(stderr, "Not compiled with Lustre support\n"); + ret = -1; +#else + struct lov_user_md *lum; + size_t lumsize = sizeof(struct lov_user_md) + + LOV_MAX_STRIPE_COUNT * sizeof(struct lov_user_ost_data); + + lum = calloc(1, lumsize); + if (lum == NULL) { + ret = ENOMEM; + fprintf(stderr, "No memory\n"); + } + else { + if ( opt_llapi ) + { + ret = llapi_file_get_stripe(opt_file, lum); + } + else if ( opt_ioctl ) + { + lum->lmm_magic = LOV_USER_MAGIC; + lum->lmm_stripe_count = LOV_MAX_STRIPE_COUNT; + ret = ioctl( fd, LL_IOC_LOV_GETSTRIPE, (void *)lum ); + } +#ifdef DEBUG + /* different API/ioctl calls populate only parts of lum */ + printf( "stripe_width=%d stripe_size=%d starting_ost=%d\n", + lum->lmm_stripe_count, + lum->lmm_stripe_size, + lum->lmm_stripe_count ); +#endif + } +#endif + } + else + ret = stat64(opt_file, &statbuf); + + if(ret != 0 && !opt_ioctl && !opt_llapi) + { + perror("stat64 or fstat64"); + exit(1); + } +#ifndef NO_LUSTRE + else if ( ret < 0 && opt_ioctl ) + { + perror("ioctl"); + exit(1); + } +#endif + + etime = MPI_Wtime(); + + elapsed = etime-stime; + ret = MPI_Reduce(&elapsed, &slowest, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + if(ret != 0) + { + fprintf(stderr, "Error: MPI_Reduce() failure.\n"); + exit(1); + } + + + slowest *= 1000.0; + + if(rank == 0) + { + printf("opt_file: %s, opt_create: %d, opt_fstat: %d, opt_lseek: %d, opt_realpath: %d, opt_llapi: %d, opt_ioctl: %d, opt_fpp: %d, nprocs: %d, time: %f ms\n", + opt_file, + opt_create, + opt_fstat, + opt_lseek, + opt_realpath, + opt_llapi, + opt_ioctl, + opt_fpp, + nprocs, + slowest); + } + + MPI_Finalize(); + return(0); +} + +static int parse_args(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "fclripa")) != EOF) { + switch (c) { + case 'c': /* create file */ + opt_create = 1; + break; + case 'f': /* fstat instead of stat */ + opt_fstat = 1; + break; + case 'l': /* lseek instead of stat */ + opt_lseek = 1; + break; + case 'r': /* realpath instead of stat */ + opt_realpath = 1; + break; + case 'i': /* use ioctl test */ + opt_ioctl = 1; + break; + case 'a': /* use llapi test*/ + opt_llapi = 1; + break; + case 'p': /* file per process instead of shared file */ + opt_fpp = 1; + break; + case 'h': + if (rank == 0) + usage(); + exit(0); + case '?': /* unknown */ + if (rank == 0) + usage(); + exit(1); + default: + break; + } + } + + if(opt_lseek + opt_fstat + opt_realpath + opt_ioctl + opt_llapi > 1) + { + fprintf(stderr, "Error: Only specify one of -l, -f, -i, -a, or -r.\n"); + usage(); + exit(1); + } + + if(argc-optind != 1) + { + if(rank == 0) + usage(); + exit(1); + } + + if ( opt_fpp ) + { + opt_file = malloc( sizeof(char) * (strlen( argv[optind] ) + 10) ); + sprintf( opt_file, "%s.%d", argv[optind], rank ); + } + else + { + opt_file = strdup(argv[optind]); + } + assert(opt_file); + + return(0); +} + +static void usage(void) +{ + printf("Usage: stat-perf [...] \n"); + printf("\n is one or more of\n"); + printf(" -c create new file to stat\n"); + printf(" -p do file-per-process instead of shared file\n"); + printf(" -f use fstat instead of stat\n"); + printf(" -l use lseek instead of stat\n"); + printf(" -r use realpath instead of stat\n"); + printf(" -a use Lustre API test\n"); + printf(" -i use ioctl Lustre test\n"); + printf(" -h print this help\n"); +} + +/* + * Local variables: + * c-indent-level: 3 + * c-basic-offset: 3 + * tab-width: 3 + * + * vim: ts=3 + * End: + */ + + diff --git a/darshan-test/lustre/.gitignore b/darshan-test/lustre/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d67066f79a5418dbeee098904fd7090e293c98c1 --- /dev/null +++ b/darshan-test/lustre/.gitignore @@ -0,0 +1,2 @@ +*.o +darshan-tester diff --git a/darshan-test/lustre/Makefile b/darshan-test/lustre/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..55c1a47cc4bc3d5c8fa1815ec20374156d111604 --- /dev/null +++ b/darshan-test/lustre/Makefile @@ -0,0 +1,19 @@ +.PHONY: clean +BINS = darshan-tester darshan-tester-mpi +OBJS = darshan-lustre.o darshan-core-stub.o +CFLAGS = -O0 -g -I../.. -I../../darshan-runtime + +### Include -I. when building non-MPI tests to include the mpi.h stub header +CFLAGS += -I. + +darshan-tester: $(OBJS) + $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@ + +darshan-tester-mpi: $(OBJS) + $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@ + +darshan-lustre.o: ../../darshan-runtime/lib/darshan-lustre.c + $(CC) $(CPPFLAGS) $(CFLAGS) -c $? -o $@ + +clean: + -@rm -v $(OBJS) $(BINS) diff --git a/darshan-test/lustre/darshan-core-stub.c b/darshan-test/lustre/darshan-core-stub.c new file mode 100644 index 0000000000000000000000000000000000000000..95dbd32bd6cf489c55a79bd7e6160cda7ad2ffc1 --- /dev/null +++ b/darshan-test/lustre/darshan-core-stub.c @@ -0,0 +1,104 @@ +#define _XOPEN_SOURCE 500 +#define _GNU_SOURCE + +#include "darshan-runtime-config.h" +#include "darshan.h" +#include "darshan-core.h" + +#include +#include +#include +#include + +#include "darshan-lustre.h" + +/* + * Global variables + */ +static darshan_record_id next_rec_id = 1; +static int my_rank = 0; +static struct darshan_module_funcs mod_funcs; + +/* + * Import routines from Lustre module + */ +extern struct lustre_runtime *lustre_runtime; + +void darshan_core_register_record( + void *name, + int len, + darshan_module_id mod_id, + int printable_flag, + int mod_limit_flag, + darshan_record_id *rec_id, + struct darshan_fs_info *fs_info) +{ + *rec_id = next_rec_id++; + + if (fs_info) + { + memset( fs_info, 0, sizeof(struct darshan_fs_info) ); + fs_info->fs_type = -1; + } + + return; +} + +void darshan_core_register_module( + darshan_module_id mod_id, + struct darshan_module_funcs *funcs, + int *rank, + int *mod_mem_limit, + int *sys_mem_alignment) +{ +/* if (sys_mem_alignment) *sys_mem_alignment = darshan_mem_alignment; */ + if (rank) *rank = my_rank; + *mod_mem_limit = DARSHAN_MOD_MEM_MAX; + mod_funcs = *funcs; + + return; +} + +void darshan_core_shutdown() +{ + darshan_record_id *mod_shared_recs = NULL; + int mod_shared_rec_cnt = 0; + void* mod_buf = NULL; + int mod_buf_sz = 0; + + mod_funcs.begin_shutdown(); + mod_funcs.get_output_data( MPI_COMM_WORLD, mod_shared_recs, mod_shared_rec_cnt, &mod_buf, &mod_buf_sz ); + + print_lustre_runtime(); + + mod_funcs.shutdown(); + + return; +} + +int main( int argc, char **argv ) +{ + int fd, i; + char *fname; + + srand(234); + + /* build Darshan records */ + for ( i = 1; i < argc; i++ ) + { + fname = argv[i]; + printf( "File %3d - processing %s\n", i, fname ); + fd = open( fname, O_RDONLY ); + darshan_instrument_lustre_file( fname, fd ); + close(fd); + } + + for ( i = 0; i < lustre_runtime->record_count; i++ ) + (lustre_runtime->record_runtime_array[i]).record->rank = rand() % 10; + + print_lustre_runtime(); + + darshan_core_shutdown(); + + return 0; +} diff --git a/darshan-test/lustre/mpi.h b/darshan-test/lustre/mpi.h new file mode 100644 index 0000000000000000000000000000000000000000..dc9a263daf909ec8343ee721ea0027e2d3a5010e --- /dev/null +++ b/darshan-test/lustre/mpi.h @@ -0,0 +1,10 @@ +/* + * VERY primitive stubs that allow darshan.h to be included in non-MPI + * applications like darshan-tester + */ +typedef int MPI_Comm; +typedef int MPI_Datatype; +typedef int MPI_Op; +typedef long MPI_Aint; +unsigned char MPI_BYTE; +#define MPI_COMM_WORLD 0 diff --git a/darshan-test/lustre/test-darshan.sh b/darshan-test/lustre/test-darshan.sh new file mode 100755 index 0000000000000000000000000000000000000000..f1bb21fb6d875b9ebf26ba6f6f5cfa7615456f81 --- /dev/null +++ b/darshan-test/lustre/test-darshan.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# +# Run the test program through Valgrind to expose memory leaks and buffer +# overflows on a variety of different file locations and geometries +# + +### Make some files to test. Assume $SCRATCH points at Lustre +for stripe in 1 2 4 8 16 32 +do + if [ ! -f $SCRATCH/stripe${stripe} ]; then + lfs setstripe -c $stripe $SCRATCH/stripe${stripe} + fi +done + +set -x + +valgrind --tool=memcheck \ + --leak-check=yes \ + --show-reachable=yes \ + --num-callers=20 \ + --track-fds=yes \ + --read-var-info=yes \ + ./darshan-tester \ + $SCRATCH/stripe4 \ + $SCRATCH/stripe32 \ + $SCRATCH/stripe1 \ + $SCRATCH/stripe16 \ + $SCRATCH/stripe8 \ + $HOME/.bashrc \ + $SCRATCH/stripe2 + +set +x diff --git a/darshan-util/Makefile.in b/darshan-util/Makefile.in index 3b6e88b2833f93f4080a07f5dfe06d77de0279dd..77e40d88c146f8bba0001cd19133cc515052e0f7 100644 --- a/darshan-util/Makefile.in +++ b/darshan-util/Makefile.in @@ -13,10 +13,10 @@ libdir = $(DESTDIR)@libdir@ pkgconfigdir = $(DESTDIR)$(libdir)/pkgconfig DARSHAN_LOG_FORMAT = $(srcdir)/../darshan-log-format.h -DARSHAN_MOD_LOG_FORMATS = $(srcdir)/../darshan-posix-log-format.h $(srcdir)/../darshan-mpiio-log-format.h $(srcdir)/../darshan-hdf5-log-format.h $(srcdir)/../darshan-pnetcdf-log-format.h -DARSHAN_MOD_LOGUTIL_HEADERS = darshan-posix-logutils.h darshan-mpiio-logutils.h darshan-hdf5-logutils.h darshan-pnetcdf-logutils.h -DARSHAN_STATIC_MOD_OBJS = darshan-posix-logutils.o darshan-mpiio-logutils.o darshan-hdf5-logutils.o darshan-pnetcdf-logutils.o darshan-bgq-logutils.o -DARSHAN_DYNAMIC_MOD_OBJS = darshan-posix-logutils.po darshan-mpiio-logutils.po darshan-hdf5-logutils.po darshan-pnetcdf-logutils.po darshan-bgq-logutils.po +DARSHAN_MOD_LOG_FORMATS = $(srcdir)/../darshan-posix-log-format.h $(srcdir)/../darshan-mpiio-log-format.h $(srcdir)/../darshan-hdf5-log-format.h $(srcdir)/../darshan-pnetcdf-log-format.h $(srcdir)/../darshan-lustre-log-format.h +DARSHAN_MOD_LOGUTIL_HEADERS = darshan-posix-logutils.h darshan-mpiio-logutils.h darshan-hdf5-logutils.h darshan-pnetcdf-logutils.h darshan-lustre-logutils.h +DARSHAN_STATIC_MOD_OBJS = darshan-posix-logutils.o darshan-mpiio-logutils.o darshan-hdf5-logutils.o darshan-pnetcdf-logutils.o darshan-bgq-logutils.o darshan-lustre-logutils.o +DARSHAN_DYNAMIC_MOD_OBJS = darshan-posix-logutils.po darshan-mpiio-logutils.po darshan-hdf5-logutils.po darshan-pnetcdf-logutils.po darshan-bgq-logutils.po darshan-lustre-logutils.po DARSHAN_ENABLE_SHARED=@DARSHAN_ENABLE_SHARED@ @@ -81,6 +81,11 @@ darshan-bgq-logutils.o: darshan-bgq-logutils.c darshan-logutils.h darshan-bgq-lo darshan-bgq-logutils.po: darshan-bgq-logutils.c darshan-logutils.h darshan-bgq-logutils.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-bgq-log-format.h | uthash-1.9.2 $(CC) $(CFLAGS_SHARED) -c $< -o $@ +darshan-lustre-logutils.o: darshan-lustre-logutils.c darshan-logutils.h darshan-lustre-logutils.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-lustre-log-format.h | uthash-1.9.2 + $(CC) $(CFLAGS) -c $< -o $@ +darshan-lustre-logutils.po: darshan-lustre-logutils.c darshan-logutils.h darshan-lustre-logutils.h $(DARSHAN_LOG_FORMAT) $(srcdir)/../darshan-lustre-log-format.h | uthash-1.9.2 + $(CC) $(CFLAGS_SHARED) -c $< -o $@ + libdarshan-util.a: darshan-logutils.o $(DARSHAN_STATIC_MOD_OBJS) ar rcs libdarshan-util.a $^ diff --git a/darshan-util/darshan-convert.c b/darshan-util/darshan-convert.c index f74c1e85b4b1b3a107fa92347c63e6f75fa84a0f..de4b0388b4fdd1f854cc7d9e1eec91483bdadaee 100644 --- a/darshan-util/darshan-convert.c +++ b/darshan-util/darshan-convert.c @@ -19,8 +19,6 @@ #include "darshan-logutils.h" -#define DEF_MOD_BUF_SIZE 1024 /* 1 KiB is enough for all current mod records ... */ - extern uint32_t darshan_hashlittle(const void *key, size_t length, uint32_t initval); int usage (char *exename) @@ -343,6 +341,14 @@ int main(int argc, char **argv) return(-1); } + mod_buf = malloc(DEF_MOD_BUF_SIZE); + if (!mod_buf) + { + darshan_log_close(infile); + darshan_log_close(outfile); + return(-1); + } + /* loop over each module and convert it's data to the new format */ for(i=0; ilog_get_record(infile, mod_buf)) == 1); } + free(mod_buf); darshan_log_close(infile); darshan_log_close(outfile); diff --git a/darshan-util/darshan-logutils.h b/darshan-util/darshan-logutils.h index b76fc31913fbaca7dc6e29aa8662c817b916f4aa..84a45fdc4418fb1ca71971453169dbcb69d6a58d 100644 --- a/darshan-util/darshan-logutils.h +++ b/darshan-util/darshan-logutils.h @@ -17,6 +17,11 @@ #include "darshan-log-format.h" +/* Maximum size of a record - Lustre OST lists can get huge, but 81920 is enough + * for 10K OSTs + */ +#define DEF_MOD_BUF_SIZE 81920 /* 640 KiB */ + struct darshan_fd_int_state; /* darshan file descriptor definition */ @@ -120,6 +125,7 @@ extern struct darshan_mod_logutil_funcs *mod_logutils[]; #include "darshan-hdf5-logutils.h" #include "darshan-pnetcdf-logutils.h" #include "darshan-bgq-logutils.h" +#include "darshan-lustre-logutils.h" darshan_fd darshan_log_open(const char *name); darshan_fd darshan_log_create(const char *name, enum darshan_comp_type comp_type, diff --git a/darshan-util/darshan-lustre-logutils.c b/darshan-util/darshan-lustre-logutils.c new file mode 100644 index 0000000000000000000000000000000000000000..ecc5a323bb1492e4d97ed3d20d6f3a3b98baeedc --- /dev/null +++ b/darshan-util/darshan-lustre-logutils.c @@ -0,0 +1,232 @@ +/* + * Copyright (C) 2015 University of Chicago. + * See COPYRIGHT notice in top-level directory. + * + */ + +#define _GNU_SOURCE +#include "darshan-util-config.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "darshan-logutils.h" + +/* counter name strings for the LUSTRE module */ +#define X(a) #a, +char *lustre_counter_names[] = { + LUSTRE_COUNTERS +}; +#undef X + +static int darshan_log_get_lustre_record(darshan_fd fd, void* lustre_buf, + darshan_record_id* rec_id); +static int darshan_log_put_lustre_record(darshan_fd fd, void* lustre_buf, int ver); +static void darshan_log_print_lustre_record(void *file_rec, + char *file_name, char *mnt_pt, char *fs_type, int ver); +static void darshan_log_print_lustre_description(void); +static void darshan_log_print_lustre_record_diff(void *rec1, char *file_name1, + void *rec2, char *file_name2); + +struct darshan_mod_logutil_funcs lustre_logutils = +{ + .log_get_record = &darshan_log_get_lustre_record, + .log_put_record = &darshan_log_put_lustre_record, + .log_print_record = &darshan_log_print_lustre_record, + .log_print_description = &darshan_log_print_lustre_description, + .log_print_diff = &darshan_log_print_lustre_record_diff +}; + +static int darshan_log_get_lustre_record(darshan_fd fd, void* lustre_buf, + darshan_record_id* rec_id) +{ + struct darshan_lustre_record *rec; + int i; + int ret; + + /* retrieve the fixed-size portion of the record */ + ret = darshan_log_getmod(fd, DARSHAN_LUSTRE_MOD, lustre_buf, + sizeof(struct darshan_lustre_record)); + if(ret < 0) + return(-1); + else if(ret < sizeof(struct darshan_lustre_record)) + return(0); + + rec = (struct darshan_lustre_record *)lustre_buf; + + /* swap bytes if necessary */ + if(fd->swap_flag) + { + DARSHAN_BSWAP64(&rec->rec_id); + DARSHAN_BSWAP64(&rec->rank); + for(i=0; icounters[i]); + } + + /* now read the rest of the record */ + if ( rec->counters[LUSTRE_STRIPE_WIDTH] > 1 ) { + ret = darshan_log_getmod( + fd, + DARSHAN_LUSTRE_MOD, + (void*)(&(rec->ost_ids[1])), + (rec->counters[LUSTRE_STRIPE_WIDTH] - 1)*sizeof(OST_ID) + ); + if(ret < 0) + return(-1); + else if(ret < (rec->counters[LUSTRE_STRIPE_WIDTH] - 1)*sizeof(OST_ID)) + return(0); + /* swap bytes if necessary */ + if ( fd->swap_flag ) + for (i = 1; i < rec->counters[LUSTRE_STRIPE_WIDTH]; i++ ) + DARSHAN_BSWAP64(&(rec->ost_ids[i])); + } + + *rec_id = rec->rec_id; + return(1); +} + +static int darshan_log_put_lustre_record(darshan_fd fd, void* lustre_buf, int ver) +{ + struct darshan_lustre_record *rec = (struct darshan_lustre_record *)lustre_buf; + int ret; + + ret = darshan_log_putmod(fd, DARSHAN_LUSTRE_MOD, rec, + LUSTRE_RECORD_SIZE(rec->counters[LUSTRE_STRIPE_WIDTH]), ver); + if(ret < 0) + return(-1); + + return(0); +} + +static void darshan_log_print_lustre_record(void *rec, char *file_name, + char *mnt_pt, char *fs_type, int ver) +{ + int i; + struct darshan_lustre_record *lustre_rec = + (struct darshan_lustre_record *)rec; + + for(i=0; irank, lustre_rec->rec_id, lustre_counter_names[i], + lustre_rec->counters[i], file_name, mnt_pt, fs_type); + } + + for (i = 0; i < lustre_rec->counters[LUSTRE_STRIPE_WIDTH]; i++ ) + { + char strbuf[25]; + snprintf( strbuf, 25, "LUSTRE_OST_ID_%d", i ); + DARSHAN_COUNTER_PRINT(darshan_module_names[DARSHAN_LUSTRE_MOD], + lustre_rec->rank, + lustre_rec->rec_id, + strbuf, + lustre_rec->ost_ids[i], + file_name, + mnt_pt, + fs_type); + } + + return; +} + +static void darshan_log_print_lustre_description() +{ + /* TODO: add actual counter descriptions here */ + printf("\n# description of LUSTRE counters:\n"); + printf("# LUSTRE_OSTS: number of OSTs across the entire file system.\n"); + printf("# LUSTRE_MDTS: number of MDTs across the entire file system.\n"); + printf("# LUSTRE_STRIPE_SIZE: stripe size for file in bytes.\n"); + printf("# LUSTRE_STRIPE_WIDTH: number of OSTs over which file is striped.\n"); + printf("# LUSTRE_STRIPE_OFFSET: OST ID offset specified when the file was created.\n"); + printf("# LUSTRE_OST_ID_*: indices of OSTs over which the file is striped.\n"); + + DARSHAN_PRINT_HEADER(); + + return; +} + +static void darshan_log_print_lustre_record_diff(void *rec1, char *file_name1, + void *rec2, char *file_name2) +{ + struct darshan_lustre_record *lustre_rec1 = (struct darshan_lustre_record *)rec1; + struct darshan_lustre_record *lustre_rec2 = (struct darshan_lustre_record *)rec2; + int i; + + /* NOTE: we assume that both input records are the same module format version */ + + for(i=0; irank, lustre_rec1->rec_id, lustre_counter_names[i], + lustre_rec1->counters[i], file_name1, "", ""); + + } + else if(!lustre_rec1) + { + printf("+ "); + DARSHAN_COUNTER_PRINT(darshan_module_names[DARSHAN_LUSTRE_MOD], + lustre_rec2->rank, lustre_rec2->rec_id, lustre_counter_names[i], + lustre_rec2->counters[i], file_name2, "", ""); + } + else if(lustre_rec1->counters[i] != lustre_rec2->counters[i]) + { + printf("- "); + DARSHAN_COUNTER_PRINT(darshan_module_names[DARSHAN_LUSTRE_MOD], + lustre_rec1->rank, lustre_rec1->rec_id, lustre_counter_names[i], + lustre_rec1->counters[i], file_name1, "", ""); + printf("+ "); + DARSHAN_COUNTER_PRINT(darshan_module_names[DARSHAN_LUSTRE_MOD], + lustre_rec2->rank, lustre_rec2->rec_id, lustre_counter_names[i], + lustre_rec2->counters[i], file_name2, "", ""); + } + } + + /* would it be more or less useful to sort the OST IDs before comparing? */ + if ( lustre_rec1->counters[LUSTRE_STRIPE_WIDTH] == lustre_rec2->counters[LUSTRE_STRIPE_WIDTH] ) { + for (i = 0; i < lustre_rec1->counters[LUSTRE_STRIPE_WIDTH]; i++ ) + { + if (lustre_rec1->ost_ids[i] != lustre_rec2->ost_ids[i]) + { + char strbuf[25]; + snprintf( strbuf, 25, "LUSTRE_OST_ID_%d", i ); + printf("- "); + DARSHAN_COUNTER_PRINT(darshan_module_names[DARSHAN_LUSTRE_MOD], + lustre_rec1->rank, + lustre_rec1->rec_id, + strbuf, + lustre_rec1->ost_ids[i], + file_name1, + "", + ""); + printf("+ "); + DARSHAN_COUNTER_PRINT(darshan_module_names[DARSHAN_LUSTRE_MOD], + lustre_rec2->rank, + lustre_rec2->rec_id, + strbuf, + lustre_rec2->ost_ids[i], + file_name2, + "", + ""); + } + } + } + + return; +} + +/* + * Local variables: + * c-indent-level: 4 + * c-basic-offset: 4 + * End: + * + * vim: ts=8 sts=4 sw=4 expandtab + */ diff --git a/darshan-util/darshan-lustre-logutils.h b/darshan-util/darshan-lustre-logutils.h new file mode 100644 index 0000000000000000000000000000000000000000..3be21a9ab0a49b575e55c91cee853dcc1d95b492 --- /dev/null +++ b/darshan-util/darshan-lustre-logutils.h @@ -0,0 +1,14 @@ +/* + * Copyright (C) 2015 University of Chicago. + * See COPYRIGHT notice in top-level directory. + * + */ + +#ifndef __DARSHAN_LUSTRE_LOG_UTILS_H +#define __DARSHAN_LUSTRE_LOG_UTILS_H + +extern char *lustre_counter_names[]; + +extern struct darshan_mod_logutil_funcs lustre_logutils; + +#endif diff --git a/darshan-util/darshan-parser.c b/darshan-util/darshan-parser.c index cb12ee9b5b06327e226f1960235d45b7a891073a..c0dad99380c5a1e0a7a6771e3d39571847b958ba 100644 --- a/darshan-util/darshan-parser.c +++ b/darshan-util/darshan-parser.c @@ -20,8 +20,6 @@ #include "darshan-logutils.h" -#define DEF_MOD_BUF_SIZE 1024 /* 1 KiB is enough for all current mod records ... */ - /* * Options */ @@ -213,7 +211,7 @@ int main(int argc, char **argv) char *save; char buffer[DARSHAN_JOB_METADATA_LEN]; int empty_mods = 0; - char mod_buf[DEF_MOD_BUF_SIZE]; + char *mod_buf; hash_entry_t *file_hash = NULL; hash_entry_t *curr = NULL; @@ -364,6 +362,12 @@ int main(int argc, char **argv) memset(pdata.rank_cumul_md_time, 0, sizeof(double)*job.nprocs); } + mod_buf = malloc(DEF_MOD_BUF_SIZE); + if (!mod_buf) { + darshan_log_close(fd); + return(-1); + } + for(i=0; i