Commit 08d83a03 authored by Glenn K. Lockwood's avatar Glenn K. Lockwood
Browse files

added filename hashing so multiple opens/closes don't create multiple...

added filename hashing so multiple opens/closes don't create multiple lustre-mod records; added proper reducer so that shared files are flattened into a single set of counters whose values are dictated by rank 0
parent 4b63144a
...@@ -26,6 +26,12 @@ ...@@ -26,6 +26,12 @@
#include "darshan.h" #include "darshan.h"
#include "darshan-dynamic.h" #include "darshan-dynamic.h"
/* runtime bookkeeping for one Lustre file record: pairs the persistent
 * record data with the uthash handle used to look it up by record id,
 * so repeated opens/closes of the same file reuse a single record */
struct lustre_record_runtime
{
/* points at this record's slot in lustre_runtime->record_array */
struct darshan_lustre_record *record;
/* uthash handle; records are hashed on record->rec_id (see HASH_ADD
 * / HASH_FIND calls keyed on sizeof(darshan_record_id)) */
UT_hash_handle hlink;
};
/* we just use a simple array for storing records. the POSIX module /* we just use a simple array for storing records. the POSIX module
* only calls into the Lustre module for new records, so we will never * only calls into the Lustre module for new records, so we will never
* have to search for an existing Lustre record (assuming the Lustre * have to search for an existing Lustre record (assuming the Lustre
...@@ -34,8 +40,10 @@ ...@@ -34,8 +40,10 @@
struct lustre_runtime struct lustre_runtime
{ {
struct darshan_lustre_record *record_array; struct darshan_lustre_record *record_array;
struct lustre_record_runtime *record_runtime_array;
int record_array_size; int record_array_size;
int record_array_ndx; int record_array_ndx;
struct lustre_record_runtime *record_hash;
}; };
static struct lustre_runtime *lustre_runtime = NULL; static struct lustre_runtime *lustre_runtime = NULL;
...@@ -49,72 +57,92 @@ static void lustre_begin_shutdown(void); ...@@ -49,72 +57,92 @@ static void lustre_begin_shutdown(void);
static void lustre_get_output_data(MPI_Comm mod_comm, darshan_record_id *shared_recs, static void lustre_get_output_data(MPI_Comm mod_comm, darshan_record_id *shared_recs,
int shared_rec_count, void **lustre_buf, int *lustre_buf_sz); int shared_rec_count, void **lustre_buf, int *lustre_buf_sz);
static void lustre_shutdown(void); static void lustre_shutdown(void);
static int lustre_record_compare(const void* a_p, const void* b_p);
static void lustre_record_reduction_op(void* infile_v, void* inoutfile_v,
int *len, MPI_Datatype *datatype);
#define LUSTRE_LOCK() pthread_mutex_lock(&lustre_runtime_mutex) #define LUSTRE_LOCK() pthread_mutex_lock(&lustre_runtime_mutex)
#define LUSTRE_UNLOCK() pthread_mutex_unlock(&lustre_runtime_mutex) #define LUSTRE_UNLOCK() pthread_mutex_unlock(&lustre_runtime_mutex)
/* TODO: is there any way we can further compact Lustre data to save space?
* e.g., are all files in the same directory guaranteed same striping parameters?
* if so, can we store stripe parameters on per-directory basis and the OST
* list on a per-file basis? maybe the storage savings are small enough this isn't
* worth it, but nice to keep in mind
*/
void darshan_instrument_lustre_file(const char* filepath, int fd) void darshan_instrument_lustre_file(const char* filepath, int fd)
{ {
struct darshan_lustre_record *rec; struct lustre_record_runtime *rec_rt;
struct darshan_fs_info fs_info; struct darshan_fs_info fs_info;
darshan_record_id rec_id; darshan_record_id rec_id;
struct lov_user_md *lum; int limit_flag;
size_t lumsize = sizeof(struct lov_user_md) +
LOV_MAX_STRIPE_COUNT * sizeof(struct lov_user_ost_data);
LUSTRE_LOCK(); LUSTRE_LOCK();
/* make sure the lustre module is already initialized */ /* make sure the lustre module is already initialized */
lustre_runtime_initialize(); lustre_runtime_initialize();
/* if the array is full, we just back out */ /* if the array is full, we just back out */
if(lustre_runtime->record_array_ndx >= lustre_runtime->record_array_size) limit_flag = (lustre_runtime->record_array_ndx >= lustre_runtime->record_array_size);
return;
/* register a Lustre file record with Darshan */ /* register a Lustre file record with Darshan */
fs_info.fs_type = -1;
darshan_core_register_record( darshan_core_register_record(
(void *)filepath, (void *)filepath,
strlen(filepath), strlen(filepath),
DARSHAN_LUSTRE_MOD, DARSHAN_LUSTRE_MOD,
1, 1,
0, limit_flag,
&rec_id, &rec_id,
&fs_info); &fs_info);
/* if record id is 0, darshan has no more memory for instrumenting */ /* if record id is 0, darshan has no more memory for instrumenting */
if(rec_id == 0) if(rec_id == 0)
{
LUSTRE_UNLOCK();
return; return;
}
/* search the hash table for this file record, and initialize if not found */
HASH_FIND(hlink, lustre_runtime->record_hash, &rec_id, sizeof(darshan_record_id), rec_rt );
if ( !rec_rt ) {
struct darshan_lustre_record *rec;
struct lov_user_md *lum;
size_t lumsize = sizeof(struct lov_user_md) +
LOV_MAX_STRIPE_COUNT * sizeof(struct lov_user_ost_data);
/* allocate a new lustre record and append it to the array */
rec_rt = &(lustre_runtime->record_runtime_array[lustre_runtime->record_array_ndx]);
rec_rt->record = &(lustre_runtime->record_array[lustre_runtime->record_array_ndx]);
rec = rec_rt->record;
rec->rec_id = rec_id;
rec->rank = my_rank;
/* implicit assumption here that none of these counters will change
* after the first time a file is opened. This may not always be
* true in the future */
if ( fs_info.fs_type != -1 )
{
rec->counters[LUSTRE_OSTS] = fs_info.ost_count;
rec->counters[LUSTRE_MDTS] = fs_info.mdt_count;
}
else
{
rec->counters[LUSTRE_OSTS] = -1;
rec->counters[LUSTRE_MDTS] = -1;
}
/* allocate a new lustre record and append it to the array */ if ( (lum = calloc(1, lumsize)) != NULL ) {
rec = &(lustre_runtime->record_array[lustre_runtime->record_array_ndx++]); lum->lmm_magic = LOV_USER_MAGIC;
rec->rec_id = rec_id; /* don't care about the return code for ioctl */
rec->rank = my_rank; ioctl( fd, LL_IOC_LOV_GETSTRIPE, (void *)lum );
rec->counters[LUSTRE_STRIPE_SIZE] = lum->lmm_stripe_size;
/* TODO: gather lustre data, store in record hash */ rec->counters[LUSTRE_STRIPE_WIDTH] = lum->lmm_stripe_count;
/* counters in lustre_ref->record->counters */ rec->counters[LUSTRE_STRIPE_OFFSET] = 0; /* this currently doesn't work; lum->lmm_objects[0].l_ost_idx isn't being populated */
rec->counters[LUSTRE_OSTS] = fs_info.ost_count; /* TODO: add explicit list of OSTs */
rec->counters[LUSTRE_MDTS] = fs_info.mdt_count; free(lum);
}
/* we must map darshan_lustre_record (or darshan_posix_file, or filename) to an fd */ else
rec->counters[LUSTRE_STRIPE_SIZE] = -1; {
rec->counters[LUSTRE_STRIPE_WIDTH] = -1; rec->counters[LUSTRE_STRIPE_SIZE] = -1;
rec->counters[LUSTRE_STRIPE_OFFSET] = -1; rec->counters[LUSTRE_STRIPE_WIDTH] = -1;
rec->counters[LUSTRE_STRIPE_OFFSET] = -1;
if ( (lum = calloc(1, lumsize)) != NULL ) { }
lum->lmm_magic = LOV_USER_MAGIC; HASH_ADD(hlink, lustre_runtime->record_hash, record->rec_id, sizeof(darshan_record_id), rec_rt);
/* don't care about the return code for ioctl */ lustre_runtime->record_array_ndx++;
ioctl( fd, LL_IOC_LOV_GETSTRIPE, (void *)lum );
rec->counters[LUSTRE_STRIPE_SIZE] = lum->lmm_stripe_size;
rec->counters[LUSTRE_STRIPE_WIDTH] = lum->lmm_stripe_count;
rec->counters[LUSTRE_STRIPE_OFFSET] = lum->lmm_stripe_offset;
/* todo: add explicit list of OSTs */
free(lum);
} }
LUSTRE_UNLOCK(); LUSTRE_UNLOCK();
...@@ -167,6 +195,16 @@ static void lustre_runtime_initialize() ...@@ -167,6 +195,16 @@ static void lustre_runtime_initialize()
memset(lustre_runtime->record_array, 0, lustre_runtime->record_array_size * memset(lustre_runtime->record_array, 0, lustre_runtime->record_array_size *
sizeof(struct darshan_lustre_record)); sizeof(struct darshan_lustre_record));
lustre_runtime->record_runtime_array = malloc(lustre_runtime->record_array_size *
sizeof(struct lustre_record_runtime));
if(!lustre_runtime->record_runtime_array)
{
lustre_runtime->record_array_size = 0;
return;
}
memset(lustre_runtime->record_runtime_array, 0, lustre_runtime->record_array_size *
sizeof(struct lustre_record_runtime));
return; return;
} }
...@@ -193,10 +231,10 @@ static void lustre_get_output_data( ...@@ -193,10 +231,10 @@ static void lustre_get_output_data(
void **lustre_buf, void **lustre_buf,
int *lustre_buf_sz) int *lustre_buf_sz)
{ {
struct hdf5_file_runtime *file; struct lustre_record_runtime *file;
int i; int i;
struct darshan_hdf5_file *red_send_buf = NULL; struct darshan_lustre_record *red_send_buf = NULL;
struct darshan_hdf5_file *red_recv_buf = NULL; struct darshan_lustre_record *red_recv_buf = NULL;
MPI_Datatype red_type; MPI_Datatype red_type;
MPI_Op red_op; MPI_Op red_op;
...@@ -211,68 +249,52 @@ static void lustre_get_output_data( ...@@ -211,68 +249,52 @@ static void lustre_get_output_data(
/* necessary initialization of shared records */ /* necessary initialization of shared records */
for(i = 0; i < shared_rec_count; i++) for(i = 0; i < shared_rec_count; i++)
{ {
HASH_FIND(hlink, lustre_runtime->file_hash, &shared_recs[i], HASH_FIND(hlink, lustre_runtime->record_hash, &shared_recs[i],
sizeof(darshan_record_id), file); sizeof(darshan_record_id), file);
assert(file); assert(file);
file->file_record->rank = -1; file->record->rank = -1;
} }
/*******************************************************************************
* resume editing here!
*
* TODO: determine lustre record shared across all processes,
* and have only rank 0 write these records out. No shared
* reductions should be necessary as the Lustre data for a
* given file should be the same on each process
******************************************************************************/
/* sort the array of files descending by rank so that we get all of the /* sort the array of files descending by rank so that we get all of the
* shared files (marked by rank -1) in a contiguous portion at end * shared files (marked by rank -1) in a contiguous portion at end
* of the array * of the array
*/ */
qsort(hdf5_runtime->file_record_array, hdf5_runtime->file_array_ndx, qsort(lustre_runtime->record_array, lustre_runtime->record_array_ndx,
sizeof(struct darshan_hdf5_file), hdf5_record_compare); sizeof(struct darshan_lustre_record), lustre_record_compare);
/* make *send_buf point to the shared files at the end of sorted array */ /* make *send_buf point to the shared files at the end of sorted array */
red_send_buf = red_send_buf =
&(hdf5_runtime->file_record_array[hdf5_runtime->file_array_ndx-shared_rec_count]); &(lustre_runtime->record_array[lustre_runtime->record_array_ndx-shared_rec_count]);
/* allocate memory for the reduction output on rank 0 */ /* allocate memory for the reduction output on rank 0 */
if(my_rank == 0) if(my_rank == 0)
{ {
red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_hdf5_file)); red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_lustre_record));
if(!red_recv_buf) if(!red_recv_buf)
{ {
return; return;
} }
} }
/* construct a datatype for a HDF5 file record. This is serving no purpose DARSHAN_MPI_CALL(PMPI_Type_contiguous)(sizeof(struct darshan_lustre_record),
* except to make sure we can do a reduction on proper boundaries
*/
DARSHAN_MPI_CALL(PMPI_Type_contiguous)(sizeof(struct darshan_hdf5_file),
MPI_BYTE, &red_type); MPI_BYTE, &red_type);
DARSHAN_MPI_CALL(PMPI_Type_commit)(&red_type); DARSHAN_MPI_CALL(PMPI_Type_commit)(&red_type);
DARSHAN_MPI_CALL(PMPI_Op_create)(lustre_record_reduction_op, 1, &red_op);
/* register a HDF5 file record reduction operator */
DARSHAN_MPI_CALL(PMPI_Op_create)(hdf5_record_reduction_op, 1, &red_op);
/* reduce shared HDF5 file records */
DARSHAN_MPI_CALL(PMPI_Reduce)(red_send_buf, red_recv_buf, DARSHAN_MPI_CALL(PMPI_Reduce)(red_send_buf, red_recv_buf,
shared_rec_count, red_type, red_op, 0, mod_comm); shared_rec_count, red_type, red_op, 0, mod_comm);
/* clean up reduction state */ /* clean up reduction state */
if(my_rank == 0) if(my_rank == 0)
{ {
int tmp_ndx = hdf5_runtime->file_array_ndx - shared_rec_count; int tmp_ndx = lustre_runtime->record_array_ndx - shared_rec_count;
memcpy(&(hdf5_runtime->file_record_array[tmp_ndx]), red_recv_buf, memcpy(&(lustre_runtime->record_array[tmp_ndx]), red_recv_buf,
shared_rec_count * sizeof(struct darshan_hdf5_file)); shared_rec_count * sizeof(struct darshan_lustre_record));
free(red_recv_buf); free(red_recv_buf);
} }
else else
{ {
hdf5_runtime->file_array_ndx -= shared_rec_count; lustre_runtime->record_array_ndx -= shared_rec_count;
} }
DARSHAN_MPI_CALL(PMPI_Type_free)(&red_type); DARSHAN_MPI_CALL(PMPI_Type_free)(&red_type);
...@@ -289,14 +311,68 @@ static void lustre_shutdown(void) ...@@ -289,14 +311,68 @@ static void lustre_shutdown(void)
{ {
assert(lustre_runtime); assert(lustre_runtime);
/* TODO: free data structures */ HASH_CLEAR(hlink, lustre_runtime->record_hash);
free(lustre_runtime->record_array); free(lustre_runtime->record_array);
free(lustre_runtime->record_runtime_array);
free(lustre_runtime); free(lustre_runtime);
lustre_runtime = NULL; lustre_runtime = NULL;
return; return;
} }
/* qsort comparator: orders darshan_lustre_record entries by descending
 * rank so that shared records (marked rank == -1) end up in one
 * contiguous run at the tail of the array */
static int lustre_record_compare(const void* a_p, const void* b_p)
{
    const struct darshan_lustre_record *rec_a = a_p;
    const struct darshan_lustre_record *rec_b = b_p;

    /* branchless descending compare: -1 when rec_a's rank is larger,
     * 1 when smaller, 0 when equal */
    return (rec_a->rank < rec_b->rank) - (rec_a->rank > rec_b->rank);
}
/* MPI reduction operator for shared Lustre file records, installed with
 * PMPI_Op_create in lustre_get_output_data.  Striping parameters for a
 * given file should be identical on every rank, so instead of combining
 * counters this op flattens them to a single copy: rank 0 keeps its own
 * values, every other rank forwards whatever is already in inoutfile.
 * NOTE(review): branching on my_rank inside a (commutative) reduction op
 * assumes rank 0's contribution survives the reduction tree unmodified --
 * worth confirming against the MPI standard's user-defined-op semantics. */
static void lustre_record_reduction_op(void* infile_v, void* inoutfile_v,
int *len, MPI_Datatype *datatype)
{
struct darshan_lustre_record tmp_record;
struct darshan_lustre_record *infile = infile_v;
struct darshan_lustre_record *inoutfile = inoutfile_v;
int i, j;

assert(lustre_runtime);

/* reduce each of the *len record pairs in the input buffers */
for( i=0; i<*len; i++ )
{
/* zero the scratch record so any padding/unset fields are clean */
memset(&tmp_record, 0, sizeof(struct darshan_lustre_record));
tmp_record.rec_id = infile->rec_id;
/* rank -1 marks the reduced record as shared by all processes */
tmp_record.rank = -1;

/* preserve only rank 0's value */
for( j = LUSTRE_OSTS; j < LUSTRE_NUM_INDICES; j++)
{
if ( my_rank == 0 )
{
tmp_record.counters[j] = infile->counters[j];
}
else
{
tmp_record.counters[j] = inoutfile->counters[j];
}
}

/* update pointers */
*inoutfile = tmp_record;
inoutfile++;
infile++;
}

return;
}
/* /*
* Local variables: * Local variables:
* c-indent-level: 4 * c-indent-level: 4
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment