Commit 08d83a03 authored by Glenn K. Lockwood's avatar Glenn K. Lockwood
Browse files

added filename hashing so multiple opens/closes don't create multiple...

added filename hashing so multiple opens/closes don't create multiple lustre-mod records; added proper reducer so that shared files are flattened into a single set of counters whose values are dictated by rank 0
parent 4b63144a
......@@ -26,6 +26,12 @@
#include "darshan.h"
#include "darshan-dynamic.h"
/* runtime wrapper that makes a darshan_lustre_record hashable: HASH_ADD
 * later in this file keys entries by record->rec_id so repeated open/close
 * of the same file reuses one record instead of appending duplicates */
struct lustre_record_runtime
{
struct darshan_lustre_record *record;
/* uthash handle; hash head is lustre_runtime->record_hash */
UT_hash_handle hlink;
};
/* we just use a simple array for storing records. the POSIX module
* only calls into the Lustre module for new records, so we will never
* have to search for an existing Lustre record (assuming the Lustre
......@@ -34,8 +40,10 @@
/* module-global state for the Lustre instrumentation module */
struct lustre_runtime
{
/* flat storage of the actual records that get written to the log */
struct darshan_lustre_record *record_array;
/* parallel array of runtime wrappers (one per record_array slot) */
struct lustre_record_runtime *record_runtime_array;
/* allocated capacity of both arrays */
int record_array_size;
/* next free slot / current record count */
int record_array_ndx;
/* uthash head for rec_id -> record lookup (entries live in
 * record_runtime_array; nothing is allocated per hash entry) */
struct lustre_record_runtime *record_hash;
};
/* single instance, lazily created by lustre_runtime_initialize();
 * guarded by LUSTRE_LOCK()/LUSTRE_UNLOCK() */
static struct lustre_runtime *lustre_runtime = NULL;
......@@ -49,62 +57,73 @@ static void lustre_begin_shutdown(void);
static void lustre_get_output_data(MPI_Comm mod_comm, darshan_record_id *shared_recs,
int shared_rec_count, void **lustre_buf, int *lustre_buf_sz);
static void lustre_shutdown(void);
static int lustre_record_compare(const void* a_p, const void* b_p);
static void lustre_record_reduction_op(void* infile_v, void* inoutfile_v,
int *len, MPI_Datatype *datatype);
#define LUSTRE_LOCK() pthread_mutex_lock(&lustre_runtime_mutex)
#define LUSTRE_UNLOCK() pthread_mutex_unlock(&lustre_runtime_mutex)
/* TODO: is there any way we can further compact Lustre data to save space?
* e.g., are all files in the same directory guaranteed same striping parameters?
* if so, can we store stripe parameters on per-directory basis and the OST
* list on a per-file basis? maybe the storage savings are small enough this isn't
* worth it, but nice to keep in mind
*/
/* Record Lustre striping parameters for a newly opened file.
 * filepath names the file (used to derive the darshan record id) and fd is
 * an open descriptor used for the LL_IOC_LOV_GETSTRIPE ioctl.
 *
 * NOTE(review): this text is a diff scrape with the +/- column stripped, so
 * some adjacent lines below are the before/after versions of the same
 * statement; confirm each pair against the real commit before editing. */
void darshan_instrument_lustre_file(const char* filepath, int fd)
{
struct darshan_lustre_record *rec;
struct lustre_record_runtime *rec_rt;
struct darshan_fs_info fs_info;
darshan_record_id rec_id;
/* NOTE(review): lum/lumsize are re-declared inside the hash-miss branch
 * below; these outer copies appear to be the pre-patch lines */
struct lov_user_md *lum;
size_t lumsize = sizeof(struct lov_user_md) +
LOV_MAX_STRIPE_COUNT * sizeof(struct lov_user_ost_data);
int limit_flag;
LUSTRE_LOCK();
/* make sure the lustre module is already initialized */
lustre_runtime_initialize();
/* if the array is full, we just back out */
/* NOTE(review): this early return exits while still holding LUSTRE_LOCK();
 * it appears to be the pre-patch code superseded by limit_flag just below */
if(lustre_runtime->record_array_ndx >= lustre_runtime->record_array_size)
return;
limit_flag = (lustre_runtime->record_array_ndx >= lustre_runtime->record_array_size);
/* register a Lustre file record with Darshan */
fs_info.fs_type = -1;
darshan_core_register_record(
(void *)filepath,
strlen(filepath),
DARSHAN_LUSTRE_MOD,
1,
0,
limit_flag,
&rec_id,
&fs_info);
/* if record id is 0, darshan has no more memory for instrumenting */
if(rec_id == 0)
{
LUSTRE_UNLOCK();
return;
}
/* search the hash table for this file record, and initialize if not found */
HASH_FIND(hlink, lustre_runtime->record_hash, &rec_id, sizeof(darshan_record_id), rec_rt );
if ( !rec_rt ) {
struct darshan_lustre_record *rec;
struct lov_user_md *lum;
size_t lumsize = sizeof(struct lov_user_md) +
LOV_MAX_STRIPE_COUNT * sizeof(struct lov_user_ost_data);
/* allocate a new lustre record and append it to the array */
/* NOTE(review): the next line post-increments record_array_ndx while the
 * end of this branch increments it again -- in the real code only one of
 * the two should survive (this one looks like the pre-patch line);
 * otherwise every record would consume two slots and the runtime/record
 * arrays would be indexed inconsistently. Confirm against the commit. */
rec = &(lustre_runtime->record_array[lustre_runtime->record_array_ndx++]);
rec_rt = &(lustre_runtime->record_runtime_array[lustre_runtime->record_array_ndx]);
rec_rt->record = &(lustre_runtime->record_array[lustre_runtime->record_array_ndx]);
rec = rec_rt->record;
rec->rec_id = rec_id;
rec->rank = my_rank;
/* TODO: gather lustre data, store in record hash */
/* counters in lustre_ref->record->counters */
/* implicit assumption here that none of these counters will change
 * after the first time a file is opened. This may not always be
 * true in the future */
if ( fs_info.fs_type != -1 )
{
rec->counters[LUSTRE_OSTS] = fs_info.ost_count;
rec->counters[LUSTRE_MDTS] = fs_info.mdt_count;
/* we must map darshan_lustre_record (or darshan_posix_file, or filename) to an fd */
rec->counters[LUSTRE_STRIPE_SIZE] = -1;
rec->counters[LUSTRE_STRIPE_WIDTH] = -1;
rec->counters[LUSTRE_STRIPE_OFFSET] = -1;
}
else
{
rec->counters[LUSTRE_OSTS] = -1;
rec->counters[LUSTRE_MDTS] = -1;
}
/* query striping info from Lustre via ioctl on the open descriptor */
if ( (lum = calloc(1, lumsize)) != NULL ) {
lum->lmm_magic = LOV_USER_MAGIC;
......@@ -112,10 +131,19 @@ void darshan_instrument_lustre_file(const char* filepath, int fd)
ioctl( fd, LL_IOC_LOV_GETSTRIPE, (void *)lum );
rec->counters[LUSTRE_STRIPE_SIZE] = lum->lmm_stripe_size;
rec->counters[LUSTRE_STRIPE_WIDTH] = lum->lmm_stripe_count;
/* NOTE(review): the next two OFFSET assignments are the old/new diff
 * lines for the same statement; the second (set to 0) is the new one
 * per its inline comment */
rec->counters[LUSTRE_STRIPE_OFFSET] = lum->lmm_stripe_offset;
/* todo: add explicit list of OSTs */
rec->counters[LUSTRE_STRIPE_OFFSET] = 0; /* this currently doesn't work; lum->lmm_objects[0].l_ost_idx isn't being populated */
/* TODO: add explicit list of OSTs */
free(lum);
}
else
{
/* calloc failed: mark striping counters as unknown */
rec->counters[LUSTRE_STRIPE_SIZE] = -1;
rec->counters[LUSTRE_STRIPE_WIDTH] = -1;
rec->counters[LUSTRE_STRIPE_OFFSET] = -1;
}
/* index the new record by rec_id so later opens of the same file hit
 * the HASH_FIND above instead of creating a duplicate record */
HASH_ADD(hlink, lustre_runtime->record_hash, record->rec_id, sizeof(darshan_record_id), rec_rt);
lustre_runtime->record_array_ndx++;
}
LUSTRE_UNLOCK();
return;
......@@ -167,6 +195,16 @@ static void lustre_runtime_initialize()
/* (tail of lustre_runtime_initialize; the start is outside this hunk) */
memset(lustre_runtime->record_array, 0, lustre_runtime->record_array_size *
sizeof(struct darshan_lustre_record));
/* allocate the runtime wrappers in lockstep with record_array */
lustre_runtime->record_runtime_array = malloc(lustre_runtime->record_array_size *
sizeof(struct lustre_record_runtime));
if(!lustre_runtime->record_runtime_array)
{
/* NOTE(review): on this failure path record_array (allocated above) is
 * left allocated but unusable since the size is zeroed -- consider
 * freeing it here; confirm intent against the full function */
lustre_runtime->record_array_size = 0;
return;
}
memset(lustre_runtime->record_runtime_array, 0, lustre_runtime->record_array_size *
sizeof(struct lustre_record_runtime));
return;
}
......@@ -193,10 +231,10 @@ static void lustre_get_output_data(
void **lustre_buf,
int *lustre_buf_sz)
{
/* NOTE(review): throughout this function the scrape shows old (hdf5_*) and
 * new (lustre_*) versions of the same line back to back; the lustre_* line
 * of each pair is the post-patch code. Confirm against the real commit. */
struct hdf5_file_runtime *file;
struct lustre_record_runtime *file;
int i;
struct darshan_hdf5_file *red_send_buf = NULL;
struct darshan_hdf5_file *red_recv_buf = NULL;
struct darshan_lustre_record *red_send_buf = NULL;
struct darshan_lustre_record *red_recv_buf = NULL;
MPI_Datatype red_type;
MPI_Op red_op;
......@@ -211,68 +249,52 @@ static void lustre_get_output_data(
/* necessary initialization of shared records */
/* mark every globally-shared record with rank -1 so the sort below pushes
 * them to the end of the array */
for(i = 0; i < shared_rec_count; i++)
{
HASH_FIND(hlink, lustre_runtime->file_hash, &shared_recs[i],
HASH_FIND(hlink, lustre_runtime->record_hash, &shared_recs[i],
sizeof(darshan_record_id), file);
assert(file);
file->file_record->rank = -1;
file->record->rank = -1;
}
/*******************************************************************************
 * resume editing here!
 *
 * TODO: determine lustre record shared across all processes,
 * and have only rank 0 write these records out. No shared
 * reductions should be necessary as the Lustre data for a
 * given file should be the same on each process
 ******************************************************************************/
/* sort the array of files descending by rank so that we get all of the
 * shared files (marked by rank -1) in a contiguous portion at end
 * of the array
 */
qsort(hdf5_runtime->file_record_array, hdf5_runtime->file_array_ndx,
sizeof(struct darshan_hdf5_file), hdf5_record_compare);
qsort(lustre_runtime->record_array, lustre_runtime->record_array_ndx,
sizeof(struct darshan_lustre_record), lustre_record_compare);
/* make *send_buf point to the shared files at the end of sorted array */
red_send_buf =
&(hdf5_runtime->file_record_array[hdf5_runtime->file_array_ndx-shared_rec_count]);
&(lustre_runtime->record_array[lustre_runtime->record_array_ndx-shared_rec_count]);
/* allocate memory for the reduction output on rank 0 */
if(my_rank == 0)
{
red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_hdf5_file));
red_recv_buf = malloc(shared_rec_count * sizeof(struct darshan_lustre_record));
if(!red_recv_buf)
{
/* NOTE(review): returning here leaves *lustre_buf/*lustre_buf_sz
 * unset on rank 0 -- verify the caller tolerates that */
return;
}
}
/* construct a datatype for a HDF5 file record. This is serving no purpose
 * except to make sure we can do a reduction on proper boundaries
 */
DARSHAN_MPI_CALL(PMPI_Type_contiguous)(sizeof(struct darshan_hdf5_file),
DARSHAN_MPI_CALL(PMPI_Type_contiguous)(sizeof(struct darshan_lustre_record),
MPI_BYTE, &red_type);
DARSHAN_MPI_CALL(PMPI_Type_commit)(&red_type);
/* register a HDF5 file record reduction operator */
DARSHAN_MPI_CALL(PMPI_Op_create)(hdf5_record_reduction_op, 1, &red_op);
/* reduce shared HDF5 file records */
DARSHAN_MPI_CALL(PMPI_Op_create)(lustre_record_reduction_op, 1, &red_op);
/* flatten each shared record down to rank 0's copy of its counters */
DARSHAN_MPI_CALL(PMPI_Reduce)(red_send_buf, red_recv_buf,
shared_rec_count, red_type, red_op, 0, mod_comm);
/* clean up reduction state */
if(my_rank == 0)
{
/* rank 0: overwrite its local shared records with the reduced copies */
int tmp_ndx = hdf5_runtime->file_array_ndx - shared_rec_count;
memcpy(&(hdf5_runtime->file_record_array[tmp_ndx]), red_recv_buf,
shared_rec_count * sizeof(struct darshan_hdf5_file));
int tmp_ndx = lustre_runtime->record_array_ndx - shared_rec_count;
memcpy(&(lustre_runtime->record_array[tmp_ndx]), red_recv_buf,
shared_rec_count * sizeof(struct darshan_lustre_record));
free(red_recv_buf);
}
else
{
/* non-zero ranks: drop their copies of the shared records entirely */
hdf5_runtime->file_array_ndx -= shared_rec_count;
lustre_runtime->record_array_ndx -= shared_rec_count;
}
DARSHAN_MPI_CALL(PMPI_Type_free)(&red_type);
......@@ -289,14 +311,68 @@ static void lustre_shutdown(void)
{
assert(lustre_runtime);
/* tear down all module state: empty the hash (entries are not
 * individually allocated, so clearing the table is sufficient),
 * then free both arrays and the runtime struct itself */
HASH_CLEAR(hlink, lustre_runtime->record_hash);
free(lustre_runtime->record_array);
free(lustre_runtime->record_runtime_array);
free(lustre_runtime);
/* reset so a later lustre_runtime_initialize() starts fresh */
lustre_runtime = NULL;
return;
}
/* qsort comparator ordering lustre records by rank, descending, so that
 * shared records (rank == -1) end up contiguous at the tail of the array */
static int lustre_record_compare(const void* a_p, const void* b_p)
{
    const struct darshan_lustre_record *rec_a = a_p;
    const struct darshan_lustre_record *rec_b = b_p;

    /* descending: smaller rank sorts later; yields 1, -1, or 0 */
    return (rec_a->rank < rec_b->rank) - (rec_a->rank > rec_b->rank);
}
/* user-defined MPI reduction for shared lustre records: since the striping
 * data for a given file should be identical on every process, the "merge"
 * simply keeps rank 0's counter values and marks the record shared (rank -1) */
static void lustre_record_reduction_op(void* infile_v, void* inoutfile_v,
    int *len, MPI_Datatype *datatype)
{
    struct darshan_lustre_record *in_rec = infile_v;
    struct darshan_lustre_record *inout_rec = inoutfile_v;
    int r, c;

    assert(lustre_runtime);

    for (r = 0; r < *len; r++, in_rec++, inout_rec++)
    {
        struct darshan_lustre_record merged;

        memset(&merged, 0, sizeof(struct darshan_lustre_record));
        merged.rec_id = in_rec->rec_id;
        merged.rank = -1;     /* -1 flags a globally shared record */

        /* keep only rank 0's counter values */
        for (c = LUSTRE_OSTS; c < LUSTRE_NUM_INDICES; c++)
        {
            merged.counters[c] = (my_rank == 0) ?
                in_rec->counters[c] : inout_rec->counters[c];
        }

        /* write the merged result back into the in/out operand */
        *inout_rec = merged;
    }

    return;
}
/*
* Local variables:
* c-indent-level: 4
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment