Commit 7496c279 authored by Shane Snyder's avatar Shane Snyder
Browse files

More updates to facilitate runtime core shutdown

parent 37d68c1e
...@@ -22,22 +22,21 @@ ...@@ -22,22 +22,21 @@
#define DARSHAN_MOD_NAME_LEN 31 #define DARSHAN_MOD_NAME_LEN 31
/* flags to indicate properties of file records */ /* flags to indicate properties of file records */
#define CP_FLAG_CONDENSED 1<<0
#define CP_FLAG_NOTIMING 1<<1 #define CP_FLAG_NOTIMING 1<<1
struct darshan_core_module struct darshan_core_module
{ {
darshan_module_id id;
char name[DARSHAN_MOD_NAME_LEN+1]; char name[DARSHAN_MOD_NAME_LEN+1];
struct darshan_module_funcs mod_funcs; struct darshan_module_funcs mod_funcs;
struct darshan_core_module *next;
}; };
/* in memory structure to keep up with job level data */ /* in memory structure to keep up with job level data */
struct darshan_core_job_runtime struct darshan_core_job_runtime
{ {
struct darshan_job log_job; struct darshan_job log_job;
struct darshan_core_module* mod_array[DARSHAN_MAX_MODS];
char exe[CP_EXE_LEN+1]; char exe[CP_EXE_LEN+1];
struct darshan_core_module *mod_list_head;
char comp_buf[CP_COMP_BUF_SIZE]; char comp_buf[CP_COMP_BUF_SIZE];
int flags; int flags;
double wtime_offset; double wtime_offset;
......
...@@ -25,9 +25,25 @@ ...@@ -25,9 +25,25 @@
/* Environment variable to override __CP_MEM_ALIGNMENT */ /* Environment variable to override __CP_MEM_ALIGNMENT */
#define CP_MEM_ALIGNMENT_OVERRIDE "DARSHAN_MEMALIGN" #define CP_MEM_ALIGNMENT_OVERRIDE "DARSHAN_MEMALIGN"
/* TODO these go where ? */ /* TODO where does each of the following macros make the most sense ? */
#define DARSHAN_MPI_CALL(func) func #define DARSHAN_MPI_CALL(func) func
/* max length of module name string (not counting \0) */
#define DARSHAN_MOD_NAME_LEN 31
/* unique identifiers to distinguish between available darshan modules */
/* NOTES: - valid ids range from [0...DARSHAN_MAX_MODS-1]
* - order of ids controls module shutdown order (first module shuts down first)
*/
#define DARSHAN_MAX_MODS 16
/* Identifier for each available darshan module. The numeric values are
 * written out explicitly and are the same ones the compiler would assign
 * implicitly (counting up from 0 in declaration order).
 */
typedef enum
{
    DARSHAN_POSIX_MOD   = 0,
    DARSHAN_MPIIO_MOD   = 1,
    DARSHAN_HDF5_MOD    = 2,
    DARSHAN_PNETCDF_MOD = 3,
} darshan_module_id;
typedef uint64_t darshan_file_id; typedef uint64_t darshan_file_id;
struct darshan_module_funcs struct darshan_module_funcs
...@@ -41,6 +57,7 @@ struct darshan_module_funcs ...@@ -41,6 +57,7 @@ struct darshan_module_funcs
*********************************************/ *********************************************/
void darshan_core_register_module( void darshan_core_register_module(
darshan_module_id id,
char *name, char *name,
struct darshan_module_funcs *funcs, struct darshan_module_funcs *funcs,
int *runtime_mem_limit); int *runtime_mem_limit);
......
...@@ -19,11 +19,9 @@ ...@@ -19,11 +19,9 @@
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/vfs.h> #include <sys/vfs.h>
#include <mpi.h> #include <mpi.h>
#include "darshan-core.h" #include "darshan-core.h"
#include "utlist.h"
/* TODO is __progname_full needed here */ /* TODO is __progname_full needed here */
extern char* __progname; extern char* __progname;
...@@ -36,19 +34,11 @@ static int my_rank = -1; ...@@ -36,19 +34,11 @@ static int my_rank = -1;
static void darshan_core_initialize(int *argc, char ***argv); static void darshan_core_initialize(int *argc, char ***argv);
static void darshan_core_shutdown(void); static void darshan_core_shutdown(void);
static void darshan_core_cleanup(struct darshan_core_job_runtime* job); static void darshan_core_cleanup(struct darshan_core_job_runtime* job);
static void darshan_get_logfile_name(char* logfile_name, int jobid, struct tm* start_tm);
#define DARSHAN_LOCK() pthread_mutex_lock(&darshan_mutex) #define DARSHAN_LOCK() pthread_mutex_lock(&darshan_mutex)
#define DARSHAN_UNLOCK() pthread_mutex_unlock(&darshan_mutex) #define DARSHAN_UNLOCK() pthread_mutex_unlock(&darshan_mutex)
#define DARSHAN_MOD_REGISTER(__mod, __job) \
LL_PREPEND(__job->mod_list_head, __mod)
#define DARSHAN_MOD_SEARCH(__mod, __tmp, __job) \
LL_SEARCH(__job->mod_list_head, __mod, __tmp, mod_cmp)
#define DARSHAN_MOD_ITER(__mod, __tmp, __job) \
LL_FOREACH_SAFE(__job->mod_list_head, __mod, __tmp)
#define DARSHAN_MOD_DELETE(__mod, __job) \
LL_DELETE(__job->mod_list_head, __mod)
/* intercept MPI initialize and finalize to manage darshan core runtime */ /* intercept MPI initialize and finalize to manage darshan core runtime */
int MPI_Init(int *argc, char ***argv) int MPI_Init(int *argc, char ***argv)
{ {
...@@ -186,23 +176,17 @@ static void darshan_core_shutdown() ...@@ -186,23 +176,17 @@ static void darshan_core_shutdown()
struct darshan_core_job_runtime* final_job; struct darshan_core_job_runtime* final_job;
struct darshan_core_module *mod, *tmp; struct darshan_core_module *mod, *tmp;
int internal_timing_flag = 0; int internal_timing_flag = 0;
int jobid;
char* jobid_str;
char* envjobid; char* envjobid;
char* logpath; char* jobid_str;
int jobid;
struct tm* start_tm;
time_t start_time_tmp;
int ret; int ret;
int local_ret = 0;
int all_ret = 0;
uint64_t hlevel;
char hname[HOST_NAME_MAX];
uint64_t logmod;
char* logpath_override = NULL;
#ifdef __CP_LOG_ENV
char env_check[256];
char* env_tok;
#endif
int64_t first_start_time; int64_t first_start_time;
int64_t last_end_time; int64_t last_end_time;
int local_mod_use[DARSHAN_MAX_MODS] = {0};
int global_mod_use_count[DARSHAN_MAX_MODS] = {0};
int i;
if(getenv("DARSHAN_INTERNAL_TIMING")) if(getenv("DARSHAN_INTERNAL_TIMING"))
internal_timing_flag = 1; internal_timing_flag = 1;
...@@ -227,30 +211,16 @@ static void darshan_core_shutdown() ...@@ -227,30 +211,16 @@ static void darshan_core_shutdown()
return; return;
} }
/* construct log file name */ /* set jobid and logfile name on rank 0 */
if(my_rank == 0) if(my_rank == 0)
{ {
char cuser[L_cuserid] = {0};
struct tm* my_tm;
time_t start_time_tmp;
/* Use CP_JOBID_OVERRIDE for the env var or CP_JOBID */ /* Use CP_JOBID_OVERRIDE for the env var or CP_JOBID */
envjobid = getenv(CP_JOBID_OVERRIDE); envjobid = getenv(CP_JOBID_OVERRIDE);
if (!envjobid) if(!envjobid)
{ {
envjobid = CP_JOBID; envjobid = CP_JOBID;
} }
/* Use CP_LOG_PATH_OVERRIDE for the value or __CP_LOG_PATH */
logpath = getenv(CP_LOG_PATH_OVERRIDE);
if (!logpath)
{
#ifdef __CP_LOG_PATH
logpath = __CP_LOG_PATH;
#endif
}
/* find a job id */
jobid_str = getenv(envjobid); jobid_str = getenv(envjobid);
if(jobid_str) if(jobid_str)
{ {
...@@ -263,121 +233,15 @@ static void darshan_core_shutdown() ...@@ -263,121 +233,15 @@ static void darshan_core_shutdown()
jobid = getpid(); jobid = getpid();
} }
/* break out time into something human readable */ /* add to darshan core job */
start_time_tmp = final_job->log_job.start_time; final_job->log_job.jobid = (int64_t)jobid;
my_tm = localtime(&start_time_tmp);
/* get the username for this job. In order we will try each of the
* following until one of them succeeds:
*
* - cuserid()
* - getenv("LOGNAME")
* - snprintf(..., geteuid());
*
* Note that we do not use getpwuid() because it generally will not
* work in statically compiled binaries.
*/
#ifndef DARSHAN_DISABLE_CUSERID
cuserid(cuser);
#endif
/* if cuserid() didn't work, then check the environment */
if (strcmp(cuser, "") == 0)
{
char* logname_string;
logname_string = getenv("LOGNAME");
if(logname_string)
{
strncpy(cuser, logname_string, (L_cuserid-1));
}
}
/* if cuserid() and environment both fail, then fall back to uid */
if (strcmp(cuser, "") == 0)
{
uid_t uid = geteuid();
snprintf(cuser, sizeof(cuser), "%u", uid);
}
/* generate a random number to help differentiate the log */
hlevel=DARSHAN_MPI_CALL(PMPI_Wtime)() * 1000000;
(void) gethostname(hname, sizeof(hname));
logmod = darshan_hash((void*)hname,strlen(hname),hlevel);
/* see if darshan was configured using the --with-logpath-by-env
* argument, which allows the user to specify an absolute path to
* place logs via an env variable.
*/
#ifdef __CP_LOG_ENV
/* just silently skip if the environment variable list is too big */
if(strlen(__CP_LOG_ENV) < 256)
{
/* copy env variable list to a temporary buffer */
strcpy(env_check, __CP_LOG_ENV);
/* tokenize the comma-separated list */
env_tok = strtok(env_check, ",");
if(env_tok)
{
do
{
/* check each env variable in order */
logpath_override = getenv(env_tok);
if(logpath_override)
{
/* stop as soon as we find a match */
break;
}
}while((env_tok = strtok(NULL, ",")));
}
}
#endif
if(logpath_override) /* use human readable start time format in log filename */
{ start_time_tmp = final_job->log_job.start_time;
ret = snprintf(logfile_name, PATH_MAX, start_tm = localtime(&start_time_tmp);
"%s/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
logpath_override,
cuser, __progname, jobid,
(my_tm->tm_mon+1),
my_tm->tm_mday,
(my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
logmod);
if(ret == (PATH_MAX-1))
{
/* file name was too big; squish it down */
snprintf(logfile_name, PATH_MAX,
"%s/id%d.darshan_partial",
logpath_override, jobid);
}
}
else if(logpath)
{
ret = snprintf(logfile_name, PATH_MAX,
"%s/%d/%d/%d/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
logpath, (my_tm->tm_year+1900),
(my_tm->tm_mon+1), my_tm->tm_mday,
cuser, __progname, jobid,
(my_tm->tm_mon+1),
my_tm->tm_mday,
(my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
logmod);
if(ret == (PATH_MAX-1))
{
/* file name was too big; squish it down */
snprintf(logfile_name, PATH_MAX,
"%s/id%d.darshan_partial",
logpath, jobid);
}
}
else
{
logfile_name[0] = '\0';
}
/* add jobid */ /* construct log file name */
final_job->log_job.jobid = (int64_t)jobid; darshan_get_logfile_name(logfile_name, jobid, start_tm);
} }
/* broadcast log file name */ /* broadcast log file name */
...@@ -404,8 +268,17 @@ static void darshan_core_shutdown() ...@@ -404,8 +268,17 @@ static void darshan_core_shutdown()
final_job->log_job.end_time = last_end_time; final_job->log_job.end_time = last_end_time;
} }
/* TODO: coordinate shutdown across all registered modules */ /* set which local modules were actually used */
DARSHAN_MOD_ITER(mod, tmp, final_job) for(i = 0; i < DARSHAN_MAX_MODS; i++)
{
if(final_job->mod_array[i])
local_mod_use[i] = 1;
}
/* reduce the number of times a module was opened globally and bcast to everyone */
DARSHAN_MPI_CALL(PMPI_Allreduce)(local_mod_use, global_mod_use_count, DARSHAN_MAX_MODS, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
for(i = 0; i < DARSHAN_MAX_MODS; i++)
{ {
} }
...@@ -413,7 +286,7 @@ static void darshan_core_shutdown() ...@@ -413,7 +286,7 @@ static void darshan_core_shutdown()
free(logfile_name); free(logfile_name);
darshan_core_cleanup(final_job); darshan_core_cleanup(final_job);
if (internal_timing_flag) if(internal_timing_flag)
{ {
/* TODO: what do we want to time in new darshan version? */ /* TODO: what do we want to time in new darshan version? */
} }
...@@ -423,12 +296,11 @@ static void darshan_core_shutdown() ...@@ -423,12 +296,11 @@ static void darshan_core_shutdown()
static void darshan_core_cleanup(struct darshan_core_job_runtime* job) static void darshan_core_cleanup(struct darshan_core_job_runtime* job)
{ {
struct darshan_core_module *mod, *tmp; int i;
DARSHAN_MOD_ITER(mod, tmp, job) for(i = 0; i < DARSHAN_MAX_MODS; i++)
{ {
DARSHAN_MOD_DELETE(mod, job);
free(mod);
} }
free(job); free(job);
...@@ -436,44 +308,170 @@ static void darshan_core_cleanup(struct darshan_core_job_runtime* job) ...@@ -436,44 +308,170 @@ static void darshan_core_cleanup(struct darshan_core_job_runtime* job)
return; return;
} }
static int mod_cmp(struct darshan_core_module* a, struct darshan_core_module* b) static void darshan_get_logfile_name(char* logfile_name, int jobid, struct tm* start_tm)
{ {
return strcmp(a->name, b->name); char* logpath;
char* logname_string;
char* logpath_override = NULL;
#ifdef __CP_LOG_ENV
char env_check[256];
char* env_tok;
#endif
uint64_t hlevel;
char hname[HOST_NAME_MAX];
uint64_t logmod;
char cuser[L_cuserid] = {0};
int ret;
/* Use CP_LOG_PATH_OVERRIDE for the value or __CP_LOG_PATH */
logpath = getenv(CP_LOG_PATH_OVERRIDE);
if(!logpath)
{
#ifdef __CP_LOG_PATH
logpath = __CP_LOG_PATH;
#endif
}
/* get the username for this job. In order we will try each of the
* following until one of them succeeds:
*
* - cuserid()
* - getenv("LOGNAME")
* - snprintf(..., geteuid());
*
* Note that we do not use getpwuid() because it generally will not
* work in statically compiled binaries.
*/
#ifndef DARSHAN_DISABLE_CUSERID
cuserid(cuser);
#endif
/* if cuserid() didn't work, then check the environment */
if(strcmp(cuser, "") == 0)
{
logname_string = getenv("LOGNAME");
if(logname_string)
{
strncpy(cuser, logname_string, (L_cuserid-1));
}
}
/* if cuserid() and environment both fail, then fall back to uid */
if(strcmp(cuser, "") == 0)
{
uid_t uid = geteuid();
snprintf(cuser, sizeof(cuser), "%u", uid);
}
/* generate a random number to help differentiate the log */
hlevel=DARSHAN_MPI_CALL(PMPI_Wtime)() * 1000000;
(void)gethostname(hname, sizeof(hname));
logmod = darshan_hash((void*)hname,strlen(hname),hlevel);
/* see if darshan was configured using the --with-logpath-by-env
* argument, which allows the user to specify an absolute path to
* place logs via an env variable.
*/
#ifdef __CP_LOG_ENV
/* just silently skip if the environment variable list is too big */
if(strlen(__CP_LOG_ENV) < 256)
{
/* copy env variable list to a temporary buffer */
strcpy(env_check, __CP_LOG_ENV);
/* tokenize the comma-separated list */
env_tok = strtok(env_check, ",");
if(env_tok)
{
do
{
/* check each env variable in order */
logpath_override = getenv(env_tok);
if(logpath_override)
{
/* stop as soon as we find a match */
break;
}
}while((env_tok = strtok(NULL, ",")));
}
}
#endif
if(logpath_override)
{
ret = snprintf(logfile_name, PATH_MAX,
"%s/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
logpath_override,
cuser, __progname, jobid,
(start_tm->tm_mon+1),
start_tm->tm_mday,
(start_tm->tm_hour*60*60 + start_tm->tm_min*60 + start_tm->tm_sec),
logmod);
if(ret == (PATH_MAX-1))
{
/* file name was too big; squish it down */
snprintf(logfile_name, PATH_MAX,
"%s/id%d.darshan_partial",
logpath_override, jobid);
}
}
else if(logpath)
{
ret = snprintf(logfile_name, PATH_MAX,
"%s/%d/%d/%d/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
logpath, (start_tm->tm_year+1900),
(start_tm->tm_mon+1), start_tm->tm_mday,
cuser, __progname, jobid,
(start_tm->tm_mon+1),
start_tm->tm_mday,
(start_tm->tm_hour*60*60 + start_tm->tm_min*60 + start_tm->tm_sec),
logmod);
if(ret == (PATH_MAX-1))
{
/* file name was too big; squish it down */
snprintf(logfile_name, PATH_MAX,
"%s/id%d.darshan_partial",
logpath, jobid);
}
}
else
{
logfile_name[0] = '\0';
}
return;
} }
/* ********************************************************* */ /* ********************************************************* */
void darshan_core_register_module( void darshan_core_register_module(
darshan_module_id id,
char *name, char *name,
struct darshan_module_funcs *funcs, struct darshan_module_funcs *funcs,
int *runtime_mem_limit) int *runtime_mem_limit)
{ {
struct darshan_core_module tmp;
struct darshan_core_module* mod; struct darshan_core_module* mod;
DARSHAN_LOCK(); DARSHAN_LOCK();
*runtime_mem_limit = 0; *runtime_mem_limit = 0;
if(!darshan_core_job) if(!darshan_core_job || (id >= DARSHAN_MAX_MODS))
{ {
DARSHAN_UNLOCK(); DARSHAN_UNLOCK();
return; return;
} }
/* see if this module is already registered */ /* see if this module is already registered */
strncpy(tmp.name, name, DARSHAN_MOD_NAME_LEN); if(darshan_core_job->mod_array[id])
DARSHAN_MOD_SEARCH(mod, &tmp, darshan_core_job);
if(mod)
{ {
/* if module is already registered, update module_funcs and return */ /* if module is already registered just return */
/* NOTE: we do not recalculate memory limit here, just set to 0 */ /* NOTE: we do not recalculate memory limit here, just set to 0 */
mod->mod_funcs = *funcs;
DARSHAN_UNLOCK(); DARSHAN_UNLOCK();
return; return;
} }
/* this module has not been registered yet, allocate and register it */ /* this module has not been registered yet, allocate and initialize it */
mod = malloc(sizeof(*mod)); mod = malloc(sizeof(*mod));
if(!mod) if(!mod)
{ {
...@@ -482,9 +480,12 @@ void darshan_core_register_module( ...@@ -482,9 +480,12 @@ void darshan_core_register_module(
} }
memset(mod, 0, sizeof(*mod)); memset(mod, 0, sizeof(*mod));
mod->id = id;
strncpy(mod->name, name, DARSHAN_MOD_NAME_LEN); strncpy(mod->name, name, DARSHAN_MOD_NAME_LEN);
mod->mod_funcs = *funcs; mod->mod_funcs = *funcs;