Commit 44a058c2 authored by Shane Snyder's avatar Shane Snyder

Initial darshan-core shutdown logic

parent c88a970e
......@@ -44,11 +44,4 @@ struct darshan_job
char metadata[DARSHAN_JOB_METADATA_LEN];
};
/* CP_FILE_PARTIAL(__file) evaluates to 1 when the record's POSIX open,
 * fopen and stat counters are all zero, and to 0 otherwise.  It identifies
 * files that were opened through pnetcdf, hdf5, or mpi-io but never at the
 * posix level; such a record will not necessarily have all of the expected
 * fields populated.
 */
#define CP_FILE_PARTIAL(__file) \
    (!((__file)->counters[CP_POSIX_OPENS] || \
       (__file)->counters[CP_POSIX_FOPENS] || \
       (__file)->counters[CP_POSIX_STATS]))
#endif /* __DARSHAN_LOG_FORMAT_H */
......@@ -10,8 +10,21 @@
#include <sys/types.h>
#include <stdint.h>
#include <mpi.h>
#include "darshan-log-format.h"
/* Environment variable to override CP_JOBID, i.e. to choose a different
 * environment variable as the source of the job id */
#define CP_JOBID_OVERRIDE "DARSHAN_JOBID"
/* Environment variable to override __CP_LOG_PATH, the configured log
 * directory */
#define CP_LOG_PATH_OVERRIDE "DARSHAN_LOGPATH"
/* Environment variable to override the configured log hints (this is the
 * hints setting, not the log path; see CP_LOG_PATH_OVERRIDE for the path) */
#define CP_LOG_HINTS_OVERRIDE "DARSHAN_LOGHINTS"
/* Environment variable to override __CP_MEM_ALIGNMENT */
#define CP_MEM_ALIGNMENT_OVERRIDE "DARSHAN_MEMALIGN"
/* TODO these go where ? */
/* Wrapper around MPI function names; currently expands to the name
 * unchanged */
#define DARSHAN_MPI_CALL(func) func
......@@ -36,8 +49,4 @@ void darshan_core_lookup_id(
double darshan_core_wtime(void);
char* darshan_clean_file_path(const char* path);
double darshan_wtime(void);
#endif /* __DARSHAN_H */
......@@ -3,18 +3,22 @@
* See COPYRIGHT in top-level directory.
*/
#define _XOPEN_SOURCE 500
#include "darshan-runtime-config.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#ifdef HAVE_MNTENT_H
#include <mntent.h>
#endif
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/vfs.h>
#include <pthread.h>
#include <mpi.h>
......@@ -25,6 +29,7 @@ extern char* __progname_full;
static void darshan_core_initialize(int *argc, char ***argv);
static void darshan_core_shutdown(void);
static void darshan_core_cleanup(struct darshan_core_job_runtime* job);
/* internal variables */
static struct darshan_core_job_runtime *darshan_core_job = NULL;
......@@ -33,14 +38,14 @@ static pthread_mutex_t darshan_mutex = PTHREAD_MUTEX_INITIALIZER;
#define DARSHAN_LOCK() pthread_mutex_lock(&darshan_mutex)
#define DARSHAN_UNLOCK() pthread_mutex_unlock(&darshan_mutex)
/* Module list helpers.  Each macro operates on the module list rooted at
 * (__job)->mod_list_head, so the same helpers work both on the global job
 * record and on a job record that has been detached from it.  They are thin
 * wrappers around the LL_* linked list macros.
 *
 * Only the three-argument (two for DELETE/REGISTER) job-aware forms are
 * defined; the earlier forms that hard-coded darshan_core_job are gone
 * because a macro may not be redefined with a different parameter list.
 */

/* add module __mod to the front of __job's module list */
#define DARSHAN_MOD_REGISTER(__mod, __job) \
    LL_PREPEND((__job)->mod_list_head, __mod)

/* look up the module matching __tmp (compared with mod_cmp) in __job's
 * module list; the result is stored in __mod
 */
#define DARSHAN_MOD_SEARCH(__mod, __tmp, __job) \
    LL_SEARCH((__job)->mod_list_head, __mod, __tmp, mod_cmp)

/* loop header iterating over __job's module list; __tmp is scratch storage
 * for the list macro, and the macro must be followed by the loop body
 */
#define DARSHAN_MOD_ITER(__mod, __tmp, __job) \
    LL_FOREACH_SAFE((__job)->mod_list_head, __mod, __tmp)

/* unlink module __mod from __job's module list (does not free it) */
#define DARSHAN_MOD_DELETE(__mod, __job) \
    LL_DELETE((__job)->mod_list_head, __mod)
/* intercept MPI initialize and finalize to initialize darshan */
int MPI_Init(int *argc, char ***argv)
......@@ -176,22 +181,242 @@ static void darshan_core_initialize(int *argc, char ***argv)
static void darshan_core_shutdown()
{
int rank;
char *logfile_name;
struct darshan_core_job_runtime* final_job;
struct darshan_core_module *mod, *tmp;
int internal_timing_flag = 0;
int jobid;
char* jobid_str;
char* envjobid;
char* logpath;
int ret;
uint64_t hlevel;
char hname[HOST_NAME_MAX];
uint64_t logmod;
char* logpath_override = NULL;
#ifdef __CP_LOG_ENV
char env_check[256];
char* env_tok;
#endif
if(getenv("DARSHAN_INTERNAL_TIMING"))
internal_timing_flag = 1;
DARSHAN_LOCK();
if(!darshan_core_job)
{
DARSHAN_UNLOCK();
return;
}
/* disable further tracing while hanging onto the data so that we can
* write it out
*/
final_job = darshan_core_job;
darshan_core_job = NULL;
DARSHAN_UNLOCK();
logfile_name = malloc(PATH_MAX);
if(!logfile_name)
{
darshan_core_cleanup(final_job);
return;
}
DARSHAN_MPI_CALL(PMPI_Comm_rank)(MPI_COMM_WORLD, &rank);
/* construct log file name */
if(rank == 0)
{
char cuser[L_cuserid] = {0};
struct tm* my_tm;
time_t start_time_tmp;
/* Use CP_JOBID_OVERRIDE for the env var or CP_JOBID */
envjobid = getenv(CP_JOBID_OVERRIDE);
if (!envjobid)
{
envjobid = CP_JOBID;
}
/* Use CP_LOG_PATH_OVERRIDE for the value or __CP_LOG_PATH */
logpath = getenv(CP_LOG_PATH_OVERRIDE);
if (!logpath)
{
#ifdef __CP_LOG_PATH
logpath = __CP_LOG_PATH;
#endif
}
/* find a job id */
jobid_str = getenv(envjobid);
if(jobid_str)
{
/* in cobalt we can find it in env var */
ret = sscanf(jobid_str, "%d", &jobid);
}
if(!jobid_str || ret != 1)
{
/* use pid as fall back */
jobid = getpid();
}
/* break out time into something human readable */
start_time_tmp = final_job->log_job.start_time;
my_tm = localtime(&start_time_tmp);
/* get the username for this job. In order we will try each of the
* following until one of them succeeds:
*
* - cuserid()
* - getenv("LOGNAME")
* - snprintf(..., geteuid());
*
* Note that we do not use getpwuid() because it generally will not
* work in statically compiled binaries.
*/
#ifndef DARSHAN_DISABLE_CUSERID
cuserid(cuser);
#endif
/* if cuserid() didn't work, then check the environment */
if (strcmp(cuser, "") == 0)
{
char* logname_string;
logname_string = getenv("LOGNAME");
if(logname_string)
{
strncpy(cuser, logname_string, (L_cuserid-1));
}
}
/* if cuserid() and environment both fail, then fall back to uid */
if (strcmp(cuser, "") == 0)
{
uid_t uid = geteuid();
snprintf(cuser, sizeof(cuser), "%u", uid);
}
/* generate a random number to help differentiate the log */
hlevel=DARSHAN_MPI_CALL(PMPI_Wtime)() * 1000000;
(void) gethostname(hname, sizeof(hname));
logmod = darshan_hash((void*)hname,strlen(hname),hlevel);
/* see if darshan was configured using the --with-logpath-by-env
* argument, which allows the user to specify an absolute path to
* place logs via an env variable.
*/
#ifdef __CP_LOG_ENV
/* just silently skip if the environment variable list is too big */
if(strlen(__CP_LOG_ENV) < 256)
{
/* copy env variable list to a temporary buffer */
strcpy(env_check, __CP_LOG_ENV);
/* tokenize the comma-separated list */
env_tok = strtok(env_check, ",");
if(env_tok)
{
do
{
/* check each env variable in order */
logpath_override = getenv(env_tok);
if(logpath_override)
{
/* stop as soon as we find a match */
break;
}
}while((env_tok = strtok(NULL, ",")));
}
}
#endif
if(logpath_override)
{
ret = snprintf(logfile_name, PATH_MAX,
"%s/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
logpath_override,
cuser, __progname, jobid,
(my_tm->tm_mon+1),
my_tm->tm_mday,
(my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
logmod);
if(ret == (PATH_MAX-1))
{
/* file name was too big; squish it down */
snprintf(logfile_name, PATH_MAX,
"%s/id%d.darshan_partial",
logpath_override, jobid);
}
}
else if(logpath)
{
ret = snprintf(logfile_name, PATH_MAX,
"%s/%d/%d/%d/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
logpath, (my_tm->tm_year+1900),
(my_tm->tm_mon+1), my_tm->tm_mday,
cuser, __progname, jobid,
(my_tm->tm_mon+1),
my_tm->tm_mday,
(my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
logmod);
if(ret == (PATH_MAX-1))
{
/* file name was too big; squish it down */
snprintf(logfile_name, PATH_MAX,
"%s/id%d.darshan_partial",
logpath, jobid);
}
}
else
{
logfile_name[0] = '\0';
}
/* add jobid */
final_job->log_job.jobid = (int64_t)jobid;
}
/* broadcast log file name */
DARSHAN_MPI_CALL(PMPI_Bcast)(logfile_name, PATH_MAX, MPI_CHAR, 0,
MPI_COMM_WORLD);
if(strlen(logfile_name) == 0)
{
/* failed to generate log file name */
darshan_core_cleanup(final_job);
return;
}
final_job->log_job.end_time = time(NULL);
/* TODO: coordinate shutdown accross all registered modules */
DARSHAN_MOD_ITER(mod, tmp)
free(logfile_name);
darshan_core_cleanup(final_job);
if (internal_timing_flag)
{
printf("Shutting down %s module\n", mod->name);
/* TODO: what do we want to time in new darshan version? */
}
return;
}
DARSHAN_MOD_DELETE(mod);
/* darshan_core_cleanup()
 *
 * Releases a job record: every module still on the record's module list is
 * unlinked and freed, then the record itself is freed.
 *
 * job: the job record to destroy; ownership passes to this function and
 *      the pointer must not be used afterwards.  NULL is accepted and
 *      ignored.
 *
 * Only memory reachable from job is released.  The global darshan_core_job
 * pointer is deliberately left alone: it is guarded by the darshan mutex,
 * which is not held here, and freeing it as well would leave it dangling
 * (or free the same record twice if it still referred to job).
 */
static void darshan_core_cleanup(struct darshan_core_job_runtime* job)
{
    struct darshan_core_module *mod, *tmp;

    if(!job)
        return;

    DARSHAN_MOD_ITER(mod, tmp, job)
    {
        DARSHAN_MOD_DELETE(mod, job);
        free(mod);
    }

    free(job);

    return;
}
......@@ -211,15 +436,18 @@ void darshan_core_register_module(
struct darshan_core_module tmp;
struct darshan_core_module* mod;
DARSHAN_LOCK();
*runtime_mem_limit = 0;
if(!darshan_core_job)
{
DARSHAN_UNLOCK();
return;
DARSHAN_LOCK();
}
/* see if this module is already registered */
strncpy(tmp.name, name, DARSHAN_MOD_NAME_LEN);
DARSHAN_MOD_SEARCH(mod, &tmp);
DARSHAN_MOD_SEARCH(mod, &tmp, darshan_core_job);
if(mod)
{
/* if module is already registered, update module_funcs and return */
......@@ -241,7 +469,7 @@ void darshan_core_register_module(
strncpy(mod->name, name, DARSHAN_MOD_NAME_LEN);
mod->mod_funcs = *funcs;
DARSHAN_MOD_REGISTER(mod);
DARSHAN_MOD_REGISTER(mod, darshan_core_job);
/* TODO: something smarter than just 2 MiB per module */
*runtime_mem_limit = 2 * 1024 * 1024;
......
......@@ -188,7 +188,7 @@ struct posix_runtime
static struct posix_runtime *posix_runtime = NULL;
static pthread_mutex_t posix_runtime_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
static int my_rank = -1;
static int my_rank = -1; /* TODO */
static int darshan_mem_alignment = 1;
/* these are paths that we will not trace */
......@@ -215,6 +215,7 @@ static void posix_runtime_finalize(void);
static struct posix_runtime_file* posix_file_by_name(const char *name);
static struct posix_runtime_file* posix_file_by_name_setfd(const char* name, int fd);
static void posix_file_close_fd(int fd);
static char* darshan_clean_file_path(const char* path);
static void posix_prepare_for_shutdown(void);
static void posix_get_output_data(void **buffer, int size);
......@@ -395,7 +396,7 @@ static void posix_runtime_initialize()
#if (__CP_MEM_ALIGNMENT < 1)
#error Darshan must be configured with a positive value for --with-mem-align
#endif
alignstr = getenv("DARSHAN_MEMALIGN");
alignstr = getenv(CP_MEM_ALIGNMENT_OVERRIDE);
if(alignstr)
{
ret = sscanf(alignstr, "%d", &tmpval);
......@@ -528,6 +529,67 @@ static void posix_file_close_fd(int fd)
return;
}
/* darshan_clean_file_path()
 *
 * Builds a normalized copy of a path name.  A path that does not start
 * with '/' is prefixed with the current working directory; afterwards
 * every "//" is collapsed to "/" and every "/./" is reduced to "/".
 *
 * path: the path to clean
 *
 * Returns a newly allocated string that the caller must free(), or NULL if
 * path is NULL or empty, if memory cannot be allocated, or if the current
 * working directory cannot be determined for a relative path.
 */
static char* darshan_clean_file_path(const char* path)
{
    char* cleaned = NULL;
    char* hit;

    if(path == NULL || path[0] == '\0')
        return(NULL);

    if(path[0] != '/')
    {
        /* relative path: glue it onto the current working directory */
        char* workdir = malloc(PATH_MAX);

        if(workdir != NULL)
        {
            if(getcwd(workdir, PATH_MAX) != NULL)
            {
                size_t dir_len = strlen(workdir);
                size_t path_len = strlen(path);

                /* room for directory, separator, path and terminator */
                cleaned = malloc(path_len + dir_len + 2);
                if(cleaned != NULL)
                {
                    memcpy(cleaned, workdir, dir_len);
                    cleaned[dir_len] = '/';
                    memcpy(cleaned + dir_len + 1, path, path_len + 1);
                }
            }
            free(workdir);
        }
    }
    else
    {
        /* absolute path: a plain duplicate is all that is needed */
        size_t nbytes = strlen(path) + 1;

        cleaned = malloc(nbytes);
        if(cleaned != NULL)
            memcpy(cleaned, path, nbytes);
    }

    if(cleaned == NULL)
        return(NULL);

    /* collapse each "//" by sliding the tail (terminator included) left by
     * one character
     */
    for(hit = strstr(cleaned, "//"); hit != NULL; hit = strstr(cleaned, "//"))
        memmove(hit, hit + 1, strlen(hit + 1) + 1);

    /* drop each "/." of a "/./" by sliding the tail left by two characters */
    for(hit = strstr(cleaned, "/./"); hit != NULL; hit = strstr(cleaned, "/./"))
        memmove(hit, hit + 2, strlen(hit + 2) + 1);

    return(cleaned);
}
/* ***************************************************** */
static void posix_prepare_for_shutdown()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment