Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Cristian Simarro
darshan
Commits
01812211
Commit
01812211
authored
Feb 11, 2015
by
Shane Snyder
Browse files
revamped error handling runtime side
parent
5348600b
Changes
5
Hide whitespace changes
Inline
Side-by-side
darshan-log-format.h
View file @
01812211
...
...
@@ -30,25 +30,30 @@
/* max length of exe string within job record (not counting '\0') */
#define CP_EXE_LEN (CP_JOB_RECORD_SIZE - sizeof(struct darshan_job) - 1)
/* max length of module name string (not counting '\0') */
/* TODO */
#define DARSHAN_MOD_NAME_LEN 31
typedef
uint64_t
darshan_record_id
;
/* unique identifiers to distinguish between available darshan modules */
/* NOTES: - valid ids range from [0...DARSHAN_MAX_MODS-1]
* - order of ids control module shutdown order (and consequently, order in log file)
*/
/* TODO: enforce maximum? */
#define DARSHAN_MAX_MODS 16
typedef
enum
{
DARSHAN_POSIX_MOD
,
DARSHAN_POSIX_MOD
=
0
,
DARSHAN_MPIIO_MOD
,
DARSHAN_HDF5_MOD
,
DARSHAN_PNETCDF_MOD
,
}
darshan_module_id
;
static
char
*
darshan_module_names
[]
=
{
"POSIX"
,
"MPI-IO"
,
"HDF5"
,
"PNETCDF"
};
enum
darshan_comp_type
{
DARSHAN_GZ_COMP
,
...
...
darshan-runtime/darshan-core.h
View file @
01812211
...
...
@@ -18,7 +18,6 @@
struct
darshan_core_module
{
darshan_module_id
id
;
char
name
[
DARSHAN_MOD_NAME_LEN
+
1
];
struct
darshan_module_funcs
mod_funcs
;
};
...
...
darshan-runtime/darshan.h
View file @
01812211
...
...
@@ -44,7 +44,6 @@ struct darshan_module_funcs
void
darshan_core_register_module
(
darshan_module_id
id
,
char
*
name
,
struct
darshan_module_funcs
*
funcs
,
int
*
runtime_mem_limit
);
...
...
darshan-runtime/lib/darshan-core.c
View file @
01812211
...
...
@@ -45,9 +45,11 @@ static void darshan_get_logfile_name(
char
*
logfile_name
,
int
jobid
,
struct
tm
*
start_tm
);
static
void
darshan_log_record_hints_and_ver
(
struct
darshan_core_runtime
*
job
);
static
int
darshan_get_shared_record_ids
(
static
void
darshan_get_shared_record_ids
(
struct
darshan_core_runtime
*
job
,
darshan_record_id
*
shared_recs
);
static
int
darshan_log_write_record_map
(
static
int
darshan_log_coll_open
(
char
*
logfile_name
,
MPI_File
*
log_fh
);
static
int
darshan_log_write_record_hash
(
MPI_File
log_fh
,
struct
darshan_core_record_ref
*
rec_hash
,
darshan_record_id
*
shared_recs
,
struct
darshan_log_map
*
map
);
static
int
darshan_log_coll_write
(
...
...
@@ -197,13 +199,11 @@ static void darshan_core_shutdown()
int
local_mod_use
[
DARSHAN_MAX_MODS
]
=
{
0
};
int
global_mod_use_count
[
DARSHAN_MAX_MODS
]
=
{
0
};
darshan_record_id
shared_recs
[
DARSHAN_CORE_MAX_RECORDS
]
=
{
0
};
char
*
hints
;
double
start_log_time
;
long
offset
;
struct
darshan_header
log_header
;
MPI_File
log_fh
;
MPI_Offset
tmp_off
;
MPI_Info
info
;
MPI_Status
status
;
if
(
getenv
(
"DARSHAN_INTERNAL_TIMING"
))
...
...
@@ -297,10 +297,6 @@ static void darshan_core_shutdown()
final_job
->
log_job
.
end_time
=
last_end_time
;
}
/* XXX */
/* TODO: ensuing error checking...does MPI ensure collective I/O functions return the same error
* globally, or do I always need to allreduce????? */
/* set which local modules were actually used */
for
(
i
=
0
;
i
<
DARSHAN_MAX_MODS
;
i
++
)
{
...
...
@@ -312,72 +308,10 @@ static void darshan_core_shutdown()
DARSHAN_MPI_CALL
(
PMPI_Allreduce
)(
local_mod_use
,
global_mod_use_count
,
DARSHAN_MAX_MODS
,
MPI_INT
,
MPI_SUM
,
MPI_COMM_WORLD
);
/* get a list of records which are shared across all processes */
ret
=
darshan_get_shared_record_ids
(
final_job
,
shared_recs
);
/* error out if unable to determine shared file records */
DARSHAN_MPI_CALL
(
PMPI_Allreduce
)(
&
ret
,
&
all_ret
,
1
,
MPI_INT
,
MPI_LOR
,
MPI_COMM_WORLD
);
if
(
all_ret
!=
0
)
{
if
(
my_rank
==
0
)
{
fprintf
(
stderr
,
"darshan library warning: unable to determine shared file records
\n
"
);
}
free
(
logfile_name
);
darshan_core_cleanup
(
final_job
);
return
;
}
/* check environment variable to see if the default MPI file hints have
* been overridden
*/
MPI_Info_create
(
&
info
);
hints
=
getenv
(
CP_LOG_HINTS_OVERRIDE
);
if
(
!
hints
)
{
hints
=
__CP_LOG_HINTS
;
}
if
(
hints
&&
strlen
(
hints
)
>
0
)
{
char
*
tok_str
;
char
*
orig_tok_str
;
char
*
key
;
char
*
value
;
char
*
saveptr
=
NULL
;
tok_str
=
strdup
(
hints
);
if
(
tok_str
)
{
orig_tok_str
=
tok_str
;
do
{
/* split string on semicolon */
key
=
strtok_r
(
tok_str
,
";"
,
&
saveptr
);
if
(
key
)
{
tok_str
=
NULL
;
/* look for = sign splitting key/value pairs */
value
=
index
(
key
,
'='
);
if
(
value
)
{
/* break key and value into separate null terminated strings */
value
[
0
]
=
'\0'
;
value
++
;
if
(
strlen
(
key
)
>
0
)
MPI_Info_set
(
info
,
key
,
value
);
}
}
}
while
(
key
!=
NULL
);
free
(
orig_tok_str
);
}
}
darshan_get_shared_record_ids
(
final_job
,
shared_recs
);
/* open the darshan log file for writing */
ret
=
DARSHAN_MPI_CALL
(
PMPI_File_open
)(
MPI_COMM_WORLD
,
logfile_name
,
MPI_MODE_CREATE
|
MPI_MODE_WRONLY
|
MPI_MODE_EXCL
,
info
,
&
log_fh
);
MPI_Info_free
(
&
info
);
/* collectively open the darshan log file */
ret
=
darshan_log_coll_open
(
logfile_name
,
&
log_fh
);
/* error out if unable to open log file */
DARSHAN_MPI_CALL
(
PMPI_Allreduce
)(
&
ret
,
&
all_ret
,
1
,
MPI_INT
,
...
...
@@ -386,12 +320,8 @@ static void darshan_core_shutdown()
{
if
(
my_rank
==
0
)
{
int
msg_len
;
char
msg
[
MPI_MAX_ERROR_STRING
]
=
{
0
};
MPI_Error_string
(
ret
,
msg
,
&
msg_len
);
fprintf
(
stderr
,
"darshan library warning: unable to open log file %s: %s
\n
"
,
logfile_name
,
msg
);
fprintf
(
stderr
,
"darshan library warning: unable to open log file %s
\n
"
,
logfile_name
);
unlink
(
logfile_name
);
}
free
(
logfile_name
);
...
...
@@ -403,26 +333,33 @@ static void darshan_core_shutdown()
if
(
my_rank
==
0
)
{
/* write the job information, making sure to prealloc space for the log header */
ret
=
DARSHAN_MPI_CALL
(
PMPI_File_write_at
)(
log_fh
,
sizeof
(
struct
darshan_header
),
all_
ret
=
DARSHAN_MPI_CALL
(
PMPI_File_write_at
)(
log_fh
,
sizeof
(
struct
darshan_header
),
&
final_job
->
log_job
,
sizeof
(
struct
darshan_job
),
MPI_BYTE
,
&
status
);
if
(
ret
!=
MPI_SUCCESS
)
if
(
all_
ret
!=
MPI_SUCCESS
)
{
int
msg_len
;
char
msg
[
MPI_MAX_ERROR_STRING
]
=
{
0
};
MPI_Error_string
(
ret
,
msg
,
&
msg_len
);
fprintf
(
stderr
,
"darshan library warning: unable to write job data to log file %s: %s
\n
"
,
logfile_name
,
msg
);
fprintf
(
stderr
,
"darshan library warning: unable to write job data to log file %s
\n
"
,
logfile_name
);
unlink
(
logfile_name
);
}
/* TODO */
/* TODO
: after compression is added, this should be fixed
*/
log_header
.
rec_map
.
off
=
sizeof
(
struct
darshan_header
)
+
sizeof
(
struct
darshan_job
);
}
/* write the record name->id map to the log file */
ret
=
darshan_log_write_record_map
(
log_fh
,
final_job
->
rec_hash
,
/* error out if unable to write job information */
DARSHAN_MPI_CALL
(
PMPI_Bcast
)(
&
all_ret
,
1
,
MPI_INT
,
0
,
MPI_COMM_WORLD
);
if
(
all_ret
!=
0
)
{
free
(
logfile_name
);
darshan_core_cleanup
(
final_job
);
return
;
}
/* write the record name->id hash to the log file */
ret
=
darshan_log_write_record_hash
(
log_fh
,
final_job
->
rec_hash
,
shared_recs
,
&
log_header
.
rec_map
);
/* error out if unable to write record hash */
DARSHAN_MPI_CALL
(
PMPI_Allreduce
)(
&
ret
,
&
all_ret
,
1
,
MPI_INT
,
MPI_LOR
,
MPI_COMM_WORLD
);
if
(
all_ret
!=
0
)
...
...
@@ -431,6 +368,7 @@ static void darshan_core_shutdown()
{
fprintf
(
stderr
,
"darshan library warning: unable to write record map to log file %s
\n
"
,
logfile_name
);
unlink
(
logfile_name
);
}
free
(
logfile_name
);
darshan_core_cleanup
(
final_job
);
...
...
@@ -441,7 +379,8 @@ static void darshan_core_shutdown()
* - get final output buffer
* - compress (zlib) provided output buffer
* - append compressed buffer to log file
* - shutdown the module TODO
* - add module index info (file offset/length) to log header
* - shutdown the module
*/
for
(
i
=
0
;
i
<
DARSHAN_MAX_MODS
;
i
++
)
{
...
...
@@ -483,9 +422,22 @@ static void darshan_core_shutdown()
/* write module data buffer to the darshan log file */
ret
=
darshan_log_coll_write
(
log_fh
,
mod_buf
,
mod_buf_size
,
&
log_header
.
mod_map
[
i
]);
if
(
ret
<
0
)
/* error out if unable to write this module's data */
DARSHAN_MPI_CALL
(
PMPI_Allreduce
)(
&
ret
,
&
all_ret
,
1
,
MPI_INT
,
MPI_LOR
,
MPI_COMM_WORLD
);
if
(
all_ret
!=
0
)
{
/* TODO: */
if
(
my_rank
==
0
)
{
fprintf
(
stderr
,
"darshan library warning: unable to write %s module data to log file %s
\n
"
,
darshan_module_names
[
i
],
logfile_name
);
unlink
(
logfile_name
);
}
free
(
logfile_name
);
darshan_core_cleanup
(
final_job
);
return
;
}
tmp_off
+=
log_header
.
mod_map
[
i
].
len
;
...
...
@@ -508,20 +460,32 @@ static void darshan_core_shutdown()
log_header
.
magic_nr
=
CP_MAGIC_NR
;
log_header
.
comp_type
=
DARSHAN_GZ_COMP
;
ret
=
DARSHAN_MPI_CALL
(
PMPI_File_write_at
)(
log_fh
,
0
,
&
log_header
,
all_
ret
=
DARSHAN_MPI_CALL
(
PMPI_File_write_at
)(
log_fh
,
0
,
&
log_header
,
sizeof
(
struct
darshan_header
),
MPI_BYTE
,
&
status
);
if
(
ret
!=
MPI_SUCCESS
)
if
(
all_
ret
!=
MPI_SUCCESS
)
{
/* TODO */
fprintf
(
stderr
,
"darshan library warning: unable to write header to log file %s
\n
"
,
logfile_name
);
unlink
(
logfile_name
);
}
}
/* error out if unable to write log header */
DARSHAN_MPI_CALL
(
PMPI_Bcast
)(
&
all_ret
,
1
,
MPI_INT
,
0
,
MPI_COMM_WORLD
);
if
(
all_ret
!=
0
)
{
free
(
logfile_name
);
darshan_core_cleanup
(
final_job
);
return
;
}
DARSHAN_MPI_CALL
(
PMPI_File_close
)(
&
log_fh
);
/* if we got this far, there are no errors, so rename from *.darshan_partial
* to *-<logwritetime>.darshan.gz, which indicates that this log file is
* complete and ready for analysis
*/
/* TODO: support user given logfile path/name */
if
(
my_rank
==
0
)
{
char
*
tmp_index
;
...
...
@@ -758,12 +722,11 @@ static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* job)
return
;
}
static
int
darshan_get_shared_record_ids
(
struct
darshan_core_runtime
*
job
,
static
void
darshan_get_shared_record_ids
(
struct
darshan_core_runtime
*
job
,
darshan_record_id
*
shared_recs
)
{
int
i
;
int
ndx
;
int
ret
;
struct
darshan_core_record_ref
*
ref
,
*
tmp
;
darshan_record_id
id_array
[
DARSHAN_CORE_MAX_RECORDS
]
=
{
0
};
darshan_record_id
mask_array
[
DARSHAN_CORE_MAX_RECORDS
]
=
{
0
};
...
...
@@ -780,13 +743,9 @@ static int darshan_get_shared_record_ids(struct darshan_core_runtime *job,
}
/* broadcast root's list of records to all other processes */
ret
=
DARSHAN_MPI_CALL
(
PMPI_Bcast
)(
id_array
,
DARSHAN_MPI_CALL
(
PMPI_Bcast
)(
id_array
,
(
DARSHAN_CORE_MAX_RECORDS
*
sizeof
(
darshan_record_id
)),
MPI_BYTE
,
0
,
MPI_COMM_WORLD
);
if
(
ret
!=
0
)
{
return
(
-
1
);
}
/* everyone looks to see if they opened the same records as root */
for
(
i
=
0
;
(
i
<
DARSHAN_CORE_MAX_RECORDS
&&
id_array
[
i
]
!=
0
);
i
++
)
...
...
@@ -803,12 +762,8 @@ static int darshan_get_shared_record_ids(struct darshan_core_runtime *job,
}
/* now allreduce so everyone agrees which files are shared */
ret
=
DARSHAN_MPI_CALL
(
PMPI_Allreduce
)(
mask_array
,
all_mask_array
,
DARSHAN_MPI_CALL
(
PMPI_Allreduce
)(
mask_array
,
all_mask_array
,
DARSHAN_CORE_MAX_RECORDS
,
MPI_INT
,
MPI_LAND
,
MPI_COMM_WORLD
);
if
(
ret
!=
0
)
{
return
(
-
1
);
}
ndx
=
0
;
for
(
i
=
0
;
(
i
<
DARSHAN_CORE_MAX_RECORDS
&&
id_array
[
i
]
!=
0
);
i
++
)
...
...
@@ -819,13 +774,74 @@ static int darshan_get_shared_record_ids(struct darshan_core_runtime *job,
}
}
return
;
}
static
int
darshan_log_coll_open
(
char
*
logfile_name
,
MPI_File
*
log_fh
)
{
char
*
hints
;
char
*
tok_str
;
char
*
orig_tok_str
;
char
*
key
;
char
*
value
;
char
*
saveptr
=
NULL
;
int
ret
;
MPI_Info
info
;
/* check environment variable to see if the default MPI file hints have
* been overridden
*/
MPI_Info_create
(
&
info
);
hints
=
getenv
(
CP_LOG_HINTS_OVERRIDE
);
if
(
!
hints
)
{
hints
=
__CP_LOG_HINTS
;
}
if
(
hints
&&
strlen
(
hints
)
>
0
)
{
tok_str
=
strdup
(
hints
);
if
(
tok_str
)
{
orig_tok_str
=
tok_str
;
do
{
/* split string on semicolon */
key
=
strtok_r
(
tok_str
,
";"
,
&
saveptr
);
if
(
key
)
{
tok_str
=
NULL
;
/* look for = sign splitting key/value pairs */
value
=
index
(
key
,
'='
);
if
(
value
)
{
/* break key and value into separate null terminated strings */
value
[
0
]
=
'\0'
;
value
++
;
if
(
strlen
(
key
)
>
0
)
MPI_Info_set
(
info
,
key
,
value
);
}
}
}
while
(
key
!=
NULL
);
free
(
orig_tok_str
);
}
}
/* open the darshan log file for writing */
ret
=
DARSHAN_MPI_CALL
(
PMPI_File_open
)(
MPI_COMM_WORLD
,
logfile_name
,
MPI_MODE_CREATE
|
MPI_MODE_WRONLY
|
MPI_MODE_EXCL
,
info
,
log_fh
);
if
(
ret
<
0
)
return
(
-
1
);
MPI_Info_free
(
&
info
);
return
(
0
);
}
/* NOTE: the map written to file may contain duplicate id->name entries if a
* record is opened by multiple ranks, but not all ranks
*/
static
int
darshan_log_write_record_
map
(
MPI_File
log_fh
,
struct
darshan_core_record_ref
*
rec_hash
,
static
int
darshan_log_write_record_
hash
(
MPI_File
log_fh
,
struct
darshan_core_record_ref
*
rec_hash
,
darshan_record_id
*
shared_recs
,
struct
darshan_log_map
*
map
)
{
int
i
;
...
...
@@ -983,7 +999,6 @@ static int darshan_log_coll_write(MPI_File log_fh, void *buf, int count,
void
darshan_core_register_module
(
darshan_module_id
id
,
char
*
name
,
struct
darshan_module_funcs
*
funcs
,
int
*
runtime_mem_limit
)
{
...
...
@@ -1017,7 +1032,6 @@ void darshan_core_register_module(
memset
(
mod
,
0
,
sizeof
(
*
mod
));
mod
->
id
=
id
;
strncpy
(
mod
->
name
,
name
,
DARSHAN_MOD_NAME_LEN
);
mod
->
mod_funcs
=
*
funcs
;
/* register module with darshan */
...
...
darshan-runtime/lib/darshan-posix.c
View file @
01812211
...
...
@@ -45,8 +45,6 @@ typedef int64_t off64_t;
#define MAP_OR_FAIL(func)
#define POSIX_MOD_NAME "POSIX"
struct
posix_runtime_file
{
struct
darshan_posix_file
*
file_record
;
...
...
@@ -245,7 +243,6 @@ static void posix_runtime_initialize()
/* register the posix module with darshan core */
darshan_core_register_module
(
DARSHAN_POSIX_MOD
,
POSIX_MOD_NAME
,
&
posix_mod_fns
,
&
mem_limit
);
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment