Commit 3266825f authored by Shane Snyder's avatar Shane Snyder
Browse files

Merge branch 'master' into lustre-mod

Conflicts:
	ChangeLog
parents 1c2df3e1 5e1a0963
......@@ -4,13 +4,22 @@ Darshan Release Change Log
Darshan-3.0.0
=============
* install darshan-null-log-format.h header when installing
darshan-util component, otherwise compiler errors are
generated when building external tools that use
libdarshan-util
* update docs to give debugging tips for cases where
Darshan logs are not generated
* fix shared library regression test script to check for
potential errors with Darshan symbols rather than
failing silently in these cases
* bug fix for determining minimum non-zero counters in
shared file reductions in all modules
* loosen Darshan's PMPI symbol check to prevent inadvertent
disabling of Darshan for some MPICH builds
* update runtime docs to give information on upgrading Darshan
* bug fix for resolving MPI_Gather when LDPRELOADing Darshan's
shared libraries (reported by Richard Hedges)
* bug fix for resolving MPI_Gather and MPI_Barrier when LDPRELOADing
Darshan's shared libraries (reported by Richard Hedges and Rob Latham)
* add more helpful error handling when opening 2.x version log files
* port darshan-diff utility over to new log file format
* fix numerous configure bugs on Cray systems
......
......@@ -12,7 +12,7 @@ DARSHAN_LD_FLAGS="@LDFLAGS@"
# in turn used one of those HLLs).
PRE_LD_FLAGS="-L$DARSHAN_LIB_PATH $DARSHAN_LD_FLAGS -ldarshan -lz -Wl,@$DARSHAN_SHARE_PATH/ld-opts/darshan-base-ld-opts"
POST_LD_FLAGS="-L$DARSHAN_LIB_PATH -ldarshan -lz -lrt -lpthread"
POST_LD_FLAGS="-L$DARSHAN_LIB_PATH -Wl,--start-group -ldarshan -ldarshan-stubs -Wl,--end-group -lz -lrt -lpthread"
usage="\
Usage: darshan-config [--pre-ld-flags] [--post-ld-flags]"
......
......@@ -282,6 +282,10 @@ print OUTPUT<<"EOF";
if [ \$? -eq 0 ] ; then
CXXMPICH=-lmpicxx
fi
bgqlib_out=`grep -m 1 -Po "libmpichcxx-\\S+" \$tmpfile`
if [ \$? -eq 0 ] ; then
CXXMPICH=`echo \$bgqlib_out | sed 's/libmpichcxx-\\(.*\\)\\.a.*/-lmpichcxx-\\1/'`
fi
rm \$tmpfile >& /dev/null
......
......@@ -287,7 +287,13 @@ print OUTPUT<<"EOF";
if [ \$rc_cnk_check -eq 0 ] ; then
FMPICH=-lfmpich.cnk
else
FMPICH=-lfmpich
bgqlib_out=`grep -Po "libmpichf\\S\+\-\\S\+" \$tmpfile 2>/dev/null`
rc_bgqlib_check=\$?
if [ \$rc_bgqlib_check -eq 0 ] ; then
FMPICH=`echo \$bgqlib_out | sed 's/libmpich\\(.*\\)-\\(.*\\).a/-lmpich\\1-\\2/'`
else
FMPICH=-lfmpich
fi
fi
fi
......
......@@ -169,8 +169,8 @@ Example for MPICH 3.1 or earlier:
----
export MPICC_PROFILE=$DARSHAN_PREFIX/share/mpi-profile/darshan-cc
export MPICXX_PROFILE=$DARSHAN_PREFIX/share/mpi-profile/darshan-cxx
export MPICF77_PROFILE=$DARSHAN_PREFIX/share/mpi-profile/darshan-f
export MPICF90_PROFILE=$DARSHAN_PREFIX/share/mpi-profile/darshan-f
export MPIF77_PROFILE=$DARSHAN_PREFIX/share/mpi-profile/darshan-f
export MPIF90_PROFILE=$DARSHAN_PREFIX/share/mpi-profile/darshan-f
----
Examples for command line use:
......@@ -213,17 +213,36 @@ environment variable to insert instrumentation at run time. The executables
should be compiled using the normal, unmodified MPI compiler.
To use this mechanism, set the `LD_PRELOAD` environment variable to the full
path to the Darshan shared library, as in this example:
path to the Darshan shared library. The preferred method of inserting Darshan
instrumentation in this case is to set the LD_PRELOAD variable specifically
for the application of interest. Typically this is possible using
command line arguments offered by the `mpirun` or `mpiexec` scripts or by
the job scheduler:
----
export LD_PRELOAD=/home/carns/darshan-install/lib/libdarshan.so
mpiexec -n 4 -env LD_PRELOAD /home/carns/darshan-install/lib/libdarshan.so mpi-io-test
----
----
srun -n 4 --export=LD_PRELOAD=/home/carns/darshan-install/lib/libdarshan.so mpi-io-test
----
For sequential programs, the following will set LD_PRELOAD only for the duration of the process:
----
env LD_PRELOAD=/home/carns/darshan-install/lib/libdarshan.so mpi-io-test
----
Other environments may have other specific options for controlling this behavior.
Please check your local site documentation for details.
You can then run your application as usual. Some environments may require a
special `mpirun` or `mpiexec` command line argument to propagate the
environment variable to all processes. Other environments may require a
scheduler submission option to control this behavior. Please check your
local site documentation for details.
It is also possible to simply export LD_PRELOAD as follows, but this is not
recommended because it can cause Darshan and MPI symbols to be pulled into
unrelated binaries:
----
export LD_PRELOAD=/home/carns/darshan-install/lib/libdarshan.so
----
=== Instrumenting dynamically-linked Fortran applications
......@@ -294,8 +313,8 @@ Blue Gene profiling configuration example:
----
export MPICC_PROFILE=$DARSHAN_PREFIX/share/mpi-profile/darshan-bg-cc
export MPICXX_PROFILE=$DARSHAN_PREFIX/share/mpi-profile/darshan-bg-cxx
export MPICF77_PROFILE=$DARSHAN_PREFIX/share/mpi-profile/darshan-bg-f
export MPICF90_PROFILE=$DARSHAN_PREFIX/share/mpi-profile/darshan-bg-f
export MPIF77_PROFILE=$DARSHAN_PREFIX/share/mpi-profile/darshan-bg-f
export MPIF90_PROFILE=$DARSHAN_PREFIX/share/mpi-profile/darshan-bg-f
----
=== Cray platforms (XE, XC, or similar)
......@@ -511,3 +530,47 @@ behavior at runtime:
* DARSHAN_LOGPATH: specifies the path to write Darshan log files to. Note that this directory needs to be formatted using the darshan-mk-log-dirs script.
* DARSHAN_LOGFILE: specifies the path (directory + Darshan log file name) to write the output Darshan log to. This overrides the default Darshan behavior of automatically generating a log file name and adding it to a log file directory formatted using darshan-mk-log-dirs script.
* DARSHAN_MODMEM: specifies the maximum amount of memory (in MiB) a Darshan instrumentation module can consume at runtime.
== Debugging
=== No log file
In cases where Darshan is not generating a log file for an application, some common things to check are:
* Check stderr to ensure Darshan isn't indicating any internal errors (e.g., invalid log file path)
For statically linked executables:
* Ensure that Darshan symbols are present in the underlying executable by running `nm` on it:
----
> nm test | grep darshan
0000000000772260 b darshan_core
0000000000404440 t darshan_core_cleanup
00000000004049b0 T darshan_core_initialize
000000000076b660 d darshan_core_mutex
00000000004070a0 T darshan_core_register_module
----
* Make sure the application executable is statically linked:
** In general, we encourage the use of purely statically linked executables when using the static
instrumentation method given in link:darshan-runtime.html#_instrumenting_statically_linked_applications[Section 5]
** If purely static executables are not an option, we encourage users to use the LD_PRELOAD method of
instrumentation given in link:darshan-runtime.html#_instrumenting_dynamically_linked_applications[Section 6]
** Statically linked executables are the default on Cray platforms and the IBM BG platforms;
statically linked executables can be explicitly requested using the `-static` compile option to most compilers
** You can verify that an executable is purely statically linked by using the `file` command:
----
> file mpi-io-test
mpi-io-test: ELF 64-bit LSB executable, x86-64, version 1 (GNU/Linux), statically linked, for GNU/Linux 2.6.24, BuildID[sha1]=9893e599e7a560159ccf547b4c4ba5671f65ba32, not stripped
----
* Ensure that the linker is correctly linking in Darshan's runtime libraries:
** A common mistake is to explicitly link in the underlying MPI libraries (e.g., `-lmpich` or `-lmpichf90`)
in the link command, which can interfere with Darshan's instrumentation
*** These libraries are usually linked in automatically by the compiler
*** MPICH's `mpicc` compiler's `-show` flag can be used to examine the invoked link command, for instance
** The linker's `-y` option can be used to verify that Darshan is properly intercepting the MPI_Init
function (e.g., by setting `CFLAGS='-Wl,-yMPI_Init'`), which it uses to initialize its runtime structures
----
/usr/common/software/darshan/3.0.0-pre3/lib/libdarshan.a(darshan-core-init-finalize.o): definition of MPI_Init
----
......@@ -137,6 +137,7 @@ DARSHAN_FORWARD_DECL(PMPI_Gather, int, (const void *sendbuf, int sendcount, MPI_
#else
DARSHAN_FORWARD_DECL(PMPI_Gather, int, (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm));
#endif
DARSHAN_FORWARD_DECL(PMPI_Barrier, int, (MPI_Comm comm));
void resolve_mpi_symbols (void)
{
......@@ -197,6 +198,7 @@ void resolve_mpi_symbols (void)
MAP_OR_FAIL(PMPI_Send);
MAP_OR_FAIL(PMPI_Recv);
MAP_OR_FAIL(PMPI_Gather);
MAP_OR_FAIL(PMPI_Barrier);
return;
}
......
......@@ -52,6 +52,7 @@ char* darshan_path_exclusions[] = {
"/sbin/",
"/sys/",
"/proc/",
"/var/",
NULL
};
......@@ -261,12 +262,12 @@ void darshan_core_shutdown()
darshan_record_id *mod_shared_recs;
int shared_rec_cnt = 0;
double start_log_time;
double open1, open2;
double job1, job2;
double rec1, rec2;
double open1 = 0, open2 = 0;
double job1 = 0, job2 = 0;
double rec1 = 0, rec2 = 0;
double mod1[DARSHAN_MAX_MODS] = {0};
double mod2[DARSHAN_MAX_MODS] = {0};
double header1, header2;
double header1 = 0, header2 = 0;
double tm_end;
uint64_t gz_fp = 0;
MPI_File log_fh;
......
......@@ -117,7 +117,8 @@ hid_t DARSHAN_DECL(H5Fcreate)(const char *filename, unsigned flags,
file = hdf5_file_by_name_sethid(filename, ret);
if(file)
{
if(file->file_record->fcounters[HDF5_F_OPEN_TIMESTAMP] == 0)
if(file->file_record->fcounters[HDF5_F_OPEN_TIMESTAMP] == 0 ||
file->file_record->fcounters[HDF5_F_OPEN_TIMESTAMP] > tm1)
file->file_record->fcounters[HDF5_F_OPEN_TIMESTAMP] = tm1;
file->file_record->counters[HDF5_OPENS] += 1;
}
......@@ -156,7 +157,8 @@ hid_t DARSHAN_DECL(H5Fopen)(const char *filename, unsigned flags,
file = hdf5_file_by_name_sethid(filename, ret);
if(file)
{
if(file->file_record->fcounters[HDF5_F_OPEN_TIMESTAMP] == 0)
if(file->file_record->fcounters[HDF5_F_OPEN_TIMESTAMP] == 0 ||
file->file_record->fcounters[HDF5_F_OPEN_TIMESTAMP] > tm1)
file->file_record->fcounters[HDF5_F_OPEN_TIMESTAMP] = tm1;
file->file_record->counters[HDF5_OPENS] += 1;
}
......
......@@ -153,7 +153,8 @@ static void mpiio_shutdown(void);
file->file_record->counters[MPIIO_COLL_OPENS] += 1; \
if(__info != MPI_INFO_NULL) \
file->file_record->counters[MPIIO_HINTS] += 1; \
if(file->file_record->fcounters[MPIIO_F_OPEN_TIMESTAMP] == 0) \
if(file->file_record->fcounters[MPIIO_F_OPEN_TIMESTAMP] == 0 || \
file->file_record->fcounters[MPIIO_F_OPEN_TIMESTAMP] > __tm1) \
file->file_record->fcounters[MPIIO_F_OPEN_TIMESTAMP] = __tm1; \
DARSHAN_TIMER_INC_NO_OVERLAP(file->file_record->fcounters[MPIIO_F_META_TIME], __tm1, __tm2, file->last_meta_end); \
} while(0)
......@@ -174,7 +175,8 @@ static void mpiio_shutdown(void);
if(file->last_io_type == DARSHAN_IO_WRITE) \
file->file_record->counters[MPIIO_RW_SWITCHES] += 1; \
file->last_io_type = DARSHAN_IO_READ; \
if(file->file_record->fcounters[MPIIO_F_READ_START_TIMESTAMP] == 0) \
if(file->file_record->fcounters[MPIIO_F_READ_START_TIMESTAMP] == 0 || \
file->file_record->fcounters[MPIIO_F_READ_START_TIMESTAMP] > __tm1) \
file->file_record->fcounters[MPIIO_F_READ_START_TIMESTAMP] = __tm1; \
file->file_record->fcounters[MPIIO_F_READ_END_TIMESTAMP] = __tm2; \
if(file->file_record->fcounters[MPIIO_F_MAX_READ_TIME] < __elapsed) { \
......@@ -199,7 +201,8 @@ static void mpiio_shutdown(void);
if(file->last_io_type == DARSHAN_IO_READ) \
file->file_record->counters[MPIIO_RW_SWITCHES] += 1; \
file->last_io_type = DARSHAN_IO_WRITE; \
if(file->file_record->fcounters[MPIIO_F_WRITE_START_TIMESTAMP] == 0) \
/* keep the earliest write start time: set it when still unset (0) or when this write began earlier */ \
if(file->file_record->fcounters[MPIIO_F_WRITE_START_TIMESTAMP] == 0 || \
file->file_record->fcounters[MPIIO_F_WRITE_START_TIMESTAMP] > __tm1) \
file->file_record->fcounters[MPIIO_F_WRITE_START_TIMESTAMP] = __tm1; \
file->file_record->fcounters[MPIIO_F_WRITE_END_TIMESTAMP] = __tm2; \
if(file->file_record->fcounters[MPIIO_F_MAX_WRITE_TIME] < __elapsed) { \
......
......@@ -114,7 +114,8 @@ int DARSHAN_DECL(ncmpi_create)(MPI_Comm comm, const char *path,
file = pnetcdf_file_by_name_setncid(path, (*ncidp));
if(file)
{
if(file->file_record->fcounters[PNETCDF_F_OPEN_TIMESTAMP] == 0)
if(file->file_record->fcounters[PNETCDF_F_OPEN_TIMESTAMP] == 0 ||
file->file_record->fcounters[PNETCDF_F_OPEN_TIMESTAMP] > tm1)
file->file_record->fcounters[PNETCDF_F_OPEN_TIMESTAMP] = tm1;
DARSHAN_MPI_CALL(PMPI_Comm_size)(comm, &comm_size);
if(comm_size == 1)
......@@ -162,7 +163,8 @@ int DARSHAN_DECL(ncmpi_open)(MPI_Comm comm, const char *path,
file = pnetcdf_file_by_name_setncid(path, (*ncidp));
if(file)
{
if(file->file_record->fcounters[PNETCDF_F_OPEN_TIMESTAMP] == 0)
if(file->file_record->fcounters[PNETCDF_F_OPEN_TIMESTAMP] == 0 ||
file->file_record->fcounters[PNETCDF_F_OPEN_TIMESTAMP] > tm1)
file->file_record->fcounters[PNETCDF_F_OPEN_TIMESTAMP] = tm1;
DARSHAN_MPI_CALL(PMPI_Comm_size)(comm, &comm_size);
if(comm_size == 1)
......
......@@ -231,7 +231,8 @@ static void posix_shutdown(void);
file->file_record->counters[POSIX_FOPENS] += 1; \
else \
file->file_record->counters[POSIX_OPENS] += 1; \
if(file->file_record->fcounters[POSIX_F_OPEN_TIMESTAMP] == 0) \
if(file->file_record->fcounters[POSIX_F_OPEN_TIMESTAMP] == 0 || \
file->file_record->fcounters[POSIX_F_OPEN_TIMESTAMP] > __tm1) \
file->file_record->fcounters[POSIX_F_OPEN_TIMESTAMP] = __tm1; \
DARSHAN_TIMER_INC_NO_OVERLAP(file->file_record->fcounters[POSIX_F_META_TIME], __tm1, __tm2, file->last_meta_end); \
} while(0)
......@@ -278,7 +279,8 @@ static void posix_shutdown(void);
if(file->last_io_type == DARSHAN_IO_WRITE) \
file->file_record->counters[POSIX_RW_SWITCHES] += 1; \
file->last_io_type = DARSHAN_IO_READ; \
if(file->file_record->fcounters[POSIX_F_READ_START_TIMESTAMP] == 0) \
if(file->file_record->fcounters[POSIX_F_READ_START_TIMESTAMP] == 0 || \
file->file_record->fcounters[POSIX_F_READ_START_TIMESTAMP] > __tm1) \
file->file_record->fcounters[POSIX_F_READ_START_TIMESTAMP] = __tm1; \
file->file_record->fcounters[POSIX_F_READ_END_TIMESTAMP] = __tm2; \
if(file->file_record->fcounters[POSIX_F_MAX_READ_TIME] < __elapsed) { \
......@@ -329,7 +331,8 @@ static void posix_shutdown(void);
if(file->last_io_type == DARSHAN_IO_READ) \
file->file_record->counters[POSIX_RW_SWITCHES] += 1; \
file->last_io_type = DARSHAN_IO_WRITE; \
if(file->file_record->fcounters[POSIX_F_WRITE_START_TIMESTAMP] == 0) \
if(file->file_record->fcounters[POSIX_F_WRITE_START_TIMESTAMP] == 0 || \
file->file_record->fcounters[POSIX_F_WRITE_START_TIMESTAMP] > __tm1) \
file->file_record->fcounters[POSIX_F_WRITE_START_TIMESTAMP] = __tm1; \
file->file_record->fcounters[POSIX_F_WRITE_END_TIMESTAMP] = __tm2; \
if(file->file_record->fcounters[POSIX_F_MAX_WRITE_TIME] < __elapsed) { \
......
#!/bin/bash
# submit job and get job id
jobid=`qsub --env DARSHAN_LOGFILE=$DARSHAN_LOGFILE --mode c16 --proccount $DARSHAN_DEFAULT_NPROCS -A SSSPPg -t 10 -n 1 --output $DARSHAN_TMP/$$-tmp.out --error $DARSHAN_TMP/$$-tmp.err --debuglog $DARSHAN_TMP/$$-tmp.debuglog "$@"`
jobid=`qsub --env DARSHAN_LOGFILE=$DARSHAN_LOGFILE --mode c16 --proccount $DARSHAN_DEFAULT_NPROCS -A radix-io -t 10 -n 1 --output $DARSHAN_TMP/$$-tmp.out --error $DARSHAN_TMP/$$-tmp.err --debuglog $DARSHAN_TMP/$$-tmp.debuglog "$@"`
if [ $? -ne 0 ]; then
echo "Error: failed to qsub $@"
exit 1
......
......@@ -45,6 +45,17 @@ fi
# set up environment for tests according to platform
source $DARSHAN_TESTDIR/$DARSHAN_PLATFORM/env.sh
# when using LD_PRELOAD, it is possible that errors with Darshan symbols
# (e.g., forgetting to resolve a symbol Darshan wraps using dlsym) can
# cause arbitrary binaries to crash. We check the output of the true
# command to successfully identify failures like this and exit
true_out=`/bin/true 2>&1`
if [ $? -ne 0 -o -n "$true_out" ]; then
echo -n $true_out > tmp
echo "environment setup failed"
exit 1
fi
failure_count=0
for i in `ls $DARSHAN_TESTDIR/test-cases/*.sh`; do
......
......@@ -3,7 +3,7 @@
PROG=cxxpi
# set log file path; remove previous log if present
export DARSHAN_LOGFILE=$DARSHAN_TMP/${PROG}.darshan.gz
export DARSHAN_LOGFILE=$DARSHAN_TMP/${PROG}.darshan
rm -f ${DARSHAN_LOGFILE}
# compile
......
......@@ -3,7 +3,7 @@
PROG=fperf-f77
# set log file path; remove previous log if present
export DARSHAN_LOGFILE=$DARSHAN_TMP/${PROG}.darshan.gz
export DARSHAN_LOGFILE=$DARSHAN_TMP/${PROG}.darshan
rm -f ${DARSHAN_LOGFILE}
# compile
......
......@@ -3,7 +3,7 @@
PROG=fperf-f90
# set log file path; remove previous log if present
export DARSHAN_LOGFILE=$DARSHAN_TMP/${PROG}.darshan.gz
export DARSHAN_LOGFILE=$DARSHAN_TMP/${PROG}.darshan
rm -f ${DARSHAN_LOGFILE}
# compile
......
......@@ -3,7 +3,7 @@
PROG=mpi-io-test
# set log file path; remove previous log if present
export DARSHAN_LOGFILE=$DARSHAN_TMP/${PROG}.darshan.gz
export DARSHAN_LOGFILE=$DARSHAN_TMP/${PROG}.darshan
rm -f ${DARSHAN_LOGFILE}
# compile
......
......@@ -137,6 +137,7 @@ endif
install -m 644 $(srcdir)/darshan-pnetcdf-logutils.h $(includedir)
install -m 644 $(srcdir)/darshan-bgq-logutils.h $(includedir)
install -m 644 $(srcdir)/darshan-null-logutils.h $(includedir)
install -m 644 $(srcdir)/../darshan-null-log-format.h $(includedir)
install -m 644 $(srcdir)/../darshan-posix-log-format.h $(includedir)
install -m 644 $(srcdir)/../darshan-mpiio-log-format.h $(includedir)
install -m 644 $(srcdir)/../darshan-hdf5-log-format.h $(includedir)
......@@ -158,7 +159,7 @@ endif
clean::
rm -f *.o *.po *.a darshan-analyzer darshan-convert darshan-parser jenkins-hash-gen
rm -f *.o *.po *.a darshan-analyzer darshan-convert darshan-diff darshan-parser jenkins-hash-gen
distclean:: clean
rm -f darshan-runtime-config.h aclocal.m4 autom4te.cache/* config.status config.log Makefile util/bin/darshan-job-summary.pl
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment