Commit 76a9b5b6 authored by fisaila's avatar fisaila
Browse files

Added epochs to Darshan. Epochs and full statistics are mutually exclusive.

Functionality:
An epoch is enclosed by:
darshan_start_epoch();
darshan_end_epoch();

There is one log file per epoch and the epoch number appears in the file name. 
If epochs are not used, the functionality should be the same as before.

Implementation: 
I tried to minimally modify the code (only darshan-mpi-io.c was modified). 
1) darshan_shutdown became: 
void darshan_shutdown_epoch(int timing_flag)

5 lines were commented out and moved to the new darshan_shutdown

2) void darshan_shutdown(int timing_flag)

If no epoch is used just calls darshan_shutdown_epoch
else 
for each epoch restore the counters from the end of epoch and call darshan_shutdown_epoch

3) void darshan_start_epoch(void)

just resets the counters from darshan_global_job

4) void darshan_end_epoch(void)

saves the counters from darshan_global_job 


 
Test:
One test has been added in darshan-test/epoch-testing/. 
I have not yet performed extensive testing.




git-svn-id: https://svn.mcs.anl.gov/repos/darshan/branches/darshan-florin-extensions@818 3b7491f3-a168-0410-bf4b-c445ed680a29
parent 4ea07ace
......@@ -228,6 +228,8 @@ struct variance_dt
double S;
};
static int epoch_counter = 0;
void darshan_mpi_initialize(int *argc, char ***argv)
{
int nprocs;
......@@ -269,7 +271,7 @@ void darshan_mpi_initialize(int *argc, char ***argv)
return;
}
void darshan_shutdown(int timing_flag)
void darshan_shutdown_epoch(int timing_flag)
{
int rank;
char* logfile_name;
......@@ -298,6 +300,7 @@ void darshan_shutdown(int timing_flag)
char* env_tok;
#endif
uint64_t hlevel;
static int epoch_idx = 0;
CP_LOCK();
if(!darshan_global_job)
......@@ -309,7 +312,8 @@ void darshan_shutdown(int timing_flag)
* write it out
*/
final_job = darshan_global_job;
darshan_global_job = NULL;
// Moved to the new darshan_shutdown
// darshan_global_job = NULL;
CP_UNLOCK();
start_log_time = DARSHAN_MPI_CALL(PMPI_Wtime)();
......@@ -470,14 +474,25 @@ void darshan_shutdown(int timing_flag)
if(logpath_override)
{
ret = snprintf(logfile_name, PATH_MAX,
"%s/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
logpath_override,
cuser, __progname, jobid,
(my_tm->tm_mon+1),
my_tm->tm_mday,
(my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
logmod);
if (epoch_counter > 0)
ret = snprintf(logfile_name, PATH_MAX,
"%s/%s_%s_id%d_epoch%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
logpath_override,
cuser, __progname, jobid, epoch_idx++,
(my_tm->tm_mon+1),
my_tm->tm_mday,
(my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
logmod);
else
ret = snprintf(logfile_name, PATH_MAX,
"%s/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
logpath_override,
cuser, __progname, jobid,
(my_tm->tm_mon+1),
my_tm->tm_mday,
(my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
logmod);
if(ret == (PATH_MAX-1))
{
/* file name was too big; squish it down */
......@@ -488,15 +503,27 @@ void darshan_shutdown(int timing_flag)
}
else if(logpath)
{
ret = snprintf(logfile_name, PATH_MAX,
"%s/%d/%d/%d/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
logpath, (my_tm->tm_year+1900),
(my_tm->tm_mon+1), my_tm->tm_mday,
cuser, __progname, jobid,
(my_tm->tm_mon+1),
my_tm->tm_mday,
(my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
logmod);
if (epoch_counter > 0)
ret = snprintf(logfile_name, PATH_MAX,
"%s/%d/%d/%d/%s_%s_id%d_epoch%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
logpath, (my_tm->tm_year+1900),
(my_tm->tm_mon+1), my_tm->tm_mday,
cuser, __progname, jobid, epoch_idx++,
(my_tm->tm_mon+1),
my_tm->tm_mday,
(my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
logmod);
else
ret = snprintf(logfile_name, PATH_MAX,
"%s/%d/%d/%d/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
logpath, (my_tm->tm_year+1900),
(my_tm->tm_mon+1), my_tm->tm_mday,
cuser, __progname, jobid,
(my_tm->tm_mon+1),
my_tm->tm_mday,
(my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
logmod);
if(ret == (PATH_MAX-1))
{
/* file name was too big; squish it down */
......@@ -616,11 +643,13 @@ void darshan_shutdown(int timing_flag)
}
}
if(final_job->trailing_data)
free(final_job->trailing_data);
mnt_data_count = 0;
// Moved to the new darshan_shutdown
// if(final_job->trailing_data)
// free(final_job->trailing_data);
// mnt_data_count = 0;
free(logfile_name);
darshan_finalize(final_job);
// Moved to the new darshan_shutdown
// darshan_finalize(final_job);
if(timing_flag)
{
......@@ -2485,35 +2514,135 @@ void darshan_mnt_id_from_path(const char* path, int64_t* device_id, int64_t* blo
return;
}
static int epoch_counter = 0;
/*
 * Per-epoch snapshots of the global job's file tables.
 *
 * darshan_end_epoch() copies the current file_array / file_runtime_array of
 * darshan_global_job into slot [epoch_counter]; darshan_shutdown() copies
 * each slot back before writing that epoch's log.
 */
/* number of epoch slots available in the snapshot arrays below */
#define DARSHAN_MAX_EPOCHS 128
/* copy of darshan_global_job->file_array taken at the end of each epoch */
struct darshan_file epoch_file_array[DARSHAN_MAX_EPOCHS][CP_MAX_FILES];
/* copy of darshan_global_job->file_runtime_array taken at the end of each epoch */
struct darshan_file_runtime epoch_file_runtime_array[DARSHAN_MAX_EPOCHS][CP_MAX_FILES];
/* value of darshan_global_job->file_count when each epoch ended */
int epoch_file_count[DARSHAN_MAX_EPOCHS];
/*
 * darshan_reset_counters()
 *
 * Clears the per-file statistics held in darshan_global_job so that counting
 * starts again from zero.  For every tracked file:
 *   - all integer and floating point counters are zeroed, except CP_MODE,
 *     CP_DEVICE and CP_F_OPEN_TIMESTAMP, which are carried over;
 *   - the runtime bookkeeping (access/stride trees, last-byte and
 *     last-*-end markers, aio list) is reset.
 *
 * The tree roots and aio list pointers are only set to NULL here; nothing is
 * freed.  The runtime "offset" field is deliberately left untouched
 * (presumably because a file may stay open across epochs -- TODO confirm).
 *
 * Does nothing when instrumentation is not active (darshan_global_job is
 * NULL).  The function takes no lock itself; darshan_start_epoch() calls it
 * with CP_LOCK held.
 */
static void darshan_reset_counters(void)
{
    int i;

    /* nothing to reset if darshan was never initialized or is already off */
    if(!darshan_global_job)
        return;

    for(i = 0; i < darshan_global_job->file_count; i++)
    {
        /* values that must survive the wipe below */
        int64_t mode = darshan_global_job->file_array[i].counters[CP_MODE];
        int64_t device = darshan_global_job->file_array[i].counters[CP_DEVICE];
        double open_timestamp =
            darshan_global_job->file_array[i].fcounters[CP_F_OPEN_TIMESTAMP];

        memset(darshan_global_job->file_array[i].counters, 0,
            sizeof(int64_t)*CP_NUM_INDICES);
        memset(darshan_global_job->file_array[i].fcounters, 0,
            sizeof(double)*CP_F_NUM_INDICES);

        darshan_global_job->file_array[i].counters[CP_MODE] = mode;
        darshan_global_job->file_array[i].counters[CP_DEVICE] = device;
        darshan_global_job->file_array[i].fcounters[CP_F_OPEN_TIMESTAMP] =
            open_timestamp;

        /* access-size and stride bookkeeping */
        darshan_global_job->file_runtime_array[i].access_root = NULL;
        darshan_global_job->file_runtime_array[i].access_count = 0;
        darshan_global_job->file_runtime_array[i].stride_root = NULL;
        darshan_global_job->file_runtime_array[i].stride_count = 0;

        /* position / last-operation markers ("offset" is intentionally kept) */
        darshan_global_job->file_runtime_array[i].last_byte_read = 0;
        darshan_global_job->file_runtime_array[i].last_byte_written = 0;
        darshan_global_job->file_runtime_array[i].last_io_type = 0;
        darshan_global_job->file_runtime_array[i].last_posix_write_end = 0;
        darshan_global_job->file_runtime_array[i].last_mpi_write_end = 0;
        darshan_global_job->file_runtime_array[i].last_posix_read_end = 0;
        darshan_global_job->file_runtime_array[i].last_mpi_read_end = 0;
        darshan_global_job->file_runtime_array[i].last_posix_meta_end = 0;
        darshan_global_job->file_runtime_array[i].last_mpi_meta_end = 0;

        /* outstanding asynchronous I/O list */
        darshan_global_job->file_runtime_array[i].aio_list_head = NULL;
        darshan_global_job->file_runtime_array[i].aio_list_tail = NULL;
    }
}
/*
 * darshan_start_epoch()
 *
 * Marks the beginning of an epoch.  The counters accumulated so far in
 * darshan_global_job are reset so that only I/O performed between this call
 * and the matching darshan_end_epoch() is attributed to the epoch.
 *
 * On the very first epoch (epoch_counter == 0) the snapshot arrays are
 * cleared as well.
 *
 * The job itself is kept alive: it must not be finalized or re-initialized
 * here, because the reset below and the later darshan_end_epoch() /
 * darshan_shutdown() calls all operate on the same darshan_global_job.
 *
 * Does nothing when instrumentation is not active (darshan_global_job is
 * NULL).
 */
void darshan_start_epoch(void)
{
    CP_LOCK();

    if(!darshan_global_job)
    {
        CP_UNLOCK();
        return;
    }

    if(epoch_counter == 0)
    {
        /* first epoch: start from clean snapshot storage */
        memset(epoch_file_array, 0, sizeof(epoch_file_array));
        memset(epoch_file_runtime_array, 0, sizeof(epoch_file_runtime_array));
    }

    darshan_reset_counters();

    CP_UNLOCK();
    return;
}
void darshan_end_epoch(void)
{
int i;
CP_LOCK();
memcpy(epoch_file_array[epoch_counter],
darshan_global_job->file_array,
darshan_global_job->file_count* sizeof(struct darshan_file));
memcpy(epoch_file_runtime_array[epoch_counter],
darshan_global_job->file_runtime_array,
darshan_global_job->file_count* sizeof(struct darshan_file_runtime));
epoch_file_count[epoch_counter] = darshan_global_job->file_count;
epoch_counter++;
CP_UNLOCK();
return;
}
void darshan_end_epoch(void)
void darshan_shutdown(int timing_flag)
{
darshan_shutdown(0);
if (!epoch_counter)
darshan_shutdown_epoch(0);
else {
int i,j;
for(i=0; i<epoch_counter; i++){
CP_LOCK();
darshan_global_job->file_count = epoch_file_count[i];
memcpy(darshan_global_job->file_array,
epoch_file_array[i],
darshan_global_job->file_count* sizeof(struct darshan_file));
// Can not copy to avoid destroying the hash table pointers
for (j=0; j<darshan_global_job->file_count; j++) {
darshan_global_job->file_runtime_array[j].log_file = epoch_file_runtime_array[i][j].log_file;
darshan_global_job->file_runtime_array[j].access_root = epoch_file_runtime_array[i][j].access_root;
darshan_global_job->file_runtime_array[j].access_count = epoch_file_runtime_array[i][j].access_count;
darshan_global_job->file_runtime_array[j].stride_root = epoch_file_runtime_array[i][j].stride_root ;
darshan_global_job->file_runtime_array[j].stride_count = epoch_file_runtime_array[i][j].stride_count;
darshan_global_job->file_runtime_array[j].last_byte_read = epoch_file_runtime_array[i][j].last_byte_read;
darshan_global_job->file_runtime_array[j].last_byte_written = epoch_file_runtime_array[i][j].last_byte_written;
darshan_global_job->file_runtime_array[j].offset = epoch_file_runtime_array[i][j].offset;
darshan_global_job->file_runtime_array[j].last_io_type = epoch_file_runtime_array[i][j].last_io_type;
darshan_global_job->file_runtime_array[j].last_posix_write_end = epoch_file_runtime_array[i][j].last_posix_write_end;
darshan_global_job->file_runtime_array[j].last_mpi_write_end = epoch_file_runtime_array[i][j].last_mpi_write_end;
darshan_global_job->file_runtime_array[j].last_posix_read_end = epoch_file_runtime_array[i][j].last_posix_read_end;
darshan_global_job->file_runtime_array[j].last_mpi_read_end = epoch_file_runtime_array[i][j].last_mpi_read_end;
darshan_global_job->file_runtime_array[j].last_posix_meta_end = epoch_file_runtime_array[i][j].last_posix_meta_end;
darshan_global_job->file_runtime_array[j].last_mpi_meta_end = epoch_file_runtime_array[i][j].last_mpi_meta_end;
darshan_global_job->file_runtime_array[j].aio_list_head = epoch_file_runtime_array[i][j].aio_list_head;
darshan_global_job->file_runtime_array[j].aio_list_tail = epoch_file_runtime_array[i][j].aio_list_tail;
}
CP_UNLOCK();
darshan_shutdown_epoch(0);
}
}
// Moved here from previous darshan_shutdown
CP_LOCK();
if (darshan_global_job->trailing_data)
free(darshan_global_job->trailing_data);
mnt_data_count = 0;
darshan_finalize(darshan_global_job);
darshan_global_job = NULL;
return;
CP_UNLOCK();
}
/*
* Local variables:
* c-indent-level: 4
......
# Generated by configure from .in at Sun Jan 23 19:29:20 CET 2005
#
# Builds the epoch test program(s) listed in PRGS against the darshan
# runtime library found in the source tree (../../darshan-runtime/lib).

#CC = mpicc.darshan
CC = mpicc
LD = $(CC)

# -rpath lets the test find libdarshan at run time without LD_LIBRARY_PATH
CFLAGS = -O0 -g -Wall -Wl,-rpath,../../darshan-runtime/lib
CFLAGS += -I../../darshan-runtime/lib -I. -I../../darshan-runtime -I../../darshan-florin-extensions

#LDFLAGS = -Wl,-wrap,MPI_Init
LDFLAGS = -L../../darshan-runtime/lib -ldarshan
#-L/home/fisaila/software/darshan-florin-extensions/darshan-runtime/lib -ldarshan-posix #-rdynamic -L/home/fisaila/software/darshan-florin-extensions/darshan-runtime/lib -ldarshan

# extra objects / libraries linked into every program (none at the moment)
OBJS =
LIBS =
SRCS = $(patsubst %.o,%.c,$(OBJS))

PRGS = writef

.PHONY: all clean

all: $(PRGS)

%.o: %.c $(HEADERS)
	$(CC) $(CFLAGS) -c $*.c $(INCLUDE) -o $@

# Each program is linked from its own object file plus the shared OBJS.
# The library is located through LDFLAGS only; no absolute,
# developer-specific path is required to exist.
$(PRGS): % : %.o $(OBJS)
	$(CC) $(CFLAGS) -o $@ $< $(OBJS) $(LDFLAGS) $(LIBS)

clean:
	-rm -f *.o *~ $(PRGS)
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* (C) 2001 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#include "mpi.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <darshan-ext.h>
/*
 * Report a failed MPI call and terminate the job.
 *
 * errcode is translated to text with MPI_Error_string() and printed to
 * stderr as "<str>: <text>"; MPI_Abort() is then called on MPI_COMM_WORLD
 * with error code 1.
 */
static void handle_error(int errcode, char *str)
{
    int text_len;
    char text[MPI_MAX_ERROR_STRING];

    MPI_Error_string(errcode, text, &text_len);

    fprintf(stderr, "%s: %s\n", str, text);

    MPI_Abort(MPI_COMM_WORLD, 1);
}
/*
 * Epoch test driver.
 *
 * Usage: writef -fname filename
 *
 * The file name is taken as a command-line argument.  Every process opens
 * its own handle on the file (MPI_COMM_SELF) and performs I/O inside three
 * darshan epochs:
 *   epoch 1: open <filename>
 *   epoch 2: write to <filename>, open "abc", write to "abc"
 *   epoch 3: write to <filename> again
 * The time from the start of epoch 2 to its end is reduced with MPI_MAX and
 * reported by rank 0 as the write time / bandwidth.
 *
 * Any MPI-IO failure is reported through handle_error(), which aborts.
 */
int main(int argc, char **argv)
{
    int i, errcode;
    int nprocs, len, *buf, bufcount, rank;
    MPI_File fh, fh2;
    MPI_Status status;
    double stim, write_tim, new_write_tim, write_bw;
    char *filename = NULL;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    /* process 0 takes the file name as a command-line argument and
       broadcasts it to other processes */
    if (!rank) {
        const char *fname_arg = NULL;

        /* "-fname" must be followed by a value, hence argc - 1 */
        for (i = 1; i < argc - 1; i++) {
            if (!strcmp("-fname", argv[i])) {
                fname_arg = argv[i + 1];
                break;
            }
        }
        if (!fname_arg) {
            fprintf(stderr, "\n*# Usage: writef -fname filename\n\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
            return 1;
        }

        len = (int) strlen(fname_arg);
        filename = malloc(len + 1);
        if (!filename) {
            fprintf(stderr, "writef: out of memory\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
            return 1;
        }
        strcpy(filename, fname_arg);
        MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast(filename, len + 1, MPI_CHAR, 0, MPI_COMM_WORLD);
    }
    else {
        MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD);
        filename = malloc(len + 1);
        if (!filename) {
            fprintf(stderr, "writef: out of memory\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
            return 1;
        }
        MPI_Bcast(filename, len + 1, MPI_CHAR, 0, MPI_COMM_WORLD);
    }

    bufcount = 128*128*128;
    buf = malloc(bufcount * sizeof(int));
    if (!buf) {
        fprintf(stderr, "writef: out of memory\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
        return 1;
    }
    /* give the written data a defined content */
    memset(buf, 0, bufcount * sizeof(int));

    /* epoch 1: open only */
    darshan_start_epoch();
    errcode = MPI_File_open(MPI_COMM_SELF, filename,
                    MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh);
    darshan_end_epoch();
    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_open(1)");

    MPI_Barrier(MPI_COMM_WORLD);
    stim = MPI_Wtime();

    /* epoch 2: write to the first file, then open and write a second one */
    darshan_start_epoch();
    errcode = MPI_File_write_all(fh, buf, bufcount, MPI_INT, &status);
    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_write_all(1)");
    errcode = MPI_File_open(MPI_COMM_SELF, "abc",
                    MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh2);
    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_open(2)");
    errcode = MPI_File_write_all(fh2, buf, bufcount, MPI_INT, &status);
    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_write_all(2)");
    darshan_end_epoch();
    write_tim = MPI_Wtime() - stim;

    /* epoch 3: one more write to the first file */
    darshan_start_epoch();
    errcode = MPI_File_write_all(fh, buf, bufcount, MPI_INT, &status);
    if (errcode != MPI_SUCCESS) handle_error(errcode, "MPI_File_write_all(3)");
    darshan_end_epoch();

    MPI_File_close(&fh);
    MPI_File_close(&fh2);

    MPI_Allreduce(&write_tim, &new_write_tim, 1, MPI_DOUBLE, MPI_MAX,
                    MPI_COMM_WORLD);

    if (rank == 0) {
        write_bw = (bufcount*sizeof(int))/(new_write_tim*1024.0*1024.0);
        /* the size expression is a size_t; convert it to match %ld */
        fprintf(stderr, "Each of %d processes writes buf size=%ld\n",nprocs, (long)(bufcount*sizeof(int)));
        fprintf(stderr, "Collective write time = %f sec, Collective write bandwidth = %f Mbytes/sec\n", new_write_tim, write_bw);
    }

    free(filename);
    free(buf);

    MPI_Finalize();
    return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment