darshan-mpi-io.c 109 KB
Newer Older
1 2 3 4 5
/*
 *  (C) 2009 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

6 7 8
#define _XOPEN_SOURCE 500
#define _GNU_SOURCE /* for tdestroy() */

9
#include "darshan-runtime-config.h"
10

11
#include <stdio.h>
12
#ifdef HAVE_MNTENT_H
13
#include <mntent.h>
14
#endif
15 16 17 18 19 20 21
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/types.h>
22
#include <sys/stat.h>
23
#include <sys/vfs.h>
24 25 26 27 28 29
#include <zlib.h>
#include <assert.h>
#include <search.h>

#include "mpi.h"
#include "darshan.h"
30
#include "darshan-dynamic.h"
31
#include "darshan-ext.h"
32

33 34 35 36 37
extern char* __progname;

/* maximum number of memory segments each process will write to the log */
#define CP_MAX_MEM_SEGMENTS 8

38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
/* Some old versions of MPI don't provide all of these COMBINER definitions.  
 * If any are missing then we define them to an arbitrary value just to 
 * prevent compile errors in DATATYPE_INC().
 */
#ifndef MPI_COMBINER_NAMED
    #define MPI_COMBINER_NAMED CP_COMBINER_NAMED
#endif
#ifndef MPI_COMBINER_DUP
    #define MPI_COMBINER_DUP CP_COMBINER_DUP
#endif
#ifndef MPI_COMBINER_CONTIGUOUS
    #define MPI_COMBINER_CONTIGUOUS CP_COMBINER_CONTIGUOUS
#endif
#ifndef MPI_COMBINER_VECTOR
    #define MPI_COMBINER_VECTOR CP_COMBINER_VECTOR
#endif
#ifndef MPI_COMBINER_HVECTOR_INTEGER
    #define MPI_COMBINER_HVECTOR_INTEGER CP_COMBINER_HVECTOR_INTEGER
#endif
#ifndef MPI_COMBINER_HVECTOR
    #define MPI_COMBINER_HVECTOR CP_COMBINER_HVECTOR
#endif
#ifndef MPI_COMBINER_INDEXED
    #define MPI_COMBINER_INDEXED CP_COMBINER_INDEXED
#endif
#ifndef MPI_COMBINER_HINDEXED_INTEGER
    #define MPI_COMBINER_HINDEXED_INTEGER CP_COMBINER_HINDEXED_INTEGER
#endif
#ifndef MPI_COMBINER_HINDEXED
    #define MPI_COMBINER_HINDEXED CP_COMBINER_HINDEXED
#endif
#ifndef MPI_COMBINER_INDEXED_BLOCK
    #define MPI_COMBINER_INDEXED_BLOCK CP_COMBINER_INDEXED_BLOCK
#endif
#ifndef MPI_COMBINER_STRUCT_INTEGER
    #define MPI_COMBINER_STRUCT_INTEGER CP_COMBINER_STRUCT_INTEGER
#endif
#ifndef MPI_COMBINER_STRUCT
    #define MPI_COMBINER_STRUCT CP_COMBINER_STRUCT
#endif
#ifndef MPI_COMBINER_SUBARRAY
    #define MPI_COMBINER_SUBARRAY CP_COMBINER_SUBARRAY
#endif
#ifndef MPI_COMBINER_DARRAY
    #define MPI_COMBINER_DARRAY CP_COMBINER_DARRAY
#endif
#ifndef MPI_COMBINER_F90_REAL
    #define MPI_COMBINER_F90_REAL CP_COMBINER_F90_REAL
#endif
#ifndef MPI_COMBINER_F90_COMPLEX
    #define MPI_COMBINER_F90_COMPLEX CP_COMBINER_F90_COMPLEX
#endif
#ifndef MPI_COMBINER_F90_INTEGER
    #define MPI_COMBINER_F90_INTEGER CP_COMBINER_F90_INTEGER
#endif
#ifndef MPI_COMBINER_RESIZED
    #define MPI_COMBINER_RESIZED CP_COMBINER_RESIZED
#endif

97 98
/* CP_DATATYPE_INC(): bump the per-file counter matching the combiner that
 * was used to construct an MPI datatype.
 *
 * __file:     file record handed through to CP_INC()
 * __datatype: MPI datatype to classify
 *
 * The combiner is obtained with PMPI_Type_get_envelope().  If that call
 * fails, or the combiner is not one of the kinds listed below, no counter
 * is modified.
 */
#define CP_DATATYPE_INC(__file, __datatype) do {\
    int num_integers, num_addresses, num_datatypes, combiner, ret; \
    ret = DARSHAN_MPI_CALL(PMPI_Type_get_envelope)((__datatype), &num_integers, \
        &num_addresses, &num_datatypes, &combiner); \
    if(ret == MPI_SUCCESS) { \
        switch(combiner) { \
            case MPI_COMBINER_NAMED:\
                CP_INC(__file,CP_COMBINER_NAMED,1); break; \
            case MPI_COMBINER_DUP:\
                CP_INC(__file,CP_COMBINER_DUP,1); break; \
            case MPI_COMBINER_CONTIGUOUS:\
                CP_INC(__file,CP_COMBINER_CONTIGUOUS,1); break; \
            case MPI_COMBINER_VECTOR:\
                CP_INC(__file,CP_COMBINER_VECTOR,1); break; \
            case MPI_COMBINER_HVECTOR_INTEGER:\
                CP_INC(__file,CP_COMBINER_HVECTOR_INTEGER,1); break; \
            case MPI_COMBINER_HVECTOR:\
                CP_INC(__file,CP_COMBINER_HVECTOR,1); break; \
            case MPI_COMBINER_INDEXED:\
                CP_INC(__file,CP_COMBINER_INDEXED,1); break; \
            case MPI_COMBINER_HINDEXED_INTEGER:\
                CP_INC(__file,CP_COMBINER_HINDEXED_INTEGER,1); break; \
            case MPI_COMBINER_HINDEXED:\
                CP_INC(__file,CP_COMBINER_HINDEXED,1); break; \
            case MPI_COMBINER_INDEXED_BLOCK:\
                CP_INC(__file,CP_COMBINER_INDEXED_BLOCK,1); break; \
            case MPI_COMBINER_STRUCT_INTEGER:\
                CP_INC(__file,CP_COMBINER_STRUCT_INTEGER,1); break; \
            case MPI_COMBINER_STRUCT:\
                CP_INC(__file,CP_COMBINER_STRUCT,1); break; \
            case MPI_COMBINER_SUBARRAY:\
                CP_INC(__file,CP_COMBINER_SUBARRAY,1); break; \
            case MPI_COMBINER_DARRAY:\
                CP_INC(__file,CP_COMBINER_DARRAY,1); break; \
            case MPI_COMBINER_F90_REAL:\
                CP_INC(__file,CP_COMBINER_F90_REAL,1); break; \
            case MPI_COMBINER_F90_COMPLEX:\
                CP_INC(__file,CP_COMBINER_F90_COMPLEX,1); break; \
            case MPI_COMBINER_F90_INTEGER:\
                CP_INC(__file,CP_COMBINER_F90_INTEGER,1); break; \
            case MPI_COMBINER_RESIZED:\
                CP_INC(__file,CP_COMBINER_RESIZED,1); break; \
            default: \
                /* combiner kind that has no dedicated counter */ \
                break; \
        } \
    } \
} while(0)

143 144 145 146
/* Epoch number stamped on trace records, hint dumps and epoch log file
 * names.  The code that advances it is not in this part of the file.
 */
int epoch_counter = 0;

/* In-memory trace log: a byte buffer that darshan_trace_log_record() fills
 * with consecutive struct darshan_trace_record entries and that
 * darshan_trace_log_write() flushes to a file.
 */
char darshan_log[DARSHAN_TRACER_LOG_SIZE];

/* Number of bytes of darshan_log currently in use. */
long long int darshan_log_ptr = 0;
148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166

void darshan_trace_log_record(int rank, int epoch, int op, double tm1, double tm2, int send_count, int recv_count, long long int offset) {
       
    if (getenv("DARSHAN_TRACING")) {
        if (darshan_log_ptr +  sizeof(struct darshan_trace_record) > DARSHAN_TRACER_LOG_SIZE) {
            printf("Out of memory for log recording\n");
            return;
        }
        else {
            struct darshan_trace_record* d =  (struct darshan_trace_record*) (darshan_log + darshan_log_ptr);
	      	
	    PMPI_Comm_rank(MPI_COMM_WORLD, &(d->rank));
	    //d->rank = rank; 
            d->epoch = epoch;
            d->op = op;
            d->tm1 = tm1;
            d->tm2 = tm2;
            d->send_count = send_count;
            d->recv_count = recv_count;
fisaila's avatar
fisaila committed
167
	    d->offset = offset; 
168 169 170 171 172 173 174
            darshan_log_ptr += sizeof(struct darshan_trace_record);
        }
    }
}

void darshan_trace_log_write() {
    char *filename;
175 176 177
    char *dir;
    if ((dir=getenv("DARSHAN_TRACING"))!=NULL) {
    //if (getenv("DARSHAN_TRACING")){
178 179 180 181
        MPI_Offset offset;
        int rank;
        MPI_File fh;
        MPI_Status status;
182 183 184 185 186 187 188 189 190 191 192 193 194 195

        filename = (char*) malloc(PATH_MAX);

        DARSHAN_MPI_CALL(PMPI_Comm_rank)(MPI_COMM_WORLD, &rank);
	if (rank == 0) {
            struct tm* my_tm;
            time_t tm;
 
	    tm = time(NULL);
	    my_tm = localtime(&tm);
        	
	    snprintf(filename, PATH_MAX,
                    "%s/%s_%d-%d-%d.darshan_trace",
                    dir,
196 197 198 199 200
                    __progname,
                    (my_tm->tm_mon+1),
                    my_tm->tm_mday,
                    (my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec));

201 202 203 204 205
	    fprintf(stdout, "DARSHAN_TRACEFILE:%s\n", filename);
        }

        DARSHAN_MPI_CALL(PMPI_Bcast)(filename, PATH_MAX, MPI_CHAR, 0,
            MPI_COMM_WORLD);
206

207
        PMPI_Scan(&darshan_log_ptr, &offset, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
208
//        printf("%d: darshan_log_ptr=%lld offset=%lld\n", rank, darshan_log_ptr, offset-darshan_log_ptr);   
209 210 211 212 213 214 215 216 217 218
        DARSHAN_MPI_CALL(PMPI_File_open)(MPI_COMM_WORLD, filename, MPI_MODE_CREATE | MPI_MODE_WRONLY | MPI_MODE_EXCL, 
                      MPI_INFO_NULL, &fh);
        DARSHAN_MPI_CALL(PMPI_File_write_at_all)(fh, offset - darshan_log_ptr, darshan_log, darshan_log_ptr, MPI_BYTE, &status);
        DARSHAN_MPI_CALL(PMPI_File_close)(&fh);
	free(filename);
    }
}



219 220 221 222 223 224 225
int count_contiguous_blocks_memory(MPI_Datatype datatype, int count);
int count_contiguous_blocks_file(MPI_File fh, MPI_Offset foff1, MPI_Offset foff2);
MPI_Offset func_1_inf(MPI_File fh, MPI_Offset x, int memtype_size);
MPI_Offset func_1(MPI_File fh, MPI_Offset x);

/*
#define CP_RECORD_MPI_WRITE(__ret, __fh, __count, __datatype, __counter, __tm1, __tm2, __voff) do { \
226 227 228
    struct darshan_file_runtime* file; \
    int size = 0; \
    MPI_Aint extent = 0; \
229
    MPI_Offset foff1, foff2; \
230 231 232
    if(__ret != MPI_SUCCESS) break; \
    file = darshan_file_by_fh(__fh); \
    if(!file) break; \
233
    DARSHAN_MPI_CALL(PMPI_Type_size)(__datatype, &size);  \
234
    size = size * __count; \
235
    DARSHAN_MPI_CALL(PMPI_Type_extent)(__datatype, &extent); \
236 237 238 239
    CP_BUCKET_INC(file, CP_SIZE_WRITE_AGG_0_100, size); \
    CP_BUCKET_INC(file, CP_EXTENT_WRITE_0_100, extent); \
    CP_INC(file, __counter, 1); \
    CP_DATATYPE_INC(file, __datatype); \
240
    CP_F_INC_NO_OVERLAP(file, __tm1, __tm2, file->last_mpi_write_end, CP_F_MPI_WRITE_TIME); \
241 242 243
    if(CP_F_VALUE(file, CP_F_WRITE_START_TIMESTAMP) == 0) \
        CP_F_SET(file, CP_F_WRITE_START_TIMESTAMP, __tm1); \
    CP_F_SET(file, CP_F_WRITE_END_TIMESTAMP, __tm2); \
244 245 246 247 248 249 250 251 252
    CP_SET(file, CP_AVG_MEM_DTYPE_SIZE, size); \
    CP_SET(file, CP_AVG_MEM_DTYPE_EXTENT, extent * __count); \
    CP_SET(file, CP_AVG_MEM_DTYPE_BLOCKS, count_contiguous_blocks_memory(__datatype,  __count)); \
    MPI_File_get_byte_offset(__fh, __voff, &foff1); \
    MPI_File_get_byte_offset(__fh, __voff + size, &foff2); \
    CP_SET(file, CP_MIN_FILE_OFFSET, foff1);  \
    CP_SET(file, CP_MAX_FILE_OFFSET, foff2);  \
    CP_SET(file, CP_AVG_FILE_DTYPE_EXTENT, foff2 -foff1);  \
    CP_SET(file, CP_AVG_FILE_DTYPE_BLOCKS, count_contiguous_blocks_file(fh, foff1, foff2 )); \
253
} while(0)
254 255 256 257
*/

static struct darshan_file_runtime* darshan_file_by_fh(MPI_File fh);

258

259
/* printHints(): on rank 0 of MPI_COMM_WORLD, print the hints currently
 * attached to an open MPI file.
 *
 * Output goes to stdout: a header line carrying the current epoch_counter
 * followed by one "\t<key> = <value>" line per hint.  Other ranks do
 * nothing.
 *
 * fh: open MPI file handle to query
 */
void printHints(MPI_File fh)
{
    int rank;
    char key[MPI_MAX_INFO_VAL];
    char value[MPI_MAX_INFO_VAL];
    int flag, i, nkeys = 0;
    MPI_Info info;

    DARSHAN_MPI_CALL(PMPI_Comm_rank)(MPI_COMM_WORLD, &rank);
    if (rank != 0)
        return;

    if (MPI_File_get_info(fh, &info) != MPI_SUCCESS)
        return;

    MPI_Info_get_nkeys(info, &nkeys);
    fprintf(stdout,"MPI-IO hints epoch%d\n", epoch_counter);
    for (i = 0; i < nkeys; i++) {
        MPI_Info_get_nthkey(info, i, key);
        flag = 0;
        MPI_Info_get(info, key, MPI_MAX_INFO_VAL-1,
                     value, &flag);
        /* value is only filled in when the key was found */
        if (flag)
            fprintf(stdout,"\t%s = %s\n", key, value);
    }

    /* the info object returned by MPI_File_get_info belongs to the caller */
    MPI_Info_free(&info);
} /* printHints() */


281 282 283 284 285 286 287
/* CP_RECORD_MPI_WRITE(): update the per-file statistics and the trace log
 * after an MPI-IO write call.
 *
 * __ret:      return code of the write; anything but MPI_SUCCESS is ignored
 * __fh:       file handle written to; ignored if no record exists for it
 * __count:    number of __datatype elements written
 * __datatype: memory datatype of the write
 * __counter:  counter to increment by one; also stored as the trace "op"
 * __tm1:      start timestamp of the call
 * __tm2:      end timestamp of the call
 * __voff:     offset handed to func_1()/func_1_inf() to obtain the first
 *             and last file offsets (NOTE(review): presumably an offset
 *             relative to the file view -- confirm against the callers)
 */
void CP_RECORD_MPI_WRITE(int __ret, MPI_File __fh, int __count, MPI_Datatype __datatype, 
			 int64_t __counter, double __tm1, double __tm2, MPI_Offset __voff) { 
    struct darshan_file_runtime* file;
    int size = 0;
    MPI_Aint extent = 0;
    MPI_Offset foff1, foff2;
    int mem_blocks, file_blocks;

    if(__ret != MPI_SUCCESS) return;
    file = darshan_file_by_fh(__fh);
    if(!file) return;

    /* from here on, size is the total number of bytes in the request */
    DARSHAN_MPI_CALL(PMPI_Type_size)(__datatype, &size);
    size = size * __count;
    DARSHAN_MPI_CALL(PMPI_Type_extent)(__datatype, &extent);

    CP_BUCKET_INC(file, CP_SIZE_WRITE_AGG_0_100, size);
    CP_BUCKET_INC(file, CP_EXTENT_WRITE_0_100, extent);
    CP_INC(file, __counter, 1);
    CP_DATATYPE_INC(file, __datatype);
    CP_F_INC_NO_OVERLAP(file, __tm1, __tm2, file->last_mpi_write_end, CP_F_MPI_WRITE_TIME);
    if(CP_F_VALUE(file, CP_F_WRITE_START_TIMESTAMP) == 0)
        CP_F_SET(file, CP_F_WRITE_START_TIMESTAMP, __tm1);
    CP_F_SET(file, CP_F_WRITE_END_TIMESTAMP, __tm2);

    /* memory datatype statistics: the avg/max/min counters are all set to
     * the values of this request
     */
    CP_SET(file, CP_AVG_MEM_DTYPE_SIZE, size);
    CP_SET(file, CP_AVG_MEM_DTYPE_EXTENT, extent * __count);
    mem_blocks = count_contiguous_blocks_memory(__datatype,  __count);
    CP_SET(file, CP_AVG_MEM_DTYPE_BLOCKS, mem_blocks);
    CP_SET(file, CP_MAX_MEM_DTYPE_SIZE, size);
    CP_SET(file, CP_MAX_MEM_DTYPE_EXTENT, extent * __count);
    CP_SET(file, CP_MAX_MEM_DTYPE_BLOCKS, mem_blocks);
    CP_SET(file, CP_MIN_MEM_DTYPE_SIZE, size);
    CP_SET(file, CP_MIN_MEM_DTYPE_EXTENT, extent * __count);
    CP_SET(file, CP_MIN_MEM_DTYPE_BLOCKS, mem_blocks);

    /* file-side statistics: first and last file offset of the request */
    foff1 = func_1(__fh, __voff);
    foff2 = func_1_inf(__fh, __voff, size);
    file_blocks = (size>0)?count_contiguous_blocks_file(__fh, foff1, foff2 ):0;
    CP_SET(file, CP_MIN_FILE_OFFSET, foff1);
    CP_SET(file, CP_MAX_FILE_OFFSET, foff2);
    CP_SET(file, CP_AVG_FILE_DTYPE_EXTENT, foff2 - foff1 + 1);
    CP_SET(file, CP_AVG_FILE_DTYPE_BLOCKS, file_blocks);
    CP_SET(file, CP_MAX_FILE_DTYPE_EXTENT, foff2 - foff1 + 1);
    CP_SET(file, CP_MAX_FILE_DTYPE_BLOCKS, file_blocks);
    CP_SET(file, CP_MIN_FILE_DTYPE_EXTENT, foff2 - foff1 + 1);
    CP_SET(file, CP_MIN_FILE_DTYPE_BLOCKS, file_blocks);

    /* size already includes the factor __count, so it is passed as is;
     * multiplying by __count again would record count^2 * type size
     */
    darshan_trace_log_record(-1, epoch_counter,__counter,__tm1,__tm2,size, 0,__voff);
} 

331 332 333 334 335 336 337 338

/* CP_RECORD_MPI_READ(): update the per-file statistics after an MPI-IO
 * read call.
 *
 * __ret:      return code of the read; anything but MPI_SUCCESS is ignored
 * __fh:       file handle read from; ignored if no record exists for it
 * __count:    number of __datatype elements read
 * __datatype: memory datatype of the read
 * __counter:  counter to increment by one
 * __tm1:      start timestamp of the call
 * __tm2:      end timestamp of the call
 *
 * Arguments are parenthesized where they appear in expressions so that
 * callers may pass compound expressions such as "a + b" for __count.
 */
#define CP_RECORD_MPI_READ(__ret, __fh, __count, __datatype, __counter, __tm1, __tm2) do { \
    struct darshan_file_runtime* file; \
    int size = 0; \
    MPI_Aint extent = 0; \
    if((__ret) != MPI_SUCCESS) break; \
    file = darshan_file_by_fh(__fh); \
    if(!file) break; \
    DARSHAN_MPI_CALL(PMPI_Type_size)((__datatype), &size);  \
    size = size * (__count); \
    DARSHAN_MPI_CALL(PMPI_Type_extent)((__datatype), &extent); \
    CP_BUCKET_INC(file, CP_SIZE_READ_AGG_0_100, size); \
    CP_BUCKET_INC(file, CP_EXTENT_READ_0_100, extent); \
    CP_INC(file, __counter, 1); \
    CP_DATATYPE_INC(file, __datatype); \
    CP_F_INC_NO_OVERLAP(file, (__tm1), (__tm2), file->last_mpi_read_end, CP_F_MPI_READ_TIME); \
    if(CP_F_VALUE(file, CP_F_READ_START_TIMESTAMP) == 0) \
        CP_F_SET(file, CP_F_READ_START_TIMESTAMP, (__tm1)); \
    CP_F_SET(file, CP_F_READ_END_TIMESTAMP, (__tm2)); \
} while(0)

352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370

/* set_collective_io_hints(): copy collective-I/O tuning values from the
 * environment onto an open MPI file as hints.
 *
 * Each environment variable in the table below that is set is stored under
 * its hint key in a fresh MPI_Info object, which is then applied to the
 * file with MPI_File_set_info() and released.
 *
 * fh: open MPI file handle that receives the hints
 */
void  set_collective_io_hints(MPI_File fh){
    /* environment variable -> hint key, processed in this order.  The key
     * is a plain char* because older MPI headers declare MPI_Info_set()
     * without const.
     */
    static const struct {
        const char *env_name;
        char *hint_key;
    } hint_map[] = {
        { "ROMIO_CB_NODES",       "cb_nodes" },
        { "ROMIO_CB_BUFFER_SIZE", "cb_buffer_size" },
        { "ROMIO_BG_NODES_PSET",  "bg_nodes_pset" },
    };
    const size_t hint_total = sizeof(hint_map) / sizeof(hint_map[0]);
    MPI_Info hints;
    size_t idx;

    MPI_Info_create(&hints);
    for (idx = 0; idx < hint_total; idx++) {
        char *setting = getenv(hint_map[idx].env_name);

        if (setting != NULL)
            MPI_Info_set(hints, hint_map[idx].hint_key, setting);
    }
    MPI_File_set_info(fh, hints);
    MPI_Info_free(&hints);
}

371 372 373
static void cp_log_construct_indices(struct darshan_job_runtime* final_job,
    int rank, int* inout_count, int* lengths, void** pointers, char*
    trailing_data);
374
static int cp_log_write(struct darshan_job_runtime* final_job, int rank, 
375
    char* logfile_name, int count, int* lengths, void** pointers, double start_log_time);
376
static void cp_log_record_hints_and_ver(struct darshan_job_runtime* final_job, int rank);
377
static int cp_log_reduction(struct darshan_job_runtime* final_job, int rank, 
378
    char* logfile_name, MPI_Offset* next_offset);
379 380 381 382 383 384
static void darshan_file_reduce(void* infile_v, 
    void* inoutfile_v, int *len, 
    MPI_Datatype *datatype);
static int cp_log_compress(struct darshan_job_runtime* final_job,
    int rank, int* inout_count, int* lengths, void** pointers);
static int file_compare(const void* a, const void* b);
385 386 387 388 389 390
static int darshan_file_variance(
    struct darshan_file *infile_array,
    struct darshan_file *outfile_array,
    int count, int rank);
static void pairwise_variance_reduce (
    void *invec, void *inoutvec, int *len, MPI_Datatype *dt);
391
#if 0
392
static void debug_mounts(const char* mtab_file, const char* out_file);
393
#endif
394

395
//static struct darshan_file_runtime* darshan_file_by_fh(MPI_File fh);
396 397
static void darshan_file_close_fh(MPI_File fh);
static struct darshan_file_runtime* darshan_file_by_name_setfh(const char* name, MPI_File fh);
398

399
/* Capacity of the mount table and of the strings stored in each entry. */
#define CP_MAX_MNTS 32
#define CP_MAX_MNT_PATH 256
#define CP_MAX_MNT_TYPE 32

/* One entry of the mount table. */
struct mnt_data
{
    int64_t hash;                /* NOTE(review): presumably a hash of path -- confirm where entries are filled in */
    int64_t block_size;          /* NOTE(review): presumably the file system block size -- confirm */
    char path[CP_MAX_MNT_PATH];  /* mount path string, at most CP_MAX_MNT_PATH bytes */
    char type[CP_MAX_MNT_TYPE];  /* type string, at most CP_MAX_MNT_TYPE bytes */
};

/* Mount table and the number of entries of it that are in use. */
static struct mnt_data mnt_data_array[CP_MAX_MNTS];
static int mnt_data_count = 0;
411

412 413 414 415 416 417 418
/* Three doubles grouped for a variance computation.
 * NOTE(review): presumably the element type handled by
 * pairwise_variance_reduce() and darshan_file_variance(), whose bodies are
 * not visible here -- confirm the field meanings below against them.
 */
struct variance_dt
{
    double n; /* presumably the number of samples combined so far -- TODO confirm */
    double T; /* presumably the running sum of the samples -- TODO confirm */
    double S; /* presumably the running sum of squared deviations -- TODO confirm */
};

419 420 421 422 423
// The next two variables used for the context of the file access operation
// e.g. for tracing all the MPI communication from an MPI_File_write
static __thread char *crt_filename = NULL;
static __thread MPI_File *crt_fh = NULL;

424
void darshan_mpi_initialize(int *argc, char ***argv)
425 426 427
{
    int nprocs;
    int rank;
428 429
    int timing_flag = 0;
    double init_start, init_time, init_max;
430

431 432
    DARSHAN_MPI_CALL(PMPI_Comm_size)(MPI_COMM_WORLD, &nprocs);
    DARSHAN_MPI_CALL(PMPI_Comm_rank)(MPI_COMM_WORLD, &rank);
433 434 435 436 437 438
    
    if(getenv("DARSHAN_INTERNAL_TIMING"))
        timing_flag = 1;

    if(timing_flag)
        init_start = DARSHAN_MPI_CALL(PMPI_Wtime)();
439 440 441 442 443 444 445 446 447 448

    if(argc && argv)
    {
        darshan_initialize(*argc, *argv, nprocs, rank);
    }
    else
    {
        /* we don't see argc and argv here in fortran */
        darshan_initialize(0, NULL, nprocs, rank);
    }
449 450 451 452 453 454 455 456 457 458 459 460
    
    if(timing_flag)
    {
        init_time = DARSHAN_MPI_CALL(PMPI_Wtime)() - init_start;
        DARSHAN_MPI_CALL(PMPI_Reduce)(&init_time, &init_max, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        if(rank == 0)
        {
            printf("#darshan:<op>\t<nprocs>\t<time>\n");
            printf("darshan:init\t%d\t%f\n", nprocs, init_max);
        }
    }
461

462
    return;
463 464
}

465
void darshan_shutdown_epoch(struct darshan_job_runtime* final_job, int timing_flag)
466 467 468
{
    int rank;
    char* logfile_name;
469
    //struct darshan_job_runtime* final_job;
470 471 472 473 474
    double start_log_time = 0;
    int all_ret = 0;
    int local_ret = 0;
    MPI_Offset next_offset = 0;
    char* jobid_str;
475 476
    char* envjobid;
    char* logpath;
477 478 479 480 481 482
    int jobid;
    int index_count = 0;
    int lengths[CP_MAX_MEM_SEGMENTS];
    void* pointers[CP_MAX_MEM_SEGMENTS];
    int ret;
    double red1=0, red2=0, gz1=0, gz2=0, write1=0, write2=0, tm_end=0;
483
    double bcst=0;
484
    int nprocs;
485
    time_t start_time_tmp = 0;
486 487
    uint64_t logmod;
    char hname[HOST_NAME_MAX];
488 489 490 491 492
    char* logpath_override = NULL;
#ifdef __CP_LOG_ENV
    char env_check[256];
    char* env_tok;
#endif
493
    uint64_t hlevel;
494
    static int epoch_idx = 0;
495

496 497 498 499 500 501
    //CP_LOCK();
    //if(!darshan_global_job)
    //{
    //    CP_UNLOCK();
    //    return;
    //}
502 503 504
    /* disable further tracing while hanging onto the data so that we can
     * write it out
     */
505
    //final_job = darshan_global_job;
506 507
    //  Moved to the new darshan_shutdown   
    //    darshan_global_job = NULL;
508
    //CP_UNLOCK();
509

510
    start_log_time = DARSHAN_MPI_CALL(PMPI_Wtime)();
511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556

    /* figure out which access sizes to log */
    darshan_walk_file_accesses(final_job);

    /* if the records have been condensed, then zero out fields that are no
     * longer valid for safety 
     */
    if(final_job->flags & CP_FLAG_CONDENSED && final_job->file_count)
    {
        CP_SET(&final_job->file_runtime_array[0], CP_MODE, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_CONSEC_READS, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_CONSEC_WRITES, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_SEQ_READS, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_SEQ_WRITES, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE1_STRIDE, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE2_STRIDE, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE3_STRIDE, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE4_STRIDE, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE1_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE2_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE3_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_STRIDE4_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS1_ACCESS, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS2_ACCESS, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS3_ACCESS, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS4_ACCESS, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS1_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS2_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS3_COUNT, 0);
        CP_SET(&final_job->file_runtime_array[0], CP_ACCESS4_COUNT, 0);
        
        CP_F_SET(&final_job->file_runtime_array[0], CP_F_OPEN_TIMESTAMP, 0);
        CP_F_SET(&final_job->file_runtime_array[0], CP_F_CLOSE_TIMESTAMP, 0);
        CP_F_SET(&final_job->file_runtime_array[0], CP_F_READ_START_TIMESTAMP, 0);
        CP_F_SET(&final_job->file_runtime_array[0], CP_F_READ_END_TIMESTAMP, 0);
        CP_F_SET(&final_job->file_runtime_array[0], CP_F_WRITE_START_TIMESTAMP, 0);
        CP_F_SET(&final_job->file_runtime_array[0], CP_F_WRITE_END_TIMESTAMP, 0);
    }

    logfile_name = malloc(PATH_MAX);
    if(!logfile_name)
    {
        darshan_finalize(final_job);
        return;
    }

557
    DARSHAN_MPI_CALL(PMPI_Comm_rank)(MPI_COMM_WORLD, &rank);
558 559 560 561 562 563 564

    /* construct log file name */
    if(rank == 0)
    {
        char cuser[L_cuserid] = {0};
        struct tm* my_tm;

565 566 567 568 569 570 571 572 573 574 575
        /* Use CP_JOBID_OVERRIDE for the env var or CP_JOBID */
        envjobid = getenv(CP_JOBID_OVERRIDE);
        if (!envjobid)
        {
            envjobid = CP_JOBID;
        }

        /* Use CP_LOG_PATH_OVERRIDE for the value or __CP_LOG_PATH */
        logpath = getenv(CP_LOG_PATH_OVERRIDE);
        if (!logpath)
        {
576
#ifdef __CP_LOG_PATH
577
            logpath = __CP_LOG_PATH;
578
#endif
579 580
        }

581
        /* find a job id */
582
        jobid_str = getenv(envjobid);
583 584 585 586 587 588 589 590 591 592 593 594
        if(jobid_str)
        {
            /* in cobalt we can find it in env var */
            ret = sscanf(jobid_str, "%d", &jobid);
        }
        if(!jobid_str || ret != 1)
        {
            /* use pid as fall back */
            jobid = getpid();
        }

        /* break out time into something human readable */
595 596
        start_time_tmp += final_job->log_job.start_time;
        my_tm = localtime(&start_time_tmp);
597

598 599 600 601 602 603 604 605 606 607 608
        /* get the username for this job.  In order we will try each of the
         * following until one of them succeeds:
         *
         * - cuserid()
         * - getenv("LOGNAME")
         * - snprintf(..., geteuid());
         *
         * Note that we do not use getpwuid() because it generally will not
         * work in statically compiled binaries.
         */

609
#ifndef DARSHAN_DISABLE_CUSERID
610
        cuserid(cuser);
611
#endif
612 613 614 615 616 617 618 619 620 621 622 623 624 625

        /* if cuserid() didn't work, then check the environment */
        if (strcmp(cuser, "") == 0)
        {
            char* logname_string;
            logname_string = getenv("LOGNAME");
            if(logname_string)
            {
                strncpy(cuser, logname_string, (L_cuserid-1));
            }

        }

        /* if cuserid() and environment both fail, then fall back to uid */
626 627 628 629 630
        if (strcmp(cuser, "") == 0)
        {
            uid_t uid = geteuid();
            snprintf(cuser, sizeof(cuser), "%u", uid);
        }
631

632
        /* generate a random number to help differentiate the log */
633
        hlevel=DARSHAN_MPI_CALL(PMPI_Wtime)() * 1000000;
634
        (void) gethostname(hname, sizeof(hname));
635
        logmod = darshan_hash((void*)hname,strlen(hname),hlevel);
636

637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664
        /* see if darshan was configured using the --with-logpath-by-env
         * argument, which allows the user to specify an absolute path to
         * place logs via an env variable.
         */
#ifdef __CP_LOG_ENV
        /* just silently skip if the environment variable list is too big */
        if(strlen(__CP_LOG_ENV) < 256)
        {
            /* copy env variable list to a temporary buffer */
            strcpy(env_check, __CP_LOG_ENV);
            /* tokenize the comma-separated list */
            env_tok = strtok(env_check, ",");
            if(env_tok)
            {
                do
                {
                    /* check each env variable in order */
                    logpath_override = getenv(env_tok); 
                    if(logpath_override)
                    {
                        /* stop as soon as we find a match */
                        break;
                    }
                }while((env_tok = strtok(NULL, ",")));
            }
        }
#endif

665
       
666
        if(logpath_override)
667
        {
668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686
	    if (epoch_counter > 0)
		ret = snprintf(logfile_name, PATH_MAX, 
		    "%s/%s_%s_id%d_epoch%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
		    logpath_override, 
		    cuser, __progname, jobid, epoch_idx++,
                    (my_tm->tm_mon+1), 
                    my_tm->tm_mday, 
                    (my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
                    logmod);
 
	    else
		ret = snprintf(logfile_name, PATH_MAX, 
                    "%s/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
                    logpath_override, 
                    cuser, __progname, jobid,
                    (my_tm->tm_mon+1), 
                    my_tm->tm_mday, 
                    (my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
                    logmod);
687 688 689 690 691 692 693 694
            if(ret == (PATH_MAX-1))
            {
                /* file name was too big; squish it down */
                snprintf(logfile_name, PATH_MAX,
                    "%s/id%d.darshan_partial",
                    logpath_override, jobid);
            }
        }
695
        else if(logpath)
696
        {
697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717
	    if (epoch_counter > 0)	    
		ret = snprintf(logfile_name, PATH_MAX, 
		    "%s/%d/%d/%d/%s_%s_id%d_epoch%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
                    logpath, (my_tm->tm_year+1900), 
                    (my_tm->tm_mon+1), my_tm->tm_mday, 
		    cuser, __progname, jobid, epoch_idx++,
                    (my_tm->tm_mon+1), 
                    my_tm->tm_mday, 
                    (my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
                    logmod);
	    else
		ret = snprintf(logfile_name, PATH_MAX, 
		    "%s/%d/%d/%d/%s_%s_id%d_%d-%d-%d-%" PRIu64 ".darshan_partial",
                    logpath, (my_tm->tm_year+1900), 
                    (my_tm->tm_mon+1), my_tm->tm_mday, 
                    cuser, __progname, jobid,
                    (my_tm->tm_mon+1), 
                    my_tm->tm_mday, 
                    (my_tm->tm_hour*60*60 + my_tm->tm_min*60 + my_tm->tm_sec),
                    logmod);

718 719 720 721 722 723 724
            if(ret == (PATH_MAX-1))
            {
                /* file name was too big; squish it down */
                snprintf(logfile_name, PATH_MAX,
                    "%s/id%d.darshan_partial",
                    logpath, jobid);
            }
725
        }
726 727 728 729
        else
        {
            logfile_name[0] = '\0';
        }
730 731 732

        /* add jobid */
        final_job->log_job.jobid = (int64_t)jobid;
733 734 735
    }

    /* broadcast log file name */
736
    bcst=DARSHAN_MPI_CALL(PMPI_Wtime)();
737 738
    DARSHAN_MPI_CALL(PMPI_Bcast)(logfile_name, PATH_MAX, MPI_CHAR, 0,
        MPI_COMM_WORLD);
739

740 741 742 743 744 745 746
    if(strlen(logfile_name) == 0)
    {
        /* failed to generate log file name */
        darshan_finalize(final_job);
	return;
    }

747 748 749 750
    final_job->log_job.end_time = time(NULL);

    /* reduce records for shared files */
    if(timing_flag)
751
        red1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
752 753
    local_ret = cp_log_reduction(final_job, rank, logfile_name, 
        &next_offset);
754
    if(timing_flag)
755 756
        red2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
    DARSHAN_MPI_CALL(PMPI_Allreduce)(&local_ret, &all_ret, 1, MPI_INT, MPI_LOR, 
757 758
        MPI_COMM_WORLD);

759 760 761
    /* if we are using any hints to write the log file, then record those
     * hints in the log file header
     */
762
    cp_log_record_hints_and_ver(final_job, rank);
763

764 765 766 767
    if(all_ret == 0)
    {
        /* collect data to write from local process */
        cp_log_construct_indices(final_job, rank, &index_count, lengths, 
768
            pointers, final_job->trailing_data);
769 770 771 772 773 774
    }

    if(all_ret == 0)
    {
        /* compress data */
        if(timing_flag)
775
            gz1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
776 777 778
        local_ret = cp_log_compress(final_job, rank, &index_count, 
            lengths, pointers);
        if(timing_flag)
779 780 781
            gz2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
        DARSHAN_MPI_CALL(PMPI_Allreduce)(&local_ret, &all_ret, 1,
            MPI_INT, MPI_LOR, MPI_COMM_WORLD);
782 783 784 785 786 787
    }

    if(all_ret == 0)
    {
        /* actually write out log file */
        if(timing_flag)
788
            write1 = DARSHAN_MPI_CALL(PMPI_Wtime)();
789
        local_ret = cp_log_write(final_job, rank, logfile_name, 
790 791
            index_count, lengths, pointers, start_log_time);
        if(timing_flag)
792 793 794
            write2 = DARSHAN_MPI_CALL(PMPI_Wtime)();
        DARSHAN_MPI_CALL(PMPI_Allreduce)(&local_ret, &all_ret, 1,
            MPI_INT, MPI_LOR, MPI_COMM_WORLD);
795 796
    }

797
    if(rank == 0)
798
    {
799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825
        if(all_ret != 0)
        {
            fprintf(stderr, "darshan library warning: unable to write log file %s\n", logfile_name);
            /* if any process failed to write log, then delete the whole 
             * file so we don't leave corrupted results
             */
            unlink(logfile_name);
        }
        else
        {
            /* rename from *.darshan_partial to *-<logwritetime>.darshan.gz,
             * which indicates that this log file is complete and ready for
             * analysis
             */ 
            char* mod_index;
            double end_log_time;
            char* new_logfile_name;

            new_logfile_name = malloc(PATH_MAX);
            if(new_logfile_name)
            {
                new_logfile_name[0] = '\0';
                end_log_time = DARSHAN_MPI_CALL(PMPI_Wtime)();
                strcat(new_logfile_name, logfile_name);
                mod_index = strstr(new_logfile_name, ".darshan_partial");
                sprintf(mod_index, "_%d.darshan.gz", (int)(end_log_time-start_log_time+1));
                rename(logfile_name, new_logfile_name);
826
		fprintf(stdout, "DARSHAN_LOGFILE:%s\n", new_logfile_name);
827
                /* set permissions on log file */
828 829 830
#ifdef __CP_GROUP_READABLE_LOGS
                chmod(new_logfile_name, (S_IRUSR|S_IRGRP)); 
#else
831
                chmod(new_logfile_name, (S_IRUSR)); 
832
#endif
833 834 835
                free(new_logfile_name);
            }
        }
836 837
    }

838 839 840 841
    //  Moved to the new darshan_shutdown   
    // if(final_job->trailing_data)
    //    free(final_job->trailing_data);
    // mnt_data_count = 0;
842
    free(logfile_name);
843 844
    //  Moved to the new darshan_shutdown 
    //    darshan_finalize(final_job);
845 846 847 848 849 850 851
    
    if(timing_flag)
    {
        double red_tm, red_slowest;
        double gz_tm, gz_slowest;
        double write_tm, write_slowest;
        double all_tm, all_slowest;
852
        double bcst_tm, bcst_slowest;
853
        
854
        tm_end = DARSHAN_MPI_CALL(PMPI_Wtime)();
855

856
        bcst_tm= red1-bcst;
857 858 859 860 861
        red_tm = red2-red1;
        gz_tm = gz2-gz1;
        write_tm = write2-write1;
        all_tm = tm_end-start_log_time;

862 863 864 865 866 867 868 869 870 871
        DARSHAN_MPI_CALL(PMPI_Reduce)(&red_tm, &red_slowest, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        DARSHAN_MPI_CALL(PMPI_Reduce)(&gz_tm, &gz_slowest, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        DARSHAN_MPI_CALL(PMPI_Reduce)(&write_tm, &write_slowest, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        DARSHAN_MPI_CALL(PMPI_Reduce)(&all_tm, &all_slowest, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        DARSHAN_MPI_CALL(PMPI_Reduce)(&bcst_tm, &bcst_slowest, 1,
            MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
872 873 874

        if(rank == 0)
        {
875
            DARSHAN_MPI_CALL(PMPI_Comm_size)(MPI_COMM_WORLD, &nprocs);
876 877 878 879 880
            printf("#darshan:<op>\t<nprocs>\t<time>\n");
            printf("darshan:bcst\t%d\t%f\n", nprocs, bcst_slowest);
            printf("darshan:reduce\t%d\t%f\n", nprocs, red_slowest);
            printf("darshan:gzip\t%d\t%f\n", nprocs, gz_slowest);
            printf("darshan:write\t%d\t%f\n", nprocs, write_slowest);
881
            printf("darshan:bcast+reduce+gzip+write\t%d\t%f\n", nprocs, all_slowest);
882 883 884 885 886 887
        }
    }

    return;
}

888 889 890
#ifdef HAVE_MPIIO_CONST
int MPI_File_open(MPI_Comm comm, const char *filename, int amode, MPI_Info info, MPI_File *fh) 
#else
891
int MPI_File_open(MPI_Comm comm, char *filename, int amode, MPI_Info info, MPI_File *fh) 
892
#endif
893 894 895 896 897 898
{
    int ret;
    struct darshan_file_runtime* file;
    char* tmp;
    int comm_size;
    double tm1, tm2;
899 900 901 902 903 904 905 906 907 908 909
    char *romio_file_prefix, *new_filename;

    
    // Add a ROMIO prefix through an env variable 
    if (romio_file_prefix = getenv("ROMIO_FILE_PREFIX")) {
	new_filename = (char *) malloc (strlen(romio_file_prefix) + strlen(filename) + 1);
	strcpy(new_filename, romio_file_prefix);
	strcat(new_filename, filename);
    }
    else
	new_filename = filename;	
910

911
    //printf("new_filename=%s\n", new_filename);
912
    tm1 = darshan_wtime();
913
    ret = DARSHAN_MPI_CALL(PMPI_File_open)(comm, new_filename, amode, info, fh);
914 915
    tm2 = darshan_wtime();

916 917 918
    if (romio_file_prefix)
	free(new_filename);

919 920 921 922 923 924 925 926 927 928 929 930 931 932
    if(ret == MPI_SUCCESS)
    {
        CP_LOCK();

        /* use ROMIO approach to strip prefix if present */
        /* strip off prefix if there is one, but only skip prefixes
         * if they are greater than length one to allow for windows
         * drive specifications (e.g. c:\...) 
         */
        tmp = strchr(filename, ':');
        if (tmp > filename + 1) {
            filename = tmp + 1;
        }

933 934
        file = darshan_file_by_name_setfh(filename, (*fh));
        if(file)
935 936
        {
            CP_SET(file, CP_MODE, amode);
937
            CP_F_INC_NO_OVERLAP(file, tm1, tm2, file->last_mpi_meta_end, CP_F_MPI_META_TIME);
938
            if(CP_F_VALUE(file, CP_F_OPEN_TIMESTAMP) == 0)
939 940 941
                CP_F_SET(file, CP_F_OPEN_TIMESTAMP,
                DARSHAN_MPI_CALL(PMPI_Wtime)());
            DARSHAN_MPI_CALL(PMPI_Comm_size)(comm, &comm_size);
942 943 944 945 946 947 948 949 950 951 952 953 954
            if(comm_size == 1)
            {
                CP_INC(file, CP_INDEP_OPENS, 1);
            }
            else
            {
                CP_INC(file, CP_COLL_OPENS, 1);
            }
            if(info != MPI_INFO_NULL)
            {
                CP_INC(file, CP_HINTS, 1);
            }
        }
955

956 957 958 959 960 961 962 963 964 965 966 967 968 969
        CP_UNLOCK();
    }

    return(ret);
}

/*
 * MPI_File_close wrapper.
 *
 * The handle value is copied before PMPI_File_close runs, and that copy is
 * what is used afterwards to find the Darshan record.  When a record exists,
 * CP_F_CLOSE_TIMESTAMP is set from PMPI_Wtime, the close interval is folded
 * into CP_F_MPI_META_TIME, and darshan_file_close_fh() is called for the
 * saved handle.  The bookkeeping runs regardless of the PMPI return code.
 *
 * Returns the PMPI_File_close result unchanged.
 */
int MPI_File_close(MPI_File *fh)
{
    MPI_File saved_fh = *fh;
    struct darshan_file_runtime* rec;
    double t_start, t_stop;
    int rc;

    t_start = darshan_wtime();
    rc = DARSHAN_MPI_CALL(PMPI_File_close)(fh);
    t_stop = darshan_wtime();

    CP_LOCK();
    rec = darshan_file_by_fh(saved_fh);
    if(rec)
    {
        CP_F_SET(rec, CP_F_CLOSE_TIMESTAMP, DARSHAN_MPI_CALL(PMPI_Wtime)());
        CP_F_INC_NO_OVERLAP(rec, t_start, t_stop, rec->last_mpi_meta_end, CP_F_MPI_META_TIME);
        darshan_file_close_fh(saved_fh);
    }
    CP_UNLOCK();

    return rc;
}

/*
 * MPI_File_sync wrapper.
 *
 * After a successful PMPI_File_sync, the sync interval is folded into
 * CP_F_MPI_WRITE_TIME and CP_SYNCS is incremented on the record for fh, if
 * one exists.  Nothing is recorded when the sync fails.
 *
 * Returns the PMPI_File_sync result unchanged.
 */
int MPI_File_sync(MPI_File fh)
{
    struct darshan_file_runtime* rec;
    double t_start, t_stop;
    int rc;

    t_start = darshan_wtime();
    rc = DARSHAN_MPI_CALL(PMPI_File_sync)(fh);
    t_stop = darshan_wtime();

    if(rc != MPI_SUCCESS)
        return rc;

    CP_LOCK();
    rec = darshan_file_by_fh(fh);
    if(rec)
    {
        CP_F_INC_NO_OVERLAP(rec, t_start, t_stop, rec->last_mpi_write_end, CP_F_MPI_WRITE_TIME);
        CP_INC(rec, CP_SYNCS, 1);
    }
    CP_UNLOCK();

    return rc;
}


1011 1012 1013 1014
#ifdef HAVE_MPIIO_CONST
int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype, 
    MPI_Datatype filetype, const char *datarep, MPI_Info info)
#else
1015 1016
int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype, 
    MPI_Datatype filetype, char *datarep, MPI_Info info)
1017
#endif
1018 1019 1020 1021 1022 1023
{
    int ret;
    struct darshan_file_runtime* file;
    double tm1, tm2;

    tm1 = darshan_wtime();
1024 1025
    ret = DARSHAN_MPI_CALL(PMPI_File_set_view)(fh, disp, etype,
        filetype, datarep, info);
1026 1027 1028 1029 1030 1031 1032 1033 1034 1035
    tm2 = darshan_wtime();
    if(ret == MPI_SUCCESS)
    {
        CP_LOCK();
        file = darshan_file_by_fh(fh);
        if(file)
        {
            CP_INC(file, CP_VIEWS, 1);
            if(info != MPI_INFO_NULL)
            {
1036
                CP_F_INC_NO_OVERLAP(file, tm1, tm2, file->last_mpi_meta_end, CP_F_MPI_META_TIME);
1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052
                CP_INC(file, CP_HINTS, 1);
            }
            CP_DATATYPE_INC(file, filetype);
        }
        CP_UNLOCK();
    }

    return(ret);
}

/*
 * MPI_File_read wrapper.
 *
 * Forwards to PMPI_File_read, then hands the return code, handle, count,
 * datatype and the wall-clock interval of the PMPI call to
 * CP_RECORD_MPI_READ under the CP_INDEP_READS counter.  crt_fh points at
 * this call's handle for the duration of the wrapper and is reset to NULL
 * before returning.
 *
 * Returns the PMPI_File_read result unchanged.
 */
int MPI_File_read(MPI_File fh, void *buf, int count,
    MPI_Datatype datatype, MPI_Status *status)
{
    double t_start, t_stop;
    int rc;

    crt_fh = &fh;

    t_start = darshan_wtime();
    rc = DARSHAN_MPI_CALL(PMPI_File_read)(fh, buf, count, datatype, status);
    t_stop = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_INDEP_READS, t_start, t_stop);
    CP_UNLOCK();

    crt_fh = NULL;
    return rc;
}

/*
 * MPI_File_read_at wrapper.
 *
 * Forwards to PMPI_File_read_at, then passes the outcome and the wall-clock
 * interval of the PMPI call to CP_RECORD_MPI_READ under CP_INDEP_READS.
 * The explicit offset is forwarded to PMPI but is not part of what is
 * handed to the recording macro.  crt_fh points at this call's handle while
 * the wrapper runs and is reset to NULL before returning.
 *
 * Returns the PMPI_File_read_at result unchanged.
 */
int MPI_File_read_at(MPI_File fh, MPI_Offset offset, void *buf,
    int count, MPI_Datatype datatype, MPI_Status *status)
{
    double t_start, t_stop;
    int rc;

    crt_fh = &fh;

    t_start = darshan_wtime();
    rc = DARSHAN_MPI_CALL(PMPI_File_read_at)(fh, offset, buf, count,
        datatype, status);
    t_stop = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_INDEP_READS, t_start, t_stop);
    CP_UNLOCK();

    crt_fh = NULL;
    return rc;
}

/*
 * MPI_File_read_at_all wrapper.
 *
 * Forwards to PMPI_File_read_at_all, then passes the outcome and the
 * wall-clock interval of the PMPI call to CP_RECORD_MPI_READ under
 * CP_COLL_READS.  crt_fh points at this call's handle while the wrapper
 * runs and is reset to NULL before returning.
 *
 * Returns the PMPI_File_read_at_all result unchanged.
 */
int MPI_File_read_at_all(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, MPI_Status * status)
{
    double t_start, t_stop;
    int rc;

    crt_fh = &fh;

    t_start = darshan_wtime();
    rc = DARSHAN_MPI_CALL(PMPI_File_read_at_all)(fh, offset, buf, count,
        datatype, status);
    t_stop = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_COLL_READS, t_start, t_stop);
    CP_UNLOCK();

    crt_fh = NULL;
    return rc;
}

/*
 * MPI_File_read_all wrapper.
 *
 * Forwards to PMPI_File_read_all, then passes the outcome and the
 * wall-clock interval of the PMPI call to CP_RECORD_MPI_READ under
 * CP_COLL_READS.  crt_fh points at this call's handle while the wrapper
 * runs and is reset to NULL before returning.
 *
 * Returns the PMPI_File_read_all result unchanged.
 */
int MPI_File_read_all(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
{
    double t_start, t_stop;
    int rc;

    crt_fh = &fh;

    t_start = darshan_wtime();
    rc = DARSHAN_MPI_CALL(PMPI_File_read_all)(fh, buf, count, datatype, status);
    t_stop = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_COLL_READS, t_start, t_stop);
    CP_UNLOCK();

    crt_fh = NULL;
    return rc;
}

/*
 * MPI_File_read_shared wrapper.
 *
 * Forwards to PMPI_File_read_shared, then passes the outcome and the
 * wall-clock interval of the PMPI call to CP_RECORD_MPI_READ under
 * CP_INDEP_READS.  crt_fh points at this call's handle while the wrapper
 * runs and is reset to NULL before returning.
 *
 * Returns the PMPI_File_read_shared result unchanged.
 */
int MPI_File_read_shared(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
{
    double t_start, t_stop;
    int rc;

    crt_fh = &fh;

    t_start = darshan_wtime();
    rc = DARSHAN_MPI_CALL(PMPI_File_read_shared)(fh, buf, count, datatype,
        status);
    t_stop = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_INDEP_READS, t_start, t_stop);
    CP_UNLOCK();

    crt_fh = NULL;
    return rc;
}

/*
 * MPI_File_read_ordered wrapper.
 *
 * Forwards to PMPI_File_read_ordered, then passes the outcome and the
 * wall-clock interval of the PMPI call to CP_RECORD_MPI_READ under
 * CP_COLL_READS.  crt_fh points at this call's handle while the wrapper
 * runs and is reset to NULL before returning.
 *
 * Returns the PMPI_File_read_ordered result unchanged.
 */
int MPI_File_read_ordered(MPI_File fh, void * buf, int count,
    MPI_Datatype datatype, MPI_Status * status)
{
    double t_start, t_stop;
    int rc;

    crt_fh = &fh;

    t_start = darshan_wtime();
    rc = DARSHAN_MPI_CALL(PMPI_File_read_ordered)(fh, buf, count, datatype,
        status);
    t_stop = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_COLL_READS, t_start, t_stop);
    CP_UNLOCK();

    crt_fh = NULL;
    return rc;
}

/*
 * MPI_File_read_at_all_begin wrapper.
 *
 * Forwards to PMPI_File_read_at_all_begin, then passes the outcome to
 * CP_RECORD_MPI_READ under CP_SPLIT_READS.  The recorded interval brackets
 * only this begin call.  crt_fh points at this call's handle while the
 * wrapper runs and is reset to NULL before returning.
 *
 * Returns the PMPI_File_read_at_all_begin result unchanged.
 */
int MPI_File_read_at_all_begin(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype)
{
    double t_start, t_stop;
    int rc;

    crt_fh = &fh;

    t_start = darshan_wtime();
    rc = DARSHAN_MPI_CALL(PMPI_File_read_at_all_begin)(fh, offset, buf, count,
        datatype);
    t_stop = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_SPLIT_READS, t_start, t_stop);
    CP_UNLOCK();

    crt_fh = NULL;
    return rc;
}

/*
 * MPI_File_read_all_begin wrapper.
 *
 * Forwards to PMPI_File_read_all_begin, then passes the outcome to
 * CP_RECORD_MPI_READ under CP_SPLIT_READS.  The recorded interval brackets
 * only this begin call.  crt_fh points at this call's handle while the
 * wrapper runs and is reset to NULL before returning.
 *
 * Returns the PMPI_File_read_all_begin result unchanged.
 */
int MPI_File_read_all_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
{
    double t_start, t_stop;
    int rc;

    crt_fh = &fh;

    t_start = darshan_wtime();
    rc = DARSHAN_MPI_CALL(PMPI_File_read_all_begin)(fh, buf, count, datatype);
    t_stop = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_SPLIT_READS, t_start, t_stop);
    CP_UNLOCK();

    crt_fh = NULL;
    return rc;
}

/*
 * MPI_File_read_ordered_begin wrapper.
 *
 * Forwards to PMPI_File_read_ordered_begin, then passes the outcome to
 * CP_RECORD_MPI_READ under CP_SPLIT_READS.  The recorded interval brackets
 * only this begin call.  crt_fh points at this call's handle while the
 * wrapper runs and is reset to NULL before returning.
 *
 * Returns the PMPI_File_read_ordered_begin result unchanged.
 */
int MPI_File_read_ordered_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
{
    double t_start, t_stop;
    int rc;

    crt_fh = &fh;

    t_start = darshan_wtime();
    rc = DARSHAN_MPI_CALL(PMPI_File_read_ordered_begin)(fh, buf, count, datatype);
    t_stop = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_SPLIT_READS, t_start, t_stop);
    CP_UNLOCK();

    crt_fh = NULL;
    return rc;
}

/*
 * MPI_File_iread_at wrapper.
 *
 * Forwards to PMPI_File_iread_at, then passes the outcome to
 * CP_RECORD_MPI_READ under CP_NB_READS.  The recorded interval brackets
 * only the call that starts the nonblocking read.  crt_fh points at this
 * call's handle while the wrapper runs and is reset to NULL before
 * returning.
 *
 * Returns the PMPI_File_iread_at result unchanged.
 */
int MPI_File_iread_at(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, __D_MPI_REQUEST *request)
{
    double t_start, t_stop;
    int rc;

    crt_fh = &fh;

    t_start = darshan_wtime();
    rc = DARSHAN_MPI_CALL(PMPI_File_iread_at)(fh, offset, buf, count, datatype,
        request);
    t_stop = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_NB_READS, t_start, t_stop);
    CP_UNLOCK();

    crt_fh = NULL;
    return rc;
}

1221
/*
 * MPI_File_iread wrapper.
 *
 * Forwards to PMPI_File_iread, then passes the outcome to
 * CP_RECORD_MPI_READ under CP_NB_READS.  The recorded interval brackets
 * only the call that starts the nonblocking read.  crt_fh points at this
 * call's handle while the wrapper runs and is reset to NULL before
 * returning.
 *
 * Returns the PMPI_File_iread result unchanged.
 */
int MPI_File_iread(MPI_File fh, void * buf, int count, MPI_Datatype datatype, __D_MPI_REQUEST * request)
{
    double t_start, t_stop;
    int rc;

    crt_fh = &fh;

    t_start = darshan_wtime();
    rc = DARSHAN_MPI_CALL(PMPI_File_iread)(fh, buf, count, datatype, request);
    t_stop = darshan_wtime();

    CP_LOCK();
    CP_RECORD_MPI_READ(rc, fh, count, datatype, CP_NB_READS, t_start, t_stop);
    CP_UNLOCK();

    crt_fh = NULL;
    return rc;
}

int MPI_File_iread_shared(MPI_File fh, void * buf, int count,
1238
    MPI_Datatype datatype, __D_MPI_REQUEST * request)
1239 1240 1241 1242
{
    int ret;
    double tm1, tm2;

1243
    crt_fh = &fh;
1244
    tm1 = darshan_wtime();
1245 1246
    ret = DARSHAN_MPI_CALL(PMPI_File_iread_shared)(fh, buf, count,
        datatype, request);
1247 1248 1249 1250
    tm2 = darshan_wtime();
    CP_LOCK();
    CP_RECORD_MPI_READ(ret, fh, count, datatype, CP_NB_READS, tm1, tm2);
    CP_UNLOCK();
1251
    crt_fh = NULL;
1252 1253 1254
    return(ret);
}

1255 1256


1257 1258 1259 1260
#ifdef HAVE_MPIIO_CONST
int MPI_File_write(MPI_File fh, const void *buf, int count, 
    MPI_Datatype datatype, MPI_Status *status)
#else
1261 1262
int MPI_File_write(MPI_File fh, void *buf, int count, 
    MPI_Datatype datatype, MPI_Status *status)
1263
#endif
1264 1265 1266
{
    int ret;
    double tm1, tm2;
1267
    MPI_Offset off;
1268

1269
    crt_fh = &fh;
1270
    tm1 = darshan_wtime();
1271
    MPI_File_get_position(fh, &off);
1272
    ret = DARSHAN_MPI_CALL(PMPI_File_write)(fh, buf, count, datatype, status);
1273 1274
    tm2 = darshan_wtime();
    CP_LOCK();
1275
    CP_RECORD_MPI_WRITE(ret, fh, count, datatype, CP_INDEP_WRITES, tm1, tm2, off);
1276
    CP_UNLOCK();
1277
    crt_fh = NULL;
1278 1279 1280
    return(ret);
}

1281 1282 1283 1284
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_at(MPI_File fh, MPI_Offset offset, const void *buf,
    int count, MPI_Datatype datatype, MPI_Status *status)
#else
1285 1286
int MPI_File_write_at(MPI_File fh, MPI_Offset offset, void *buf,
    int count, MPI_Datatype datatype, MPI_Status *status)
1287
#endif
1288 1289 1290 1291
{
    int ret;
    double tm1, tm2;

1292
    crt_fh = &fh;
1293
    tm1 = darshan_wtime();
1294 1295
    ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(fh, offset, buf,
        count, datatype, status);
1296 1297
    tm2 = darshan_wtime();
    CP_LOCK();
1298
    CP_RECORD_MPI_WRITE(ret, fh, count, datatype, CP_INDEP_WRITES, tm1, tm2, offset);
1299
    CP_UNLOCK();
1300
    crt_fh = NULL;
1301 1302 1303
    return(ret);
}

1304 1305 1306 1307
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_at_all(MPI_File fh, MPI_Offset offset, const void * buf,
    int count, MPI_Datatype datatype, MPI_Status * status)
#else
1308 1309
int MPI_File_write_at_all(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype, MPI_Status * status)
1310
#endif
1311 1312 1313 1314
{
    int ret;
    double tm1, tm2;

1315 1316
    darshan_start_epoch();

1317
    set_collective_io_hints(fh);
1318
    crt_fh = &fh;
1319
    tm1 = darshan_wtime();
1320 1321
    ret = DARSHAN_MPI_CALL(PMPI_File_write_at_all)(fh, offset, buf,
        count, datatype, status);
1322 1323
    tm2 = darshan_wtime();
    CP_LOCK();
1324
    CP_RECORD_MPI_WRITE(ret, fh, count, datatype, CP_COLL_WRITES, tm1, tm2, offset);
1325
    CP_UNLOCK();
1326
    crt_fh = NULL;
1327
    printHints(fh);
1328 1329
    darshan_end_epoch();

1330 1331 1332
    return(ret);
}

1333 1334 1335
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_all(MPI_File fh, const void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
#else
1336
int MPI_File_write_all(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
1337
#endif
1338 1339 1340
{
    int ret;
    double tm1, tm2;
1341 1342 1343
    MPI_Offset off;

    darshan_start_epoch();
1344

1345
    set_collective_io_hints(fh);
1346
    crt_fh = &fh;
1347
    tm1 = darshan_wtime();
1348
    MPI_File_get_position(fh, &off);
1349 1350
    ret = DARSHAN_MPI_CALL(PMPI_File_write_all)(fh, buf, count,
        datatype, status);
1351 1352
    tm2 = darshan_wtime();
    CP_LOCK();
1353
    CP_RECORD_MPI_WRITE(ret, fh, count, datatype, CP_COLL_WRITES, tm1, tm2, off);
1354
    CP_UNLOCK();
1355
    crt_fh = NULL;
1356
    printHints(fh);
1357 1358
    darshan_end_epoch();

1359 1360 1361
    return(ret);
}

1362 1363 1364
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_shared(MPI_File fh, const void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
#else
1365
int MPI_File_write_shared(MPI_File fh, void * buf, int count, MPI_Datatype datatype, MPI_Status *status)
1366
#endif
1367 1368 1369
{
    int ret;
    double tm1, tm2;
1370
    MPI_Offset off;
1371

1372
    crt_fh = &fh;
1373
    tm1 = darshan_wtime();
1374
    MPI_File_get_position(fh, &off);
1375 1376
    ret = DARSHAN_MPI_CALL(PMPI_File_write_shared)(fh, buf, count,
        datatype, status);
1377 1378
    tm2 = darshan_wtime();
    CP_LOCK();
1379
    CP_RECORD_MPI_WRITE(ret, fh, count, datatype, CP_INDEP_WRITES, tm1, tm2, off);
1380
    CP_UNLOCK();
1381
    crt_fh = NULL;
1382 1383 1384
    return(ret);
}

1385 1386 1387 1388
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_ordered(MPI_File fh, const void * buf, int count, 
    MPI_Datatype datatype, MPI_Status * status)
#else
1389 1390
int MPI_File_write_ordered(MPI_File fh, void * buf, int count, 
    MPI_Datatype datatype, MPI_Status * status)
1391
#endif
1392 1393 1394
{
    int ret;
    double tm1, tm2;
1395
    MPI_Offset off;
1396

1397
    crt_fh = &fh;
1398
    tm1 = darshan_wtime();
1399
    MPI_File_get_position(fh, &off);
1400
    ret = DARSHAN_MPI_CALL(PMPI_File_write_ordered)(fh, buf, count,
1401
         datatype, status); 
1402 1403
    tm2 = darshan_wtime();
    CP_LOCK();
1404
    CP_RECORD_MPI_WRITE(ret, fh, count, datatype, CP_COLL_WRITES, tm1, tm2, off);
1405
    CP_UNLOCK();
1406
    crt_fh = NULL;
1407 1408 1409
    return(ret);
}

1410 1411 1412 1413
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_at_all_begin(MPI_File fh, MPI_Offset offset, const void * buf,
    int count, MPI_Datatype datatype)
#else
1414 1415
int MPI_File_write_at_all_begin(MPI_File fh, MPI_Offset offset, void * buf,
    int count, MPI_Datatype datatype)
1416
#endif
1417 1418 1419 1420
{
    int ret;
    double tm1, tm2;

1421
    crt_fh = &fh;
1422
    tm1 = darshan_wtime();
1423 1424
    ret = DARSHAN_MPI_CALL(PMPI_File_write_at_all_begin)(fh, offset,
        buf, count, datatype);
1425 1426
    tm2 = darshan_wtime();
    CP_LOCK();
1427
    CP_RECORD_MPI_WRITE(ret, fh, count, datatype, CP_SPLIT_WRITES, tm1, tm2, offset);
1428
    CP_UNLOCK();
1429
    crt_fh = NULL;
1430 1431 1432
    return(ret);
}

1433 1434 1435
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_all_begin(MPI_File fh, const void * buf, int count, MPI_Datatype datatype)
#else
1436
int MPI_File_write_all_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
1437
#endif
1438 1439 1440
{
    int ret;
    double tm1, tm2;
1441
    MPI_Offset off;
1442

1443 1444
    crt_fh = &fh;
    tm1 = darshan_wtime(); 
1445
    MPI_File_get_position(fh, &off);
1446
    ret = DARSHAN_MPI_CALL(PMPI_File_write_all_begin)(fh, buf, count, datatype);
1447 1448
    tm2 = darshan_wtime();
    CP_LOCK();
1449
    CP_RECORD_MPI_WRITE(ret, fh, count, datatype, CP_SPLIT_WRITES, tm1, tm2, off);
1450
    CP_UNLOCK();
1451
    crt_fh = NULL;
1452 1453 1454
    return(ret);
}

1455 1456 1457
#ifdef HAVE_MPIIO_CONST
int MPI_File_write_ordered_begin(MPI_File fh, const void * buf, int count, MPI_Datatype datatype)
#else
1458
int MPI_File_write_ordered_begin(MPI_File fh, void * buf, int count, MPI_Datatype datatype)
1459
#endif
1460 1461 1462
{
    int ret;
    double tm1, tm2;
1463
    MPI_Offset off;
1464

1465
    crt_fh = &fh;
1466
    tm1 = darshan_wtime();
1467
    MPI_File_get_position(fh, &off);
1468 1469
    ret = DARSHAN_MPI_CALL(PMPI_File_write_ordered_begin)(fh, buf, count,
        datatype);
1470 1471
    tm2 = darshan_wtime();
    CP_LOCK();
1472
    CP_RECORD_MPI_WRITE(ret, fh, count, datatype, CP_SPLIT_WRITES, tm1, tm2, off);
1473
    CP_UNLOCK();
1474
    crt_fh = NULL;
1475 1476 1477
    return(ret);
}

1478 1479 1480 1481
#ifdef HAVE_MPIIO_CONST
int MPI_File_iwrite_at(MPI_File fh, MPI_Offset offset, const void * buf,
    int count, MPI_Datatype datatype, __D_MPI_REQUEST *request)
#else