diff --git a/darshan-log-format.h b/darshan-log-format.h index dadc0e0e1370b5724ca72a909efec16f6db06f10..dc310b6f4de875c1fddbe9b742fc111b11b0a14f 100644 --- a/darshan-log-format.h +++ b/darshan-log-format.h @@ -182,6 +182,10 @@ enum darshan_indices CP_ACCESS4_COUNT, CP_DEVICE, /* device id reported by stat */ CP_SIZE_AT_OPEN, + CP_FASTEST_RANK, + CP_FASTEST_RANK_BYTES, + CP_SLOWEST_RANK, + CP_SLOWEST_RANK_BYTES, CP_NUM_INDICES, }; @@ -203,6 +207,11 @@ enum f_darshan_indices CP_F_MPI_WRITE_TIME, /* cumulative mpi-io write time */ CP_F_MAX_READ_TIME, CP_F_MAX_WRITE_TIME, + CP_F_FASTEST_RANK_TIME, + CP_F_SLOWEST_RANK_TIME, + CP_F_VARIANCE_RANK_TIME, + CP_F_VARIANCE_RANK_BYTES, + CP_F_NUM_INDICES, }; diff --git a/darshan-logutils.c b/darshan-logutils.c index 67afa894033c15f57fd1cd4e29fa887a871c207b..26f5513222874d9ea48b9695e31c6ec93618c4d2 100644 --- a/darshan-logutils.c +++ b/darshan-logutils.c @@ -152,6 +152,10 @@ char *darshan_names[] = { "CP_ACCESS4_COUNT", "CP_DEVICE", "CP_SIZE_AT_OPEN", + "CP_FASTEST_RANK", + "CP_FASTEST_RANK_BYTES", + "CP_SLOWEST_RANK", + "CP_SLOWEST_RANK_BYTES", "CP_NUM_INDICES" }; @@ -172,7 +176,12 @@ char *darshan_f_names[] = { "CP_F_MPI_WRITE_TIME", /* cumulative mpi-io write time */ "CP_F_MAX_READ_TIME", "CP_F_MAX_WRITE_TIME", - "CP_F_NUM_INDICES", + "CP_F_FASTEST_RANK_TIME", + "CP_F_SLOWEST_RANK_TIME", + "CP_F_VARIANCE_RANK_TIME", + "CP_F_VARIANCE_RANK_BYTES", + + "CP_F_NUM_INDICES" }; /* function pointers so that we can switch functions depending on what file diff --git a/darshan-parser.c b/darshan-parser.c index 27167ec41e70d0da6563b353046d74ebc1763b88..d0abc7faafa8cd65038c7d73e25c7d7f46967d1a 100644 --- a/darshan-parser.c +++ b/darshan-parser.c @@ -139,6 +139,7 @@ int main(int argc, char **argv) printf("# CP_ACCESS*_COUNT: count of the four most common access sizes.\n"); printf("# CP_DEVICE: device id reported by stat().\n"); printf("# CP_SIZE_AT_OPEN: size of file when first opened.\n"); + printf("# CP_*_RANK_BYTES: fastest, slowest 
and variance of bytes transfer.\n"); printf("# CP_F_OPEN_TIMESTAMP: timestamp of first open (mpi or posix).\n"); printf("# CP_F_*_START_TIMESTAMP: timestamp of first read/write (mpi or posix).\n"); printf("# CP_F_*_END_TIMESTAMP: timestamp of last read/write (mpi or posix).\n"); @@ -148,6 +149,7 @@ int main(int argc, char **argv) printf("# CP_F_POSIX_META_TIME: cumulative time spent in posix open, close, fsync, stat and seek, .\n"); printf("# CP_F_MPI_META_TIME: cumulative time spent in mpi-io open, close, set_view, and sync.\n"); printf("# CP_MAX_*_TIME: duration of the slowest read and write operations.\n"); + printf("# CP_*_RANK_TIME: fastest, slowest variance of transfer time.\n"); printf("\n"); diff --git a/lib/darshan-mpi-io.c b/lib/darshan-mpi-io.c index b996d924d9b6b48ce75da1687a84d749d45b782e..a597397badb133fa01aa8f013e330e92b7edebd5 100644 --- a/lib/darshan-mpi-io.c +++ b/lib/darshan-mpi-io.c @@ -131,6 +131,13 @@ static int cp_log_compress(struct darshan_job_runtime* final_job, static int file_compare(const void* a, const void* b); static void darshan_mpi_initialize(int *argc, char ***argv); static char* darshan_get_exe_and_mounts(struct darshan_job_runtime* final_job); +static int darshan_file_variance( + struct darshan_file *infile_array, + struct darshan_file *outfile_array, + int count, int rank); +static void pairwise_variance_reduce ( + void *invec, void *inoutvec, int *len, MPI_Datatype *dt); + #define CP_MAX_MNTS 32 uint64_t mnt_hash_array[CP_MAX_MNTS] = {0}; @@ -143,6 +150,13 @@ struct int64_t mnt_id_root; } mnt_mapping[CP_MAX_MNTS]; +struct variance_dt +{ + double n; + double T; + double S; +}; + int MPI_Init(int *argc, char ***argv) { int ret; @@ -1134,6 +1148,31 @@ static int cp_log_reduction(struct darshan_job_runtime* final_job, int rank, { if(final_job->file_array[j].hash == hash_array[i]) { + + /* + * Initialize fastest/slowest info prior + * to the reduction. 
+                     */
+                    final_job->file_array[j].counters[CP_FASTEST_RANK] =
+                        final_job->file_array[j].rank;
+                    final_job->file_array[j].counters[CP_FASTEST_RANK_BYTES] =
+                        final_job->file_array[j].counters[CP_BYTES_READ] +
+                        final_job->file_array[j].counters[CP_BYTES_WRITTEN];
+                    final_job->file_array[j].fcounters[CP_F_FASTEST_RANK_TIME] =
+                        final_job->file_array[j].fcounters[CP_F_POSIX_META_TIME] +
+                        final_job->file_array[j].fcounters[CP_F_POSIX_READ_TIME] +
+                        final_job->file_array[j].fcounters[CP_F_POSIX_WRITE_TIME];
+
+                    final_job->file_array[j].counters[CP_SLOWEST_RANK] =
+                        final_job->file_array[j].rank;
+                    final_job->file_array[j].counters[CP_SLOWEST_RANK_BYTES] =
+                        final_job->file_array[j].counters[CP_BYTES_READ] +
+                        final_job->file_array[j].counters[CP_BYTES_WRITTEN];
+                    final_job->file_array[j].fcounters[CP_F_SLOWEST_RANK_TIME] =
+                        final_job->file_array[j].fcounters[CP_F_POSIX_META_TIME] +
+                        final_job->file_array[j].fcounters[CP_F_POSIX_READ_TIME] +
+                        final_job->file_array[j].fcounters[CP_F_POSIX_WRITE_TIME];
+
                     final_job->file_array[j].rank = -1;
                     break;
                 }
@@ -1169,6 +1208,14 @@ static int cp_log_reduction(struct darshan_job_runtime* final_job, int rank,
         return(-1);
     }
 
+    ret = darshan_file_variance(
+        &final_job->file_array[final_job->file_count-shared_count],
+        tmp_array, shared_count, rank);
+    if (ret)
+    {
+        return(-1);
+    }
+
     if(rank == 0)
     {
         /* root replaces local files with shared ones */
@@ -1383,6 +1430,48 @@ static void darshan_file_reduce(void* infile_v,
                 inoutfile->counters[CP_MAX_READ_TIME_SIZE];
         }
 
+        /* min: take rank/bytes/time from the input with the smaller time */
+        if(infile->fcounters[CP_F_FASTEST_RANK_TIME] <
+           inoutfile->fcounters[CP_F_FASTEST_RANK_TIME])
+        {
+            tmp_file.counters[CP_FASTEST_RANK] =
+                infile->counters[CP_FASTEST_RANK];
+            tmp_file.counters[CP_FASTEST_RANK_BYTES] =
+                infile->counters[CP_FASTEST_RANK_BYTES];
+            tmp_file.fcounters[CP_F_FASTEST_RANK_TIME] =
+                infile->fcounters[CP_F_FASTEST_RANK_TIME];
+        }
+        else
+        {
+            tmp_file.counters[CP_FASTEST_RANK] =
+                inoutfile->counters[CP_FASTEST_RANK];
+            tmp_file.counters[CP_FASTEST_RANK_BYTES] =
+                inoutfile->counters[CP_FASTEST_RANK_BYTES];
+            tmp_file.fcounters[CP_F_FASTEST_RANK_TIME] =
+                inoutfile->fcounters[CP_F_FASTEST_RANK_TIME];
+        }
+
+        /* max: take rank/bytes/time from the input with the larger time */
+        if(infile->fcounters[CP_F_SLOWEST_RANK_TIME] >
+           inoutfile->fcounters[CP_F_SLOWEST_RANK_TIME])
+        {
+            tmp_file.counters[CP_SLOWEST_RANK] =
+                infile->counters[CP_SLOWEST_RANK];
+            tmp_file.counters[CP_SLOWEST_RANK_BYTES] =
+                infile->counters[CP_SLOWEST_RANK_BYTES];
+            tmp_file.fcounters[CP_F_SLOWEST_RANK_TIME] =
+                infile->fcounters[CP_F_SLOWEST_RANK_TIME];
+        }
+        else
+        {
+            tmp_file.counters[CP_SLOWEST_RANK] =
+                inoutfile->counters[CP_SLOWEST_RANK];
+            tmp_file.counters[CP_SLOWEST_RANK_BYTES] =
+                inoutfile->counters[CP_SLOWEST_RANK_BYTES];
+            tmp_file.fcounters[CP_F_SLOWEST_RANK_TIME] =
+                inoutfile->fcounters[CP_F_SLOWEST_RANK_TIME];
+        }
+
         /* pick one device id and file size */
         tmp_file.counters[CP_DEVICE] = infile->counters[CP_DEVICE];
         tmp_file.counters[CP_SIZE_AT_OPEN] = infile->counters[CP_SIZE_AT_OPEN];
@@ -1793,6 +1882,146 @@ static char* darshan_get_exe_and_mounts(struct darshan_job_runtime* final_job)
     return(trailing_data);
 }
 
+/*
+ * Computes population variance of bytes moved and total time
+ * for each rank on a shared file.
+ */ +static int darshan_file_variance( + struct darshan_file *infile_array, + struct darshan_file *outfile_array, + int count, int rank) +{ + MPI_Op pw_var_op; + MPI_Datatype var_dt; + int ret; + int i; + struct variance_dt* var_array = NULL; + struct variance_dt* varres_array = NULL; + + ret = MPI_Op_create(pairwise_variance_reduce, 1, &pw_var_op); + if (ret != MPI_SUCCESS) + { + goto error_handler; + } + + ret = MPI_Type_contiguous(sizeof(struct variance_dt), MPI_BYTE, &var_dt); + if (ret != MPI_SUCCESS) + { + goto error_handler; + } + + ret = MPI_Type_commit(&var_dt); + if (ret != MPI_SUCCESS) + { + goto error_handler; + } + + var_array = malloc(count*sizeof(struct variance_dt)); + if(!var_array) + { + goto error_handler; + } + + if (rank == 0) + { + varres_array = malloc(count*sizeof(struct variance_dt)); + if(!varres_array) + { + goto error_handler; + } + } + + /* + * total time + */ + + for(i=0; in + Y->n; + Z.T = X->T + Y->T; + Z.S = X->S + Y->S + (X->n/(Y->n*Z.n)) * + ((Y->n/X->n)*X->T - Y->T) * ((Y->n/X->n)*X->T - Y->T); + + *Y = Z; + } + + return; +} + /* * Local variables: * c-indent-level: 4