Commit 5309ebb6 authored by rjzamora's avatar rjzamora
Browse files

update of topology branch - adding uncache option. About to change readshift...

update of topology branch - adding uncache option. About to change readshift option to shift ranks for read and write (seems to highlight topology placement importance)
parent a4ccacc5
......@@ -109,10 +109,11 @@ extern void set_MPI_File_write_at(int setval);
#define NUM_ATTRIBUTES 64
#define ATTRIBUTE_SIZE 1024
#define NAME_LENGTH 1024
#define NUM_ITERATIONS 10
#define NUM_ITERATIONS 2
#define MAX_DIM 4
#define MAX_STR 1024
#define MAX_FRAC 0.9
#define MAX_CHAR_PATH 4096
#define MAX_RAM 17179869184
#define MB 1048576
......@@ -137,6 +138,7 @@ void getmoments(double *ain, int n, double *momarr);
static int CountTasksPerNode(int numTasks, int rank, MPI_Comm comm);
static void *malloc_and_touch(size_t size);
static size_t NodeMemoryFracToBytes(double frac_d);
double FS_Uncache( MPI_Comm comm );
/* data structure from flashio */
typedef struct sim_params_t {
......@@ -167,8 +169,10 @@ int main( int argc, char* argv[] )
int numDims = 0;
int useChunked = 0;
int usemem = 0;
int topohint = 0;
int topohint = 0;
int rankshift = 0;
int dogdb = 0;
int uncache = 0;
double memadd = 0.0;
int maxcheck_set = 0;
// defaults are 1
......@@ -216,6 +220,10 @@ int main( int argc, char* argv[] )
for (i=1;i<argc;i++) {
if (strcmp(argv[i],"--metacoll") == 0)
useMetaDataCollectives = 1;
else if (strcmp(argv[i],"--uncache") == 0)
uncache = 1;
else if (strcmp(argv[i],"--gdb") == 0)
dogdb = 1;
else if (strcmp(argv[i],"--nowrite") == 0)
doWriteat = 0;
else if (strcmp(argv[i],"--topohint") == 0)
......@@ -363,12 +371,25 @@ int main( int argc, char* argv[] )
MPI_Barrier(MPI_COMM_WORLD);
if (rank == 0) {
printf("useMetaDataCollectives: %d addDerivedTypeDataset: %d addAttributes: %d useIndependentIO: %d numDims: %d useChunked: %d readShift: %d\n",
useMetaDataCollectives,addDerivedTypeDataset,addAttributes,useIndependentIO,numDims,useChunked,rankshift);
printf("useMetaDataCollectives: %d addDerivedTypeDataset: %d addAttributes: %d useIndependentIO: %d numDims: %d useChunked: %d readShift: %d uncache: %d\n",
useMetaDataCollectives,addDerivedTypeDataset,addAttributes,useIndependentIO,numDims,useChunked,rankshift,uncache);
printf("Metric Bufsize H5DWrite RawWrBDWTH H5Dread RawRdBDWTH Dataset Group Attribute H5Fopen H5Fclose H5Fflush OtherClose\n");
printf("WARNING: Bufsize is the size of the double-type dataset (The derived-type dataset is 8x larger!).\n");
}
/* Sleep here if we are waiting to use a debugger (like gdb) */
if (dogdb) {
char dbhostname[256];
gethostname(dbhostname, sizeof(dbhostname));
printf("Rank %d: PID %d on %s ready for attach.\n", rank, getpid(), dbhostname);
fflush(stdout);
while (rank==0 && dogdb) {
sleep(2);
}
dogdb = 0;
MPI_Barrier(MPI_COMM_WORLD);
}
int bufloopIter = 0;
int loopIter = 0;
......@@ -481,6 +502,9 @@ int main( int argc, char* argv[] )
mpi_code = MPI_Info_get(mpiHints, "cb_config_list", MPI_MAX_INFO_VAL, info_value, &info_flag);
if (rank == 0) printf("cb_config_list is: %s\n",info_value);
mpi_code = MPI_Info_get(mpiHints, "romio_aggregator_list", MPI_MAX_INFO_VAL, info_value, &info_flag);
if (rank == 0) printf("GPFS+BGQ: romio_aggregator_list is: %s\n",info_value);
// Re-open the property list to pass the hint
H5Pclose(accessPropList);
accessPropList = H5Pcreate(H5P_FILE_ACCESS);
......@@ -862,6 +886,10 @@ int main( int argc, char* argv[] )
otherCloseTime += (MPI_Wtime()-startTime);
/* Write temp file to disable caching */
if (uncache)
FS_Uncache(comm);
MPI_Barrier(comm);
startTime = MPI_Wtime();
rc = H5Fflush(fd,H5F_SCOPE_LOCAL);
......@@ -910,7 +938,7 @@ int main( int argc, char* argv[] )
groupTime += (MPI_Wtime()-startTime);
if (fd < 0) {
printf("H5Fopen error - fd is %d\n",(hid_t)fd);
printf("H5Fopen error - fd is %lld\n",(hid_t)fd);
exit(1);
}
......@@ -921,14 +949,14 @@ int main( int argc, char* argv[] )
dataSetTime += (MPI_Wtime()-startTime);
if (dataSet < 0) {
printf("H5Dopen error - dataSet is %d\n",(hid_t)dataSet);
printf("H5Dopen error - dataSet is %lld\n",(hid_t)dataSet);
exit(1);
}
fileDataSpace = H5Dget_space(dataSet);
if (fileDataSpace < 0) {
printf("H5Dget_space error - fileDataSpace is %d\n",(hid_t)fileDataSpace);
printf("H5Dget_space error - fileDataSpace is %lld\n",(hid_t)fileDataSpace);
exit(1);
}
dtype_id = H5Dget_type(dataSet);
......@@ -967,9 +995,9 @@ int main( int argc, char* argv[] )
if (!maxcheck_set || (BufSizeTotalDouble <= maxcheck)) {
for (i=0;i<(NumDoubleElements);i++) {
if (rankshift == 0 && (dataBuffer[i] != checkBuffer[i])) {
printf("ERROR on read: index %d doesn't match - expected %20.16f got %20.16f \n",i,dataBuffer[i],checkBuffer[i]);
printf("Rank %d - ERROR on read: index %d doesn't match - expected %20.16f got %20.16f \n",rank,i,dataBuffer[i],checkBuffer[i]);
} else if (rankshift > 0 && (dataBufferShift[i] != checkBuffer[i])) {
printf("ERROR on read: index %d doesn't match - expected %20.16f got %20.16f \n",i,dataBufferShift[i],checkBuffer[i]);
printf("Rank %d - ERROR on read: index %d doesn't match - expected %20.16f got %20.16f \n",rank,i,dataBufferShift[i],checkBuffer[i]);
}
}
}
......@@ -1323,3 +1351,44 @@ static size_t NodeMemoryFracToBytes(double frac_d)
return mem / 100 * percent;
}
/*
 * FS_Uncache
 *
 * Writes a 10 MB per-rank scratch file ("uncache-<rank>.tmp"), reads it
 * back, deletes it, and then synchronizes all ranks of `comm`.  The call
 * site uses this between the write and read phases; presumably the extra
 * traffic is meant to displace benchmark data from file-system caches.
 *
 * The PMPI_ entry points are called directly, which bypasses any MPI
 * profiling/interposition layer for this scratch I/O.
 *
 * comm:    communicator whose ranks all call this function (the final
 *          barrier is collective over it).
 * Returns: elapsed wall-clock seconds, including the closing barrier.
 *
 * Failures (allocation, file open) are reported on stdout and the scratch
 * I/O is skipped, but the rank still reaches the barrier so the other
 * ranks are never left waiting.
 */
double FS_Uncache ( MPI_Comm comm )
{
    const int64_t el = 2500000;     /* 2.5M floats * 4 bytes = 10 MB per array */
    int64_t i;
    int rank = 0;
    char tmp_file[ MAX_CHAR_PATH ];
    double start_time, end_time;
    MPI_File tmp_fh;
    MPI_Status status;
    float *val, *val_r;

    start_time = PMPI_Wtime ();

    val   = (float *) malloc ( (size_t) el * sizeof(float) );
    /* Zero-filled so the touch loop below never reads indeterminate data,
     * even if the read-back comes up short. */
    val_r = (float *) calloc ( (size_t) el, sizeof(float) );

    PMPI_Comm_rank (comm, &rank);
    snprintf (tmp_file, MAX_CHAR_PATH, "uncache-%09d.tmp", rank);

    if (val == NULL || val_r == NULL) {
        printf("Rank %d - FS_Uncache: unable to allocate scratch buffers\n", rank);
    } else {
        for (i = 0; i < el; i++)
            val[i] = (float)(i + 1);

        /* The file is read back below, so it must be opened read/write;
         * reading from a write-only MPI file is an access error. */
        if (PMPI_File_open(MPI_COMM_SELF, tmp_file, MPI_MODE_RDWR | MPI_MODE_CREATE,
                           MPI_INFO_NULL, &tmp_fh) != MPI_SUCCESS) {
            printf("Rank %d - FS_Uncache: unable to open %s\n", rank, tmp_file);
        } else {
            PMPI_File_write (tmp_fh, val, (int) el, MPI_FLOAT, &status);
            /* The write left the individual file pointer at end-of-file;
             * rewind so the read actually returns the data just written. */
            PMPI_File_seek (tmp_fh, 0, MPI_SEEK_SET);
            PMPI_File_read (tmp_fh, val_r, (int) el, MPI_FLOAT, &status);
            PMPI_File_close (&tmp_fh);
            PMPI_File_delete (tmp_file, MPI_INFO_NULL);
        }

        /* Touch every element that was read back. */
        for (i = 0; i < el; i++)
            val_r[i] += (float)i;
    }

    free (val);
    free (val_r);

    /* Always reached, including on the failure paths above. */
    PMPI_Barrier ( comm );
    end_time = PMPI_Wtime ();
    return end_time - start_time;
}
......@@ -407,7 +407,24 @@ void get_cb_props( int64_t *buffer_size, int64_t *nb_aggr, char* fname, char* cb
*buffer_size = (int64_t) striping_unit;
*nb_aggr = (int64_t) (striping_factor * co_ratio);
#elif defined(ROMIO) || defined(BGQ)
#elif defined(BGQ)
MPI_File_get_info(fh, &mpi_file_info);
/* Override the number of aggregators (if HDF5_CB_NODES_OVERRIDE env is set) */
if ( cb_override && !(strcmp(cb_override,"0") == 0) ) {
MPI_Info_set(mpi_file_info, "cb_nodes", cb_override);
}
mpi_code = MPI_Info_get(mpi_file_info, "cb_nodes", MPI_MAX_INFO_VAL, info_value, &info_flag);
*nb_aggr = (int64_t) atoi(info_value);
if (rank == 0) printf("cb_nodes is :%lld:\n",*nb_aggr);
mpi_code = MPI_Info_get(mpi_file_info, "cb_buffer_size", MPI_MAX_INFO_VAL, info_value, &info_flag);
*buffer_size = (int64_t) atoi(info_value);
if (rank == 0) printf("cb_buffer_size is :%lld:\n",*buffer_size);
#elif defined(ROMIO)
MPI_File_get_info(fh, &mpi_file_info);
......@@ -568,6 +585,44 @@ int add_chunk ( int64_t datalen, int64_t offset, int64_t buffer_size, int64_t nb
return 0;
}
/*-------------------------------------------------------------------------
 * Function:    split_proc_name
 *
 * Purpose:     Helper function to return a single (specified) word from the
 *              processes' name (from MPI_Get_processor_name)
 *
 * Inputs:      str_in:   Input string (output of MPI_Get_processor_name)
 *              str_len:  Length of str_in.  At most str_len characters are
 *                        examined, and scanning also stops at the first
 *                        '\0', whichever comes first.
 *              str_out:  Output string (modified by this function).  The
 *                        caller must provide room for the longest word
 *                        plus a terminator; no size is passed in.
 *              word_ind: Split index that the desired 'name' is located in
 *                        str_in.  Words are separated by runs of ' '.  A
 *                        leading space makes word 0 the empty string.
 *
 * Return:      0 == Success.  Populates char *str_out with the name
 *              -1 == str_in/str_out is NULL, word_ind is negative, or
 *                    str_in holds fewer than word_ind+1 words.  str_out
 *                    (when not NULL) is set to the empty string.
 *
 *              str_out is always '\0'-terminated on return.
 *
 *-------------------------------------------------------------------------
 */
int split_proc_name( char *str_in, int str_len, char *str_out, int word_ind ) {

    int i;
    int j = 0;       /* write position in str_out for the current word */
    int ctr = 0;     /* index of the word currently being scanned */
    int prvspc = 0;  /* non-zero when the previous character was a space */

    if (str_out == NULL)
        return -1;
    if (str_in == NULL || word_ind < 0) {
        str_out[0] = '\0';
        return -1;
    }

    /* Stop at the terminator as well as at str_len: continuing past '\0'
     * would let bytes after the string overwrite an already-copied word. */
    for (i = 0; i < str_len && str_in[i] != '\0'; i++) {
        if (str_in[i] == ' ') {
            if (ctr == word_ind) str_out[j] = '\0';
            if (prvspc == 0) ctr++;   /* a run of spaces counts as one separator */
            j = 0;                    /* for next word, init index to 0 */
            prvspc = 1;
        } else {
            if (ctr == word_ind) {
                str_out[j] = str_in[i];
            }
            prvspc = 0;
            j++;
        }
    }

    /* Requested word was never reached. */
    if (ctr < word_ind) {
        str_out[0] = '\0';
        return -1;
    }

    /* Input ended (terminator or str_len) while still inside the requested
     * word: close it off here. */
    if (ctr == word_ind) str_out[j] = '\0';

    return 0;
}
/*-------------------------------------------------------------------------
* Function: get_cb_config_list
*
......@@ -592,12 +647,24 @@ int get_cb_config_list ( int64_t* data_lens, int64_t* offsets, int data_len, cha
int rank, nprocs, i, r, resultlen;
int* agg_list;
int64_t *data_to_send_per_aggr;
char name_tmp[MPI_MAX_PROCESSOR_NAME];
char name[MPI_MAX_PROCESSOR_NAME];
char name_buf[MPI_MAX_PROCESSOR_NAME];
char* cb_reverse = getenv("HDF5_CB_REV");
MPI_Comm_rank ( comm, &rank );
MPI_Comm_size ( comm, &nprocs );
MPI_Get_processor_name( name, &resultlen );
MPI_Get_processor_name( name_tmp, &resultlen );
#ifdef BGQ
/* BGQ name is @ index=5 - Ex: 'Task 3207 of 4096 (3,0,1,0,0,7) R00-M1-N12-J05' */
//split_proc_name( &name_tmp[0], strlen(name[0]), &name[0], 5 );
//strcpy(name, name_tmp);
char localhost[MPI_MAX_PROCESSOR_NAME];
gethostname(localhost, MPI_MAX_PROCESSOR_NAME);
strcpy(name, localhost);
#else
strcpy(name, name_tmp);
#endif
/* Tally data quantities associated with each aggregator */
data_to_send_per_aggr = (int64_t *) calloc (nb_aggr, sizeof (int64_t));
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment