Commit 1a31aa68 authored by Huihuo Zheng

read

parent c3df39c0
......@@ -56,7 +56,7 @@ From the application perspective, since the data migration from the local storag
\begin{figure}[hbt]
\centering
\includegraphics[width=0.6\textwidth]{schematic.png}
\caption{Schematic demonstration of incorporating node-local storage into the parallel I/O workflow. The memory buffers are written into the node-local storage first, and then migrated to the file system asynchronously by a pthread through parallel I/O function calls.}\label{fig:schematic}
\caption{Schematic demonstration of incorporating node-local storage into the parallel write workflow. The memory buffers are written into the node-local storage first, and then migrated to the file system asynchronously by a pthread through parallel I/O function calls.}\label{fig:schematic}
\end{figure}
The whole workflow is conceptually simple. However, it requires careful design to incorporate it into I/O libraries such as MPI I/O and HDF5 so that it can be easily adopted by application developers. Ideally, application developers should not need to be aware of the underlying I/O process; they should have to make minimal or even no changes to their code to adopt this framework. In Sec.~\ref{sec:design}, we provide the details of our prototype design in both MPI I/O and the system-aware HDF5 library to meet these needs. We then present our initial performance evaluation on Theta and discuss our future work on fully integrating this framework into the VFD framework of the HDF5 library.
......
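To make the intended application-side pattern concrete, the sketch below shows how the write-side cache calls introduced in this commit (H5Fcreate_cache / H5Fclose_cache) would replace their native HDF5 counterparts. The cached write is shown under the hypothetical name H5Dwrite_to_cache, mirroring H5Dread_to_cache declared in the header; the actual write call is not visible in this diff, so treat this as a sketch of the intended usage rather than the implemented API.

#include "hdf5.h"
#include <mpi.h>

/* Write-side usage sketch.  H5Dwrite_to_cache is a hypothetical name
 * (the cached write call is not shown in this diff); everything else
 * uses functions that appear in this commit or in standard HDF5. */
void write_with_cache(const char *fname, const float *buf, hsize_t n)
{
    hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);

    hid_t fd    = H5Fcreate_cache(fname, H5F_ACC_TRUNC, H5P_DEFAULT, fapl);
    hid_t space = H5Screate_simple(1, &n, NULL);
    hid_t dset  = H5Dcreate(fd, "data", H5T_NATIVE_FLOAT, space,
                            H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);

    /* The buffer is staged on the node-local SSD and migrated to the
     * parallel file system asynchronously by the background pthread. */
    H5Dwrite_to_cache(dset, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL,
                      H5P_DEFAULT, buf);

    H5Dclose(dset);
    H5Sclose(space);
    /* Waits for the I/O thread to drain its queue, then removes the
     * staging files from the SSD. */
    H5Fclose_cache(fd);
    H5Pclose(fapl);
}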
......@@ -174,6 +174,7 @@ void *H5Dwrite_pthread_func(void *arg) {
enough for storing the buffer of the current task, it will wait for the
I/O thread to finish all the previous tasks.
*/
hid_t H5Fcreate_cache( const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id ) {
int rc = pthread_create(&H5DWMM.io.pthread, NULL, H5Dwrite_pthread_func, NULL);
srand(time(NULL)); // Initialization, should only be called once.
......@@ -181,11 +182,11 @@ hid_t H5Fcreate_cache( const char *name, unsigned flags, hid_t fcpl_id, hid_t fa
MPI_Comm comm, comm_dup;
MPI_Info info;
H5Pget_fapl_mpio(fapl_id, &comm, &info);
MPI_Comm_dup(comm, &comm_dup);
MPI_Comm_dup(comm, &H5DWMM.mpi.comm);
MPI_Comm_rank(comm, &H5DWMM.mpi.rank);
MPI_Comm_split_type(comm_dup, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &H5DWMM.mpi.comm);
MPI_Comm_rank(H5DWMM.mpi.comm, &H5DWMM.mpi.local_rank);
MPI_Comm_size(H5DWMM.mpi.comm, &H5DWMM.mpi.ppn);
MPI_Comm_split_type(H5DWMM.mpi.comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &H5DWMM.mpi.node_comm);
MPI_Comm_rank(H5DWMM.mpi.node_comm, &H5DWMM.mpi.local_rank);
MPI_Comm_size(H5DWMM.mpi.node_comm, &H5DWMM.mpi.ppn);
strcpy(H5DWMM.mmap.fname, H5DWMM.ssd->path);
char rnd[255];
sprintf(rnd, "%d", rand());
......@@ -280,6 +281,7 @@ void H5WPthreadTerminate() {
and terminate the pthread, then remove the files on the SSD.
*/
herr_t H5Fclose_cache( hid_t file_id ) {
// we should check whether the cache is turned on for file_id; therefore we should have a property for this.
H5WPthreadTerminate();
close(H5DWMM.mmap.fd);
remove(H5DWMM.mmap.fname);
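The comment above H5Fcreate_cache and the H5WPthreadTerminate call in H5Fclose_cache imply a simple producer/consumer handshake between the application thread and the background I/O pthread: the writer queues a staged buffer and signals the I/O thread, and it can in turn wait (e.g. when the remaining SSD space is insufficient, or at file-close time) until all queued tasks have been flushed. The sketch below illustrates that handshake with hypothetical struct and field names; the actual H5DWMM/H5DRMM structures carry their own condition variables (e.g. io.master_cond), so this is an illustration of the pattern, not the repository's code.

#include <pthread.h>
#include <stdbool.h>

/* Illustrative handshake only: the struct and field names are hypothetical
 * and do not match H5DWMM/H5DRMM exactly. */
typedef struct {
    pthread_mutex_t lock;
    pthread_cond_t  io_cond;      /* signalled when new work is queued   */
    pthread_cond_t  master_cond;  /* signalled when a task is flushed    */
    int             num_pending;  /* staged buffers not yet on the PFS   */
    bool            done;         /* set when terminating the thread     */
} cache_queue_t;

static cache_queue_t queue = { PTHREAD_MUTEX_INITIALIZER,
                               PTHREAD_COND_INITIALIZER,
                               PTHREAD_COND_INITIALIZER, 0, false };

/* Application thread: register one staged buffer for migration. */
void enqueue_task(cache_queue_t *q)
{
    pthread_mutex_lock(&q->lock);
    q->num_pending++;
    pthread_cond_signal(&q->io_cond);
    pthread_mutex_unlock(&q->lock);
}

/* Application thread: block until every queued task has been flushed,
 * e.g. when the SSD space left is too small or inside H5Fclose_cache. */
void wait_for_drain(cache_queue_t *q)
{
    pthread_mutex_lock(&q->lock);
    while (q->num_pending > 0)
        pthread_cond_wait(&q->master_cond, &q->lock);
    pthread_mutex_unlock(&q->lock);
}

/* Background I/O thread: flush staged buffers one by one and report back. */
void *io_thread_func(void *arg)
{
    cache_queue_t *q = (cache_queue_t *) arg;
    pthread_mutex_lock(&q->lock);
    while (!q->done || q->num_pending > 0) {
        while (q->num_pending == 0 && !q->done)
            pthread_cond_wait(&q->io_cond, &q->lock);
        if (q->num_pending > 0) {
            pthread_mutex_unlock(&q->lock);
            /* ... parallel write of the staged buffer to the file system ... */
            pthread_mutex_lock(&q->lock);
            q->num_pending--;
            pthread_cond_broadcast(&q->master_cond);
        }
    }
    pthread_mutex_unlock(&q->lock);
    return NULL;
}

/* Termination (cf. H5WPthreadTerminate): mark done, wake the thread, join. */
void terminate_io_thread(cache_queue_t *q, pthread_t t)
{
    pthread_mutex_lock(&q->lock);
    q->done = true;
    pthread_cond_signal(&q->io_cond);
    pthread_mutex_unlock(&q->lock);
    pthread_join(t, NULL);
}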
......@@ -316,8 +318,6 @@ herr_t H5Sclose_cache(hid_t filespace) {
The following functions are for parallel read.
*/
H5Dread_cache_metadata
H5DRMM = {
.io.master_cond = PTHREAD_COND_INITIALIZER,
......@@ -469,15 +469,29 @@ void create_mmap(const char *prefix) {
MPI_Type_commit(&H5DRMM.dset.mpi_datatype);
MPI_Win_create(H5DRMM.mmap.buf, ss, H5DRMM.dset.esize, MPI_INFO_NULL, H5DRMM.mpi.comm, &H5DRMM.mpi.win);
}
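/*
  Open an HDF5 file for cached parallel read: duplicate the global
  communicator, derive a node-local communicator, and compute each
  rank's share of the node-local storage before calling H5Fopen.
*/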
hid_t H5Fopen_cache( const char *name, unsigned flags, hid_t fapl_id ) {
srand(time(NULL)); // Initialization, should only be called once.
setH5SSD();
MPI_Comm comm;
MPI_Info info;
H5Pget_fapl_mpio(fapl_id, &comm, &info);
MPI_Comm_dup(comm, &H5DRMM.mpi.comm);
MPI_Comm_rank(comm, &H5DRMM.mpi.rank);
MPI_Comm_size(comm, &H5DRMM.mpi.nproc);
MPI_Comm_split_type(H5DRMM.mpi.comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &H5DRMM.mpi.node_comm);
MPI_Comm_rank(H5DRMM.mpi.node_comm, &H5DRMM.mpi.local_rank);
MPI_Comm_size(H5DRMM.mpi.node_comm, &H5DRMM.mpi.ppn);
H5DRMM.ssd->mspace_per_rank_total = H5DRMM.ssd->mspace_total / H5DRMM.mpi.ppn;
H5DRMM.ssd->mspace_per_rank_left = H5DRMM.ssd->mspace_per_rank_total;
return H5Fopen(name, flags, fapl_id);
}
/*
Open a dataset; this may also create the memory map used for caching its data.
*/
hid_t H5Dopen_cache(hid_t loc_id, const char *name, hid_t dapl_id) {
int rc = pthread_create(&H5DRMM.io.pthread, NULL, H5Dread_pthread_func, NULL);
MPI_Info info;
MPI_Comm comm;
MPI_Comm_dup(MPI_COMM_WORLD, &H5DRMM.mpi.comm);
srand(time(NULL)); // Initialization, should only be called once.
hid_t dset = H5Dopen(loc_id, name, dapl_id);
H5DRMM.dset.h5_datatype = H5Dget_type(dset);
......@@ -493,8 +507,6 @@ hid_t H5Dopen_cache(hid_t loc_id, const char *name, hid_t dapl_id) {
H5DRMM.dset.sample.nel = dim;
H5DRMM.dset.sample.dim = ndims-1;
H5DRMM.dset.ns_glob = gdims[0];
MPI_Comm_size(H5DRMM.mpi.comm, &H5DRMM.mpi.nproc);
MPI_Comm_rank(H5DRMM.mpi.comm, &H5DRMM.mpi.rank);
parallel_dist(gdims[0], H5DRMM.mpi.nproc, H5DRMM.mpi.rank, H5DRMM.dset.ns_loc, H5DRMM.dset.s_offset);
H5DRMM.dset.sample.size = H5DRMM.dset.esize*H5DRMM.dset.sample.nel;
H5DRMM.dset.size = H5DRMM.dset.sample.size*H5DRMM.dset.ns_loc;
......
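For context, parallel_dist (declared in the header change below) partitions the gdims[0] global samples across the nproc ranks and returns each rank's local sample count and starting offset. Its implementation is not part of this diff; a plain block distribution such as the following would be consistent with how it is called above, but it is only an illustration.

#include <stddef.h>

/* Plausible block distribution consistent with the call sites above;
 * the real parallel_dist is not shown in this diff. */
void parallel_dist(size_t dim, int nproc, int rank, size_t &ldim, size_t &start)
{
    size_t base = dim / nproc;          // samples every rank receives
    size_t rem  = dim % nproc;          // first `rem` ranks get one extra
    size_t r    = static_cast<size_t>(rank);
    ldim  = base + (r < rem ? 1 : 0);
    start = r * base + (r < rem ? r : rem);
}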
......@@ -41,7 +41,8 @@ typedef struct _MPI_INFO {
int rank;
int local_rank;
int nproc;
MPI_Comm comm;
MPI_Comm comm; // global communicator
MPI_Comm node_comm; // node local communicator
MPI_Win win;
} MPI_INFO;
......@@ -106,6 +107,7 @@ void set_hyperslab_from_samples(int *samples, int nsample, hid_t &fspace);
void get_samples_from_filespace(hid_t fspace, vector<int> &samples, bool &contiguous);
//void create_mmap(char *path, H5Dio_mmap &f);
void parallel_dist(size_t dim, int nproc, int rank, size_t &ldim, size_t &start);
hid_t H5Fopen_cache(const char *name, unsigned flags, hid_t fapl_id);
hid_t H5Dopen_cache(hid_t loc_id, const char *name, hid_t dapl_id);
herr_t H5Dread_to_cache(hid_t dataset_id, hid_t mem_type_id, hid_t mem_space_id, hid_t file_space_id, hid_t xfer_plist_id, void * buf);
herr_t H5Dread_from_cache(hid_t dataset_id, hid_t mem_type_id, hid_t mem_space_id, hid_t file_space_id, hid_t xfer_plist_id, void * buf);
......
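The read-side API declared above suggests a two-phase usage: H5Dread_to_cache reads from the parallel file system while staging the data in the node-local memory map, and H5Dread_from_cache serves repeated reads of the same samples from that cache (this interpretation is inferred from the function names; the diff does not spell it out). A hedged usage sketch, following the pattern of the test program below:

#include "hdf5.h"
#include <mpi.h>

/* Read-side sketch using only functions declared in the header above. */
void read_with_cache(const char *fname, float *buf,
                     hid_t memspace, hid_t filespace, int nepochs)
{
    hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);

    hid_t fd   = H5Fopen_cache(fname, H5F_ACC_RDONLY, fapl);
    hid_t dset = H5Dopen_cache(fd, "dataset", H5P_DEFAULT);

    for (int e = 0; e < nepochs; e++) {
        if (e == 0)   /* first pass: read from the PFS and stage on the SSD */
            H5Dread_to_cache(dset, H5T_NATIVE_FLOAT, memspace, filespace,
                             H5P_DEFAULT, buf);
        else          /* later passes: serve the same selection from the cache */
            H5Dread_from_cache(dset, H5T_NATIVE_FLOAT, memspace, filespace,
                               H5P_DEFAULT, buf);
    }

    H5Dclose(dset);
    H5Fclose(fd);   /* a dedicated close/terminate for the read cache is
                       not visible in this diff */
    H5Pclose(fapl);
}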
......@@ -119,7 +119,11 @@ int main(int argc, char **argv) {
setenv("SSD_PATH", local_storage, 1);
hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS);
H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL);
hid_t fd = H5Fopen(fname, H5F_ACC_RDONLY, plist_id);
hid_t fd;
if (cache)
fd = H5Fopen_cache(fname, H5F_ACC_RDONLY, plist_id);
else
fd = H5Fopen(fname, H5F_ACC_RDONLY, plist_id);
hid_t dset;
tt.start_clock("H5Dopen");
if (cache) {
......