author     Quincey Koziol <koziol@hdfgroup.org>    2002-05-17 12:53:46 (GMT)
committer  Quincey Koziol <koziol@hdfgroup.org>    2002-05-17 12:53:46 (GMT)
commit     a6b4cba798a494dea1d29474cc5658f7003615d9 (patch)
tree       5ffa6f7b9868849e81a6392b29ad59ec9218dfe1 /src
parent     567c04276158059089d64e0e9fd5b9c7e1b8d7ba (diff)
[svn-r5429] Purpose:
Bug fix/Code improvement.
Description:
Currently, the chunk data allocation routine invoked to allocate space for
the entire dataset is inefficient. It writes out each chunk in the dataset,
whether or not it is already allocated. Additionally, this happens not
only when the dataset is created, but also any time it is opened for writing
or extended. Worse, there is too much parallel I/O synchronization, which
slows things down even more.
Solution:
Only attempt to write out chunks that don't already exist. Additionally,
share the I/O writing among all the processes, instead of issuing every
write from process 0. Then, only block at an MPI_Barrier if chunks were
actually created.
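
For illustration, here is a minimal, self-contained sketch of the round-robin scheme described above, written against plain MPI. It is not HDF5 source: the helpers chunk_exists(), allocate_chunk() and write_fill_chunk(), the wrapper allocate_all_chunks(), and the main() driver are hypothetical stand-ins for the real H5F_istore_get_addr(), H5B_insert() and H5F_block_write() calls made in H5F_istore_allocate() in the diff below.

/*
 * Sketch only: the static helpers below are hypothetical placeholders for the
 * real B-tree lookup, collective B-tree insert, and raw chunk write.
 */
#include <mpi.h>
#include <stddef.h>

static int  chunk_exists(size_t idx)     { (void)idx; return 0; } /* pretend no chunk has file space yet */
static void allocate_chunk(size_t idx)   { (void)idx; }           /* collective allocation, all ranks */
static void write_fill_chunk(size_t idx) { (void)idx; }           /* raw write of the fill-value buffer */

static void allocate_all_chunks(MPI_Comm comm, size_t nchunks)
{
    int mpi_rank, mpi_size;
    int mpi_round = 0;          /* rank responsible for the next chunk write */
    int chunk_allocated = 0;    /* did this call create any chunks? */

    MPI_Comm_rank(comm, &mpi_rank);
    MPI_Comm_size(comm, &mpi_size);

    for (size_t idx = 0; idx < nchunks; idx++) {
        if (chunk_exists(idx))
            continue;                   /* skip chunks that already have file space */

        allocate_chunk(idx);            /* every process participates in the allocation */

        if (mpi_round == mpi_rank)      /* but only one process writes the fill data */
            write_fill_chunk(idx);
        mpi_round = (mpi_round + 1) % mpi_size;

        chunk_allocated = 1;
    }

    /* Barrier only when something was written, so no rank reads a chunk
     * before its writer finishes, and the no-op path pays no sync cost. */
    if (chunk_allocated)
        MPI_Barrier(comm);
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    allocate_all_chunks(MPI_COMM_WORLD, 8);     /* e.g. a dataset with 8 chunks */
    MPI_Finalize();
    return 0;
}

Because every process iterates over the chunks in the same order, the chunk_allocated flag ends up identical on all ranks, so the conditional barrier cannot deadlock.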
Platforms tested:
IRIX64 6.5 (modi4)
Diffstat (limited to 'src')
-rw-r--r--   src/H5Distore.c   167
-rw-r--r--   src/H5FDmpio.c     56
-rw-r--r--   src/H5FDmpio.h      2
-rw-r--r--   src/H5Fistore.c   167
-rw-r--r--   src/H5Fpkg.h        2
5 files changed, 264 insertions, 130 deletions
diff --git a/src/H5Distore.c b/src/H5Distore.c
index f5bee8a..1d7feeb 100644
--- a/src/H5Distore.c
+++ b/src/H5Distore.c
@@ -1037,16 +1037,10 @@ H5F_istore_flush_entry(H5F_t *f, H5F_rdcc_ent_t *ent, hbool_t reset)
      * Create the chunk it if it doesn't exist, or reallocate the chunk if
      * its size changed.  Then write the data into the file.
      */
-    if (H5B_insert(f, H5B_ISTORE, ent->layout->addr, ent->split_ratios,
-            &udata)<0) {
-        HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL,
-            "unable to allocate chunk");
-    }
-    if (H5F_block_write(f, H5FD_MEM_DRAW, udata.addr, udata.key.nbytes, H5P_DATASET_XFER_DEFAULT,
-            buf)<0) {
-        HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL,
-            "unable to write raw data to file");
-    }
+    if (H5B_insert(f, H5B_ISTORE, ent->layout->addr, ent->split_ratios, &udata)<0)
+        HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "unable to allocate chunk");
+    if (H5F_block_write(f, H5FD_MEM_DRAW, udata.addr, udata.key.nbytes, H5P_DATASET_XFER_DEFAULT, buf)<0)
+        HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "unable to write raw data to file");
 
     /* Mark cache entry as clean */
     ent->dirty = FALSE;
@@ -2413,21 +2407,32 @@ done:
  *              Robb Matzke, 1999-08-02
  *              The split_ratios are passed in as part of the data transfer
  *              property list.
+ *
+ *              Quincey Koziol, 2002-05-16
+ *              Rewrote algorithm to allocate & write blocks without using
+ *              lock/unlock code.
  *-------------------------------------------------------------------------
 */
+#ifdef H5_HAVE_PARALLEL
 herr_t
 H5F_istore_allocate(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout,
                     const hsize_t *space_dim, H5P_genplist_t *dc_plist)
 {
-
-    int         i, carry;
-    unsigned    u;
-    hssize_t    chunk_offset[H5O_LAYOUT_NDIMS];
-    uint8_t     *chunk=NULL;
-    unsigned    idx_hint=0;
-    hsize_t     chunk_size;
-    H5O_pline_t pline;          /* I/O pipeline information */
-    H5O_fill_t  fill;           /* Fill value information */
+    hssize_t    chunk_offset[H5O_LAYOUT_NDIMS]; /* Offset of current chunk */
+    hsize_t     chunk_size;     /* Size of chunk in bytes */
+    H5O_pline_t pline;          /* I/O pipeline information */
+    H5O_fill_t  fill;           /* Fill value information */
+    H5F_istore_ud1_t udata;     /* B-tree pass-through for creating chunk */
+    void        *chunk=NULL;    /* Chunk buffer for writing fill values */
+    H5P_genplist_t *dx_plist;   /* Data xfer property list */
+    double      split_ratios[3];/* B-tree node splitting ratios */
+    int         mpi_rank;       /* This process's rank */
+    int         mpi_size;       /* Total # of processes */
+    int         mpi_round=0;    /* Current process responsible for I/O */
+    unsigned    chunk_allocated=0; /* Flag to indicate that chunk was actually allocated */
+    int         carry;          /* Flag to indicate that chunk increment carrys to higher dimension (sorta) */
+    int         i;              /* Local index variable */
+    unsigned    u;              /* Local index variable */
     herr_t      ret_value=SUCCEED;      /* Return value */
 
     FUNC_ENTER(H5F_istore_allocate, FAIL);
@@ -2438,52 +2443,86 @@ H5F_istore_allocate(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout,
     assert(layout && H5D_CHUNKED==layout->type);
     assert(layout->ndims>0 && layout->ndims<=H5O_LAYOUT_NDIMS);
     assert(H5F_addr_defined(layout->addr));
+    assert(H5I_GENPROP_LST==H5I_get_type(dxpl_id));
+    assert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER));
+    assert(dc_plist!=NULL);
 
-    /* Get necessary properties from property list */
+    /* Get necessary properties from dataset creation property list */
     if(H5P_get(dc_plist, H5D_CRT_FILL_VALUE_NAME, &fill) < 0)
-        HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get fill value");
+        HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "can't get fill value");
     if(H5P_get(dc_plist, H5D_CRT_DATA_PIPELINE_NAME, &pline) < 0)
-        HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get data pipeline");
+        HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "can't get data pipeline");
+
+    /* Get necessary properties from dataset transfer property list */
+    if (TRUE!=H5P_isa_class(dxpl_id,H5P_DATASET_XFER) || NULL == (dx_plist = H5I_object(dxpl_id)))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list");
+    if(H5P_get(dx_plist,H5D_XFER_BTREE_SPLIT_RATIO_NAME,split_ratios)<0)
+        HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "can't get B-tree split ratios");
+
+    /* Can't use data I/O pipeline in parallel (yet) */
+    if (pline.nfilters>0)
+        HGOTO_ERROR(H5E_STORAGE, H5E_UNSUPPORTED, FAIL, "can't use data pipeline in parallel");
 
     /*
      * Setup indice to go through all chunks. (Future improvement
      * should allocate only chunks that have no file space assigned yet.
      */
     for (u=0, chunk_size=1; u<layout->ndims; u++) {
-        chunk_offset[u]=0;
+        chunk_offset[u] = 0;
         chunk_size *= layout->dim[u];
     } /* end for */
 
-    /* Loop over all chunks */
-    carry=0;
-    while (carry==0) {
-        /* No file space assigned yet.  Allocate it. */
-        /* The following needs improvement like calling the */
-        /* allocation directly rather than indirectly using the */
-        /* allocation effect in the unlock process. */
+    /* Allocate chunk buffer for processes to use when writing fill values */
+    if (NULL==(chunk = H5F_istore_chunk_alloc(chunk_size)))
+        HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "memory allocation failed for chunk");
+    /* Fill the chunk with the proper values */
+    if(fill.buf) {
+        /*
+         * Replicate the fill value throughout the chunk.
+         */
+        assert(0==chunk_size % fill.size);
+        H5V_array_fill(chunk, fill.buf, fill.size, chunk_size/fill.size);
+    } else {
         /*
-         * Lock the chunk, copy from application to chunk, then unlock the
-         * chunk.
+         * No fill value was specified, assume all zeros.
          */
+        HDmemset (chunk, 0, chunk_size);
+    } /* end else */
 
-#ifdef H5_HAVE_PARALLEL
-        /* rky 981207 Serialize access to this critical region. */
-        if (SUCCEED!= H5FD_mpio_wait_for_left_neighbor(f->shared->lf))
-            HGOTO_ERROR (H5E_IO, H5E_WRITEERROR, FAIL, "unable to lock the data chunk");
-#endif
-        if (NULL==(chunk=H5F_istore_lock(f, dxpl_id, layout, &pline,
-                &fill, chunk_offset, FALSE, &idx_hint)))
-            HGOTO_ERROR (H5E_IO, H5E_WRITEERROR, FAIL, "unable to read raw data chunk");
-
-        H5_CHECK_OVERFLOW(chunk_size,hsize_t,size_t);
-        if (H5F_istore_unlock(f, dxpl_id, layout, &pline, TRUE,
-                chunk_offset, &idx_hint, chunk, (size_t)chunk_size)<0)
-            HGOTO_ERROR (H5E_IO, H5E_WRITEERROR, FAIL, "uanble to unlock raw data chunk");
-#ifdef H5_HAVE_PARALLEL
-        if (SUCCEED!= H5FD_mpio_signal_right_neighbor(f->shared->lf))
-            HGOTO_ERROR (H5E_IO, H5E_WRITEERROR, FAIL, "unable to unlock the data chunk");
-#endif
+    /* Retrieve up MPI parameters */
+    if ((mpi_rank=H5FD_mpio_mpi_rank(f->shared->lf))<0)
+        HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI rank");
+    if ((mpi_size=H5FD_mpio_mpi_size(f->shared->lf))<0)
+        HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI size");
+
+    /* Loop over all chunks */
+    carry=0;
+    while (carry==0) {
+        /* Check if the chunk exists yet */
+        if(H5F_istore_get_addr(f,layout,chunk_offset)==HADDR_UNDEF) {
+            /* Initialize the chunk information */
+            udata.mesg = *layout;
+            udata.key.filter_mask = 0;
+            udata.addr = HADDR_UNDEF;
+            udata.key.nbytes = chunk_size;
+            for (u=0; u<layout->ndims; u++)
+                udata.key.offset[u] = chunk_offset[u];
+
+            /* Allocate the chunk with all processes */
+            if (H5B_insert(f, H5B_ISTORE, layout->addr, split_ratios, &udata)<0)
+                HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "unable to allocate chunk");
+
+            /* Round-robin write the chunks out from only one process */
+            if(mpi_round==mpi_rank) {
+                if (H5F_block_write(f, H5FD_MEM_DRAW, udata.addr, udata.key.nbytes, dxpl_id, chunk)<0)
+                    HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "unable to write raw data to file");
+            } /* end if */
+            mpi_round=(++mpi_round)%mpi_size;
+
+            /* Indicate that a chunk was allocated */
+            chunk_allocated=1;
+        } /* end if */
 
         /* Increment indices */
         for (i=layout->ndims-1, carry=1; i>=0 && carry; --i) {
@@ -2496,24 +2535,24 @@ H5F_istore_allocate(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout,
         } /* end for */
     } /* end while */
 
-#ifdef H5_HAVE_PARALLEL
-    /*
-     * rky 980923
-     *
-     * The following barrier is a temporary fix to prevent overwriting real
-     * data caused by a race between one proc's call of H5F_istore_allocate
-     * (from H5D_init_storage, ultimately from H5Dcreate and H5Dextend) and
-     * another proc's call of H5Dwrite.  Eventually, this barrier should be
-     * removed, when H5D_init_storage is changed to call H5MF_alloc directly
-     * to allocate space, instead of calling H5F_istore_unlock.
-     */
-    if (MPI_Barrier(H5FD_mpio_communicator(f->shared->lf)))
-        HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "MPI_Barrier failed");
-#endif
+    /* Only need to block at the barrier if we actually allocated a chunk */
+    if(chunk_allocated) {
+        /* Wait at barrier to avoid race conditions where some processes are
+         * still writing out chunks and other processes race ahead to read
+         * them in, getting bogus data.
+         */
+        if (MPI_Barrier(H5FD_mpio_communicator(f->shared->lf)))
+            HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "MPI_Barrier failed");
+    } /* end if */
 
 done:
+    /* Free the chunk for fill values */
+    if(chunk!=NULL)
+        H5F_istore_chunk_free(chunk);
+
     FUNC_LEAVE(ret_value);
 }
+#endif /* H5_HAVE_PARALLEL */
 
 
 /*-------------------------------------------------------------------------
@@ -2824,7 +2863,6 @@ H5F_istore_initialize_by_extent(H5F_t *f, const H5O_layout_t *layout,
     hsize_t idx_max[H5O_LAYOUT_NDIMS];
     hsize_t sub_size[H5O_LAYOUT_NDIMS];
     hsize_t naccessed;          /*bytes accessed in chunk */
-    hsize_t elm_size;           /*size of an element in bytes */
    hsize_t end_chunk;          /*chunk position counter */
     hssize_t start[H5O_LAYOUT_NDIMS];   /*starting location of hyperslab */
     hsize_t count[H5O_LAYOUT_NDIMS];    /*element count of hyperslab */
@@ -2866,7 +2904,6 @@ H5F_istore_initialize_by_extent(H5F_t *f, const H5O_layout_t *layout,
     for(i = 0; i < rank; i++)
         size[i] = curr_dims[i];
     size[i] = layout->dim[i];
-    elm_size = size[i];
 
     /* Default dataset transfer property list */
     dxpl_id = H5P_DATASET_XFER_DEFAULT;
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index f4ba335..3572390 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -449,6 +449,62 @@ H5FD_mpio_communicator(H5FD_t *_file)
 
 
 /*-------------------------------------------------------------------------
+ * Function:    H5FD_mpio_mpi_rank
+ *
+ * Purpose:     Returns the MPI rank for a process
+ *
+ * Return:      Success: non-negative
+ *              Failure: negative
+ *
+ * Programmer:  Quincey Koziol
+ *              Thursday, May 16, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+H5FD_mpio_mpi_rank(H5FD_t *_file)
+{
+    H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
+
+    FUNC_ENTER(H5FD_mpio_mpi_rank, FAIL);
+    assert(file);
+    assert(H5FD_MPIO==file->pub.driver_id);
+
+    FUNC_LEAVE(file->mpi_rank);
+} /* end H5FD_mpio_mpi_rank() */
+
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD_mpio_mpi_size
+ *
+ * Purpose:     Returns the number of MPI processes
+ *
+ * Return:      Success: non-negative
+ *              Failure: negative
+ *
+ * Programmer:  Quincey Koziol
+ *              Thursday, May 16, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+H5FD_mpio_mpi_size(H5FD_t *_file)
+{
+    H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
+
+    FUNC_ENTER(H5FD_mpio_mpi_rank, FAIL);
+    assert(file);
+    assert(H5FD_MPIO==file->pub.driver_id);
+
+    FUNC_LEAVE(file->mpi_size);
+} /* end H5FD_mpio_mpi_size() */
+
+
+/*-------------------------------------------------------------------------
  * Function:    H5FD_mpio_setup
 *
 * Purpose:     Set the buffer type BTYPE, file type FTYPE, and absolute base
diff --git a/src/H5FDmpio.h b/src/H5FDmpio.h
index 425a346..4750ef2 100644
--- a/src/H5FDmpio.h
+++ b/src/H5FDmpio.h
@@ -62,6 +62,8 @@ __DLL__ herr_t H5FD_mpio_setup(H5FD_t *_file, MPI_Datatype btype, MPI_Datatype f
 __DLL__ herr_t H5FD_mpio_wait_for_left_neighbor(H5FD_t *file);
 __DLL__ herr_t H5FD_mpio_signal_right_neighbor(H5FD_t *file);
 __DLL__ herr_t H5FD_mpio_closing(H5FD_t *file);
+__DLL__ int H5FD_mpio_mpi_rank(H5FD_t *_file);
+__DLL__ int H5FD_mpio_mpi_size(H5FD_t *_file);
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/H5Fistore.c b/src/H5Fistore.c
index f5bee8a..1d7feeb 100644
--- a/src/H5Fistore.c
+++ b/src/H5Fistore.c
@@ -1037,16 +1037,10 @@ H5F_istore_flush_entry(H5F_t *f, H5F_rdcc_ent_t *ent, hbool_t reset)
      * Create the chunk it if it doesn't exist, or reallocate the chunk if
      * its size changed.  Then write the data into the file.
      */
-    if (H5B_insert(f, H5B_ISTORE, ent->layout->addr, ent->split_ratios,
-            &udata)<0) {
-        HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL,
-            "unable to allocate chunk");
-    }
-    if (H5F_block_write(f, H5FD_MEM_DRAW, udata.addr, udata.key.nbytes, H5P_DATASET_XFER_DEFAULT,
-            buf)<0) {
-        HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL,
-            "unable to write raw data to file");
-    }
+    if (H5B_insert(f, H5B_ISTORE, ent->layout->addr, ent->split_ratios, &udata)<0)
+        HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "unable to allocate chunk");
+    if (H5F_block_write(f, H5FD_MEM_DRAW, udata.addr, udata.key.nbytes, H5P_DATASET_XFER_DEFAULT, buf)<0)
+        HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "unable to write raw data to file");
 
     /* Mark cache entry as clean */
     ent->dirty = FALSE;
@@ -2413,21 +2407,32 @@ done:
  *              Robb Matzke, 1999-08-02
  *              The split_ratios are passed in as part of the data transfer
  *              property list.
+ *
+ *              Quincey Koziol, 2002-05-16
+ *              Rewrote algorithm to allocate & write blocks without using
+ *              lock/unlock code.
  *-------------------------------------------------------------------------
 */
+#ifdef H5_HAVE_PARALLEL
 herr_t
 H5F_istore_allocate(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout,
                     const hsize_t *space_dim, H5P_genplist_t *dc_plist)
 {
-
-    int         i, carry;
-    unsigned    u;
-    hssize_t    chunk_offset[H5O_LAYOUT_NDIMS];
-    uint8_t     *chunk=NULL;
-    unsigned    idx_hint=0;
-    hsize_t     chunk_size;
-    H5O_pline_t pline;          /* I/O pipeline information */
-    H5O_fill_t  fill;           /* Fill value information */
+    hssize_t    chunk_offset[H5O_LAYOUT_NDIMS]; /* Offset of current chunk */
+    hsize_t     chunk_size;     /* Size of chunk in bytes */
+    H5O_pline_t pline;          /* I/O pipeline information */
+    H5O_fill_t  fill;           /* Fill value information */
+    H5F_istore_ud1_t udata;     /* B-tree pass-through for creating chunk */
+    void        *chunk=NULL;    /* Chunk buffer for writing fill values */
+    H5P_genplist_t *dx_plist;   /* Data xfer property list */
+    double      split_ratios[3];/* B-tree node splitting ratios */
+    int         mpi_rank;       /* This process's rank */
+    int         mpi_size;       /* Total # of processes */
+    int         mpi_round=0;    /* Current process responsible for I/O */
+    unsigned    chunk_allocated=0; /* Flag to indicate that chunk was actually allocated */
+    int         carry;          /* Flag to indicate that chunk increment carrys to higher dimension (sorta) */
+    int         i;              /* Local index variable */
+    unsigned    u;              /* Local index variable */
     herr_t      ret_value=SUCCEED;      /* Return value */
 
     FUNC_ENTER(H5F_istore_allocate, FAIL);
@@ -2438,52 +2443,86 @@ H5F_istore_allocate(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout,
     assert(layout && H5D_CHUNKED==layout->type);
     assert(layout->ndims>0 && layout->ndims<=H5O_LAYOUT_NDIMS);
     assert(H5F_addr_defined(layout->addr));
+    assert(H5I_GENPROP_LST==H5I_get_type(dxpl_id));
+    assert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER));
+    assert(dc_plist!=NULL);
 
-    /* Get necessary properties from property list */
+    /* Get necessary properties from dataset creation property list */
    if(H5P_get(dc_plist, H5D_CRT_FILL_VALUE_NAME, &fill) < 0)
-        HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get fill value");
+        HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "can't get fill value");
     if(H5P_get(dc_plist, H5D_CRT_DATA_PIPELINE_NAME, &pline) < 0)
-        HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get data pipeline");
+        HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "can't get data pipeline");
+
+    /* Get necessary properties from dataset transfer property list */
+    if (TRUE!=H5P_isa_class(dxpl_id,H5P_DATASET_XFER) || NULL == (dx_plist = H5I_object(dxpl_id)))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list");
+    if(H5P_get(dx_plist,H5D_XFER_BTREE_SPLIT_RATIO_NAME,split_ratios)<0)
+        HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "can't get B-tree split ratios");
+
+    /* Can't use data I/O pipeline in parallel (yet) */
+    if (pline.nfilters>0)
+        HGOTO_ERROR(H5E_STORAGE, H5E_UNSUPPORTED, FAIL, "can't use data pipeline in parallel");
 
     /*
      * Setup indice to go through all chunks. (Future improvement
      * should allocate only chunks that have no file space assigned yet.
      */
     for (u=0, chunk_size=1; u<layout->ndims; u++) {
-        chunk_offset[u]=0;
+        chunk_offset[u] = 0;
         chunk_size *= layout->dim[u];
     } /* end for */
 
-    /* Loop over all chunks */
-    carry=0;
-    while (carry==0) {
-        /* No file space assigned yet.  Allocate it. */
-        /* The following needs improvement like calling the */
-        /* allocation directly rather than indirectly using the */
-        /* allocation effect in the unlock process. */
+    /* Allocate chunk buffer for processes to use when writing fill values */
+    if (NULL==(chunk = H5F_istore_chunk_alloc(chunk_size)))
+        HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "memory allocation failed for chunk");
+    /* Fill the chunk with the proper values */
+    if(fill.buf) {
+        /*
+         * Replicate the fill value throughout the chunk.
+         */
+        assert(0==chunk_size % fill.size);
+        H5V_array_fill(chunk, fill.buf, fill.size, chunk_size/fill.size);
+    } else {
        /*
-         * Lock the chunk, copy from application to chunk, then unlock the
-         * chunk.
+         * No fill value was specified, assume all zeros.
         */
+        HDmemset (chunk, 0, chunk_size);
+    } /* end else */
 
-#ifdef H5_HAVE_PARALLEL
-        /* rky 981207 Serialize access to this critical region. */
-        if (SUCCEED!= H5FD_mpio_wait_for_left_neighbor(f->shared->lf))
-            HGOTO_ERROR (H5E_IO, H5E_WRITEERROR, FAIL, "unable to lock the data chunk");
-#endif
-        if (NULL==(chunk=H5F_istore_lock(f, dxpl_id, layout, &pline,
-                &fill, chunk_offset, FALSE, &idx_hint)))
-            HGOTO_ERROR (H5E_IO, H5E_WRITEERROR, FAIL, "unable to read raw data chunk");
-
-        H5_CHECK_OVERFLOW(chunk_size,hsize_t,size_t);
-        if (H5F_istore_unlock(f, dxpl_id, layout, &pline, TRUE,
-                chunk_offset, &idx_hint, chunk, (size_t)chunk_size)<0)
-            HGOTO_ERROR (H5E_IO, H5E_WRITEERROR, FAIL, "uanble to unlock raw data chunk");
-#ifdef H5_HAVE_PARALLEL
-        if (SUCCEED!= H5FD_mpio_signal_right_neighbor(f->shared->lf))
-            HGOTO_ERROR (H5E_IO, H5E_WRITEERROR, FAIL, "unable to unlock the data chunk");
-#endif
+    /* Retrieve up MPI parameters */
+    if ((mpi_rank=H5FD_mpio_mpi_rank(f->shared->lf))<0)
+        HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI rank");
+    if ((mpi_size=H5FD_mpio_mpi_size(f->shared->lf))<0)
+        HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI size");
+
+    /* Loop over all chunks */
+    carry=0;
+    while (carry==0) {
+        /* Check if the chunk exists yet */
+        if(H5F_istore_get_addr(f,layout,chunk_offset)==HADDR_UNDEF) {
+            /* Initialize the chunk information */
+            udata.mesg = *layout;
+            udata.key.filter_mask = 0;
+            udata.addr = HADDR_UNDEF;
+            udata.key.nbytes = chunk_size;
+            for (u=0; u<layout->ndims; u++)
+                udata.key.offset[u] = chunk_offset[u];
+
+            /* Allocate the chunk with all processes */
+            if (H5B_insert(f, H5B_ISTORE, layout->addr, split_ratios, &udata)<0)
+                HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "unable to allocate chunk");
+
+            /* Round-robin write the chunks out from only one process */
+            if(mpi_round==mpi_rank) {
+                if (H5F_block_write(f, H5FD_MEM_DRAW, udata.addr, udata.key.nbytes, dxpl_id, chunk)<0)
+                    HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "unable to write raw data to file");
+            } /* end if */
+            mpi_round=(++mpi_round)%mpi_size;
+
+            /* Indicate that a chunk was allocated */
+            chunk_allocated=1;
+        } /* end if */
 
         /* Increment indices */
         for (i=layout->ndims-1, carry=1; i>=0 && carry; --i) {
@@ -2496,24 +2535,24 @@ H5F_istore_allocate(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout,
         } /* end for */
     } /* end while */
 
-#ifdef H5_HAVE_PARALLEL
-    /*
-     * rky 980923
-     *
-     * The following barrier is a temporary fix to prevent overwriting real
-     * data caused by a race between one proc's call of H5F_istore_allocate
-     * (from H5D_init_storage, ultimately from H5Dcreate and H5Dextend) and
-     * another proc's call of H5Dwrite.  Eventually, this barrier should be
-     * removed, when H5D_init_storage is changed to call H5MF_alloc directly
-     * to allocate space, instead of calling H5F_istore_unlock.
-     */
-    if (MPI_Barrier(H5FD_mpio_communicator(f->shared->lf)))
-        HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "MPI_Barrier failed");
-#endif
+    /* Only need to block at the barrier if we actually allocated a chunk */
+    if(chunk_allocated) {
+        /* Wait at barrier to avoid race conditions where some processes are
+         * still writing out chunks and other processes race ahead to read
+         * them in, getting bogus data.
+         */
+        if (MPI_Barrier(H5FD_mpio_communicator(f->shared->lf)))
+            HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "MPI_Barrier failed");
+    } /* end if */
 
 done:
+    /* Free the chunk for fill values */
+    if(chunk!=NULL)
+        H5F_istore_chunk_free(chunk);
+
     FUNC_LEAVE(ret_value);
 }
+#endif /* H5_HAVE_PARALLEL */
 
 
 /*-------------------------------------------------------------------------
@@ -2824,7 +2863,6 @@ H5F_istore_initialize_by_extent(H5F_t *f, const H5O_layout_t *layout,
     hsize_t idx_max[H5O_LAYOUT_NDIMS];
     hsize_t sub_size[H5O_LAYOUT_NDIMS];
     hsize_t naccessed;          /*bytes accessed in chunk */
-    hsize_t elm_size;           /*size of an element in bytes */
     hsize_t end_chunk;          /*chunk position counter */
     hssize_t start[H5O_LAYOUT_NDIMS];   /*starting location of hyperslab */
     hsize_t count[H5O_LAYOUT_NDIMS];    /*element count of hyperslab */
@@ -2866,7 +2904,6 @@ H5F_istore_initialize_by_extent(H5F_t *f, const H5O_layout_t *layout,
     for(i = 0; i < rank; i++)
         size[i] = curr_dims[i];
     size[i] = layout->dim[i];
-    elm_size = size[i];
 
     /* Default dataset transfer property list */
     dxpl_id = H5P_DATASET_XFER_DEFAULT;
diff --git a/src/H5Fpkg.h b/src/H5Fpkg.h
index 72cd321..30439b4 100644
--- a/src/H5Fpkg.h
+++ b/src/H5Fpkg.h
@@ -194,10 +194,12 @@ __DLL__ herr_t H5F_istore_write(H5F_t *f, hid_t dxpl_id,
                                 const hsize_t size_m[], const hssize_t offset_m[],
                                 const hssize_t offset[], const hsize_t size[],
                                 const void *buf);
+#ifdef H5_HAVE_PARALLEL
 __DLL__ herr_t H5F_istore_allocate (H5F_t *f, hid_t dxpl_id,
                                     const struct H5O_layout_t *layout, const hsize_t *space_dim,
                                     struct H5P_genplist_t *dc_plist);
+#endif /* H5_HAVE_PARALLEL */
 
 /* Functions that operate on contiguous storage wrt boot block */
 __DLL__ herr_t H5F_contig_read(H5F_t *f, hsize_t max_data, H5FD_mem_t type,
                                haddr_t addr,