summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/H5Dio.c436
-rw-r--r--src/H5Dmpio.c1646
-rw-r--r--src/H5Dpkg.h85
-rw-r--r--src/H5Dselect.c2
4 files changed, 1701 insertions, 468 deletions
diff --git a/src/H5Dio.c b/src/H5Dio.c
index 21face4..6b62ca9 100644
--- a/src/H5Dio.c
+++ b/src/H5Dio.c
@@ -49,6 +49,7 @@
/* Information for mapping between file space and memory space */
+#if 0
/* Structure holding information about a chunk's selection for mapping */
typedef struct H5D_chunk_info_t {
hsize_t index; /* "Index" of chunk in dataset */
@@ -77,8 +78,11 @@ typedef struct fm_map {
hsize_t down_chunks[H5O_LAYOUT_NDIMS]; /* "down" size of number of chunks in each dimension */
H5O_layout_t *layout; /* Dataset layout information*/
H5S_sel_type msel_type; /* Selection type in memory */
+ hsize_t total_chunks; /* Number of total chunks */
+ hbool_t *select_chunk; /* store the information about whether this chunk is selected or not */
} fm_map;
+#endif
/********************/
/* Local Prototypes */
/********************/
@@ -107,12 +111,9 @@ static herr_t H5D_chunk_write(H5D_io_info_t *io_info, hsize_t nelmts,
const H5T_t *mem_type, const H5S_t *mem_space,
const H5S_t *file_space, H5T_path_t *tpath,
hid_t src_id, hid_t dst_id, const void *buf);
+
#ifdef H5_HAVE_PARALLEL
-static herr_t H5D_ioinfo_make_ind(H5D_io_info_t *io_info);
-static herr_t H5D_ioinfo_make_coll(H5D_io_info_t *io_info);
static herr_t H5D_ioinfo_term(H5D_io_info_t *io_info);
-static herr_t H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info,
- const fm_map *fm, int *min_chunkf);
#endif /* H5_HAVE_PARALLEL */
/* I/O info operations */
@@ -125,7 +126,7 @@ static herr_t H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type,
const H5S_t *file_space, const H5S_t *mem_space, fm_map *fm);
static herr_t H5D_destroy_chunk_map(const fm_map *fm);
static herr_t H5D_free_chunk_info(void *item, void *key, void *opdata);
-static herr_t H5D_create_chunk_file_map_hyper(const fm_map *fm);
+static herr_t H5D_create_chunk_file_map_hyper(fm_map *fm, const H5D_t *dset);
static herr_t H5D_create_chunk_mem_map_hyper(const fm_map *fm);
static herr_t H5D_chunk_file_cb(void *elem, hid_t type_id, unsigned ndims,
const hsize_t *coords, void *fm);
@@ -952,6 +953,8 @@ H5D_contig_read(H5D_io_info_t *io_info, hsize_t nelmts,
hsize_t smine_start; /*strip mine start loc */
size_t n, smine_nelmts; /*elements per strip */
H5D_storage_t store; /*union of storage info for dataset */
+
+
herr_t ret_value = SUCCEED; /*return value */
FUNC_ENTER_NOAPI_NOINIT(H5D_contig_read)
@@ -981,10 +984,21 @@ H5D_contig_read(H5D_io_info_t *io_info, hsize_t nelmts,
|| dataset->shared->efl.nused>0 || 0 == nelmts
|| dataset->shared->layout.type==H5D_COMPACT);
H5_CHECK_OVERFLOW(nelmts,hsize_t,size_t);
- status = (io_info->ops.read)(io_info,
+
+#ifdef H5_HAVE_PARALLEL
+ if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) {
+ if(H5D_contig_collective_io(io_info,file_space,mem_space,buf,FALSE)<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't manipulate collective I/O");
+ }
+ else
+#endif
+ {
+ status = (io_info->ops.read)(io_info,
(size_t)nelmts, H5T_get_size(dataset->shared->type),
- file_space, mem_space,
+ file_space, mem_space,0,
buf/*out*/);
+ }
+
#ifdef H5S_DEBUG
H5_timer_end(&(io_info->stats->stats[1].read_timer), &timer);
io_info->stats->stats[1].read_nbytes += nelmts * H5T_get_size(dataset->shared->type);
@@ -1208,6 +1222,7 @@ H5D_contig_write(H5D_io_info_t *io_info, hsize_t nelmts,
hsize_t smine_start; /*strip mine start loc */
size_t n, smine_nelmts; /*elements per strip */
H5D_storage_t store; /*union of storage info for dataset */
+
herr_t ret_value = SUCCEED; /*return value */
FUNC_ENTER_NOAPI_NOINIT(H5D_contig_write)
@@ -1232,10 +1247,20 @@ H5D_contig_write(H5D_io_info_t *io_info, hsize_t nelmts,
H5_timer_begin(&timer);
#endif
H5_CHECK_OVERFLOW(nelmts,hsize_t,size_t);
- status = (io_info->ops.write)(io_info,
+#ifdef H5_HAVE_PARALLEL
+ if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) {
+ if(H5D_contig_collective_io(io_info,file_space,mem_space,buf,TRUE)<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't manipulate collective I/O");
+ }
+ else
+#endif
+ {
+ status = (io_info->ops.write)(io_info,
(size_t)nelmts, H5T_get_size(dataset->shared->type),
- file_space, mem_space,
- buf);
+ file_space, mem_space,0,
+ buf/*out*/);
+ }
+
#ifdef H5S_DEBUG
H5_timer_end(&(io_info->stats->stats[0].write_timer), &timer);
io_info->stats->stats[0].write_nbytes += nelmts * H5T_get_size(mem_type);
@@ -1460,10 +1485,6 @@ H5D_chunk_read(H5D_io_info_t *io_info, hsize_t nelmts,
uint8_t *tconv_buf = NULL; /*data type conv buffer */
uint8_t *bkg_buf = NULL; /*background buffer */
H5D_storage_t store; /*union of EFL and chunk pointer in file space */
-#ifdef H5_HAVE_PARALLEL
- int count_chunk; /* Number of chunks accessed */
- int min_num_chunk; /* Number of chunks to access collectively */
-#endif
herr_t ret_value = SUCCEED; /*return value */
FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_read)
@@ -1489,99 +1510,48 @@ H5D_chunk_read(H5D_io_info_t *io_info, hsize_t nelmts,
|| dataset->shared->efl.nused>0 || 0 == nelmts
|| dataset->shared->layout.type==H5D_COMPACT);
- /* Get first node in chunk skip list */
- chunk_node=H5SL_first(fm.fsel);
-
#ifdef H5_HAVE_PARALLEL
if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) {
- if(H5D_mpio_get_min_chunk(io_info, &fm, &min_num_chunk)<0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunk")
+ if(H5D_mpio_chunk_adjust_iomode(io_info,&fm))
+ HGOTO_ERROR(H5E_DATASET,H5E_CANTGET,FAIL,"can't adjust collective I/O")
+ }
+ /* Temporarily shut down collective IO for chunking */
+ if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) {
+ if(H5D_chunk_collective_io(io_info,&fm,buf,FALSE)<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't manipulate collective I/O");
}
- count_chunk = 0;
-#endif /* H5_HAVE_PARALLEL */
+
+ else {/* sequential or independent read */
+#endif
+ /* Get first node in chunk skip list */
+ chunk_node=H5SL_first(fm.fsel);
- /* Iterate through chunks to be operated on */
- while(chunk_node) {
+ while(chunk_node) {
H5D_chunk_info_t *chunk_info; /* chunk information */
-#ifdef H5_HAVE_PARALLEL
- hbool_t make_ind, make_coll; /* Flags to indicate that the MPI mode should change */
-#endif /* H5_HAVE_PARALLEL */
- /* Get the actual chunk information from the skip list node */
- chunk_info=H5SL_item(chunk_node);
+ /* Get the actual chunk information from the skip list node */
+ chunk_info=H5SL_item(chunk_node);
- /* Pass in chunk's coordinates in a union. */
- store.chunk.offset = chunk_info->coords;
- store.chunk.index = chunk_info->index;
+ /* Pass in chunk's coordinates in a union. */
+ store.chunk.offset = chunk_info->coords;
+ store.chunk.index = chunk_info->index;
-#ifdef H5_HAVE_PARALLEL
- /* Reset flags for changing parallel I/O mode */
- make_ind = make_coll = FALSE;
-
- if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) {
- /* Increment chunk we are operating on */
- count_chunk++;
-
- /* If the number of chunk is greater than minimum number of chunk,
- Do independent read */
- if(count_chunk > min_num_chunk) {
- /* Switch to independent I/O (permanently) */
- make_ind = TRUE;
- }
-#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
- else {
- int is_regular; /* If this chunk's selections are regular */
- int mpi_code; /* MPI error code */
- int all_regular = 0; /* If this chunk's selections are regular on all processes */
-
- /* Determine if this process has regular selections */
- if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE &&
- H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE)
- is_regular = 1;
- else
- is_regular = 0;
-
- /* Determine if all processes have regular selections */
- if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&is_regular, &all_regular, 1, MPI_INT, MPI_LAND, io_info->comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
-
- /* For regular selection,
- if MPI_COMPLEX_DERIVED_DATATYPE is not defined,
- unless spaces for all processors are regular, independent read operation should be performed.*/
- if(!all_regular) {
- /* Switch to independent I/O (temporarily) */
- make_ind = TRUE;
- make_coll = TRUE;
- } /* end if */
- } /* end else */
-#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */
- } /* end if */
+ /* Perform the actual read operation */
+ status = (io_info->ops.read)(io_info,
+ chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace, 0,buf);
- /* Switch to independent I/O */
- if(make_ind)
- if(H5D_ioinfo_make_ind(io_info) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
-#endif /* H5_HAVE_PARALLEL */
+ /* Check return value from optimized read */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, " optimized read failed")
- /* Perform the actual read operation */
- status = (io_info->ops.read)(io_info,
- chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
- chunk_info->fspace, chunk_info->mspace, buf);
+ chunk_node = H5SL_next(chunk_node);
+ }
#ifdef H5_HAVE_PARALLEL
- /* Switch back to collective I/O */
- if(make_coll)
- if(H5D_ioinfo_make_coll(io_info) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
-#endif /* H5_HAVE_PARALLEL */
-
- /* Check return value from optimized read */
- if (status<0)
- HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+ }
+#endif
- /* Get the next chunk node in the skip list */
- chunk_node=H5SL_next(chunk_node);
- } /* end while */
#ifdef H5S_DEBUG
H5_timer_end(&(io_info->stats->stats[1].read_timer), &timer);
@@ -1843,10 +1813,7 @@ H5D_chunk_write(H5D_io_info_t *io_info, hsize_t nelmts,
uint8_t *tconv_buf = NULL; /*data type conv buffer */
uint8_t *bkg_buf = NULL; /*background buffer */
H5D_storage_t store; /*union of EFL and chunk pointer in file space */
-#ifdef H5_HAVE_PARALLEL
- int count_chunk; /* Number of chunks accessed */
- int min_num_chunk; /* Number of chunks to access collectively */
-#endif
+
herr_t ret_value = SUCCEED; /*return value */
FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_write)
@@ -1868,98 +1835,48 @@ H5D_chunk_write(H5D_io_info_t *io_info, hsize_t nelmts,
#endif
#ifdef H5_HAVE_PARALLEL
+ /* Check whether the collective mode can be turned off globally*/
+
if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) {
- if(H5D_mpio_get_min_chunk(io_info, &fm, &min_num_chunk)<0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunk")
+ if(H5D_mpio_chunk_adjust_iomode(io_info,&fm))
+ HGOTO_ERROR(H5E_DATASET,H5E_CANTGET,FAIL,"can't adjust collective I/O")
+ }
+ if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) {
+ if(H5D_chunk_collective_io(io_info,&fm,buf,TRUE)<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't manipulate collective I/O");
}
- count_chunk = 0;
-#endif /* H5_HAVE_PARALLEL */
+ else {/* sequential or independent write */
+
+ #endif /* H5_HAVE_PARALLEL */
+ /* Get first node in chunk skip list */
+ chunk_node=H5SL_first(fm.fsel);
- /* Get first node in chunk skip list */
- chunk_node=H5SL_first(fm.fsel);
-
- /* Iterate through chunks to be operated on */
- while(chunk_node) {
+ while(chunk_node) {
H5D_chunk_info_t *chunk_info; /* Chunk information */
-#ifdef H5_HAVE_PARALLEL
- hbool_t make_ind, make_coll; /* Flags to indicate that the MPI mode should change */
-#endif /* H5_HAVE_PARALLEL */
- /* Get the actual chunk information from the skip list node */
- chunk_info=H5SL_item(chunk_node);
+ /* Get the actual chunk information from the skip list node */
+ chunk_info=H5SL_item(chunk_node);
- /* Pass in chunk's coordinates in a union. */
- store.chunk.offset = chunk_info->coords;
- store.chunk.index = chunk_info->index;
+ /* Pass in chunk's coordinates in a union. */
+ store.chunk.offset = chunk_info->coords;
+ store.chunk.index = chunk_info->index;
-#ifdef H5_HAVE_PARALLEL
- /* Reset flags for changing parallel I/O mode */
- make_ind = make_coll = FALSE;
-
- if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) {
- /* Increment chunk we are operating on */
- count_chunk++;
-
- /* If the number of chunk is greater than minimum number of chunk,
- Do independent write */
- if(count_chunk > min_num_chunk) {
- /* Switch to independent I/O (permanently) */
- make_ind = TRUE;
- }
-#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
- else {
- int is_regular; /* If this chunk's selections are regular */
- int mpi_code; /* MPI error code */
- int all_regular = 0; /* If this chunk's selections are regular on all processes */
-
- /* Determine if this process has regular selections */
- if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE &&
- H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE)
- is_regular = 1;
- else
- is_regular = 0;
-
- /* Determine if all processes have regular selections */
- if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&is_regular, &all_regular, 1, MPI_INT, MPI_LAND, io_info->comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
-
- /* For regular selection,
- if MPI_COMPLEX_DERIVED_DATATYPE is not defined,
- unless spaces for all processors are regular, independent read operation should be performed.*/
- if(!all_regular) {
- /* Switch to independent I/O (temporarily) */
- make_ind = TRUE;
- make_coll = TRUE;
- } /* end if */
- } /* end else */
-#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */
- } /* end if */
-
- /* Switch to independent I/O */
- if(make_ind)
- if(H5D_ioinfo_make_ind(io_info) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
-#endif /* H5_HAVE_PARALLEL */
+ /* Perform the actual read operation */
- /* Perform the actual write operation */
- status = (io_info->ops.write)(io_info,
- chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
- chunk_info->fspace, chunk_info->mspace, buf);
+ status = (io_info->ops.write)(io_info,
+ chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace, 0,buf);
-#ifdef H5_HAVE_PARALLEL
- /* Switch back to collective I/O */
- if(make_coll)
- if(H5D_ioinfo_make_coll(io_info) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
-#endif /* H5_HAVE_PARALLEL */
+ /* Check return value from optimized read */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, " optimized read failed")
- /* Check return value from optimized write */
- if (status<0)
- HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+ chunk_node = H5SL_next(chunk_node);
- /* Get the next chunk node in the skip list */
- chunk_node=H5SL_next(chunk_node);
- } /* end while */
+ }
+#ifdef H5_HAVE_PARALLEL
+ }
+#endif
#ifdef H5S_DEBUG
H5_timer_end(&(io_info->stats->stats[0].write_timer), &timer);
@@ -2211,6 +2128,7 @@ H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *f
H5S_sel_type fsel_type; /* Selection type on disk */
char bogus; /* "bogus" buffer to pass to selection iterator */
unsigned u; /* Local index variable */
+ hbool_t sel_hyper_flag;
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI_NOINIT(H5D_create_chunk_map)
@@ -2218,6 +2136,7 @@ H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *f
/* Get layout for dataset */
fm->layout = &(dataset->shared->layout);
+
/* Check if the memory space is scalar & make equivalent memory space */
if((sm_ndims = H5S_GET_EXTENT_NDIMS(mem_space))<0)
HGOTO_ERROR (H5E_DATASPACE, H5E_CANTGET, FAIL, "unable to get dimension number")
@@ -2272,6 +2191,14 @@ H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *f
if(H5V_array_down(f_ndims,fm->chunks,fm->down_chunks)<0)
HGOTO_ERROR (H5E_INTERNAL, H5E_BADVALUE, FAIL, "can't compute 'down' sizes")
+ /* calculate total chunk in file map*/
+ fm->select_chunk = NULL;
+ fm->total_chunks = 1;
+ for(u=0; u<fm->f_ndims; u++)
+ fm->total_chunks= fm->total_chunks*fm->chunks[u];
+
+
+
/* Initialize skip list for chunk selections */
if((fm->fsel=H5SL_create(H5SL_TYPE_HSIZE,0.5,H5D_DEFAULT_SKIPLIST_HEIGHT))==NULL)
HGOTO_ERROR(H5E_DATASET,H5E_CANTCREATE,FAIL,"can't create skip list for chunk selections")
@@ -2291,8 +2218,13 @@ H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *f
if((fm->msel_type=H5S_GET_SELECT_TYPE(equiv_mspace))<H5S_SEL_NONE)
HGOTO_ERROR (H5E_DATASET, H5E_BADSELECT, FAIL, "unable to convert from file to memory data space")
+ /* If the selection is NONE or POINTS, set the flag to FALSE */
+ if(fsel_type == H5S_SEL_POINTS || fsel_type == H5S_SEL_NONE)
+ sel_hyper_flag = FALSE;
+ else
+ sel_hyper_flag = TRUE;
/* Check if file selection is a point selection */
- if(fsel_type==H5S_SEL_POINTS) {
+ if(!sel_hyper_flag) {
/* Create temporary datatypes for selection iteration */
if((f_tid = H5I_register(H5I_DATATYPE, H5T_copy(dataset->shared->type, H5T_COPY_ALL)))<0)
HGOTO_ERROR (H5E_DATATYPE, H5E_CANTREGISTER, FAIL, "unable to register file datatype")
@@ -2307,7 +2239,7 @@ H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *f
} /* end if */
else {
/* Build the file selection for each chunk */
- if(H5D_create_chunk_file_map_hyper(fm)<0)
+ if(H5D_create_chunk_file_map_hyper(fm,dataset)<0)
HGOTO_ERROR (H5E_DATASET, H5E_CANTINIT, FAIL, "unable to create file chunk selections")
/* Clean file chunks' hyperslab span "scratch" information */
@@ -2329,7 +2261,7 @@ H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *f
} /* end else */
/* Build the memory selection for each chunk */
- if(fsel_type!=H5S_SEL_POINTS && H5S_select_shape_same(file_space,equiv_mspace)==TRUE) {
+ if(sel_hyper_flag && H5S_select_shape_same(file_space,equiv_mspace)==TRUE) {
/* Reset chunk template information */
fm->mchunk_tmpl=NULL;
@@ -2500,7 +2432,7 @@ H5D_destroy_chunk_map(const fm_map *fm)
if(fm->mchunk_tmpl)
if(H5S_close(fm->mchunk_tmpl)<0)
HGOTO_ERROR(H5E_DATASPACE, H5E_CANTRELEASE, FAIL, "can't release memory chunk dataspace template")
-
+ if(fm->select_chunk) H5MM_xfree(fm->select_chunk);
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D_destroy_chunk_map() */
@@ -2519,7 +2451,7 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5D_create_chunk_file_map_hyper(const fm_map *fm)
+H5D_create_chunk_file_map_hyper(fm_map *fm,const H5D_t *dset)
{
hssize_t ssel_points; /* Number of elements in file selection */
hsize_t sel_points; /* Number of elements in file selection */
@@ -2548,12 +2480,18 @@ H5D_create_chunk_file_map_hyper(const fm_map *fm)
HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "can't get file selection bound info")
/* Set initial chunk location & hyperslab size */
+
for(u=0; u<fm->f_ndims; u++) {
start_coords[u]=(sel_start[u]/fm->layout->u.chunk.dim[u])*fm->layout->u.chunk.dim[u];
coords[u]=start_coords[u];
end[u]=(coords[u]+fm->chunk_dim[u])-1;
} /* end for */
+ if(IS_H5FD_MPI(dset->oloc.file)) {
+ if(NULL == (fm->select_chunk = (hbool_t *) H5MM_calloc(fm->total_chunks*sizeof(hbool_t))))
+ HGOTO_ERROR (H5E_RESOURCE, H5E_NOSPACE, FAIL, "can't allocate chunk info")
+ }
+
/* Calculate the index of this chunk */
if(H5V_chunk_index(fm->f_ndims,coords,fm->layout->u.chunk.dim,fm->down_chunks,&chunk_index)<0)
HGOTO_ERROR (H5E_DATASPACE, H5E_BADRANGE, FAIL, "can't get chunk index")
@@ -2608,6 +2546,10 @@ H5D_create_chunk_file_map_hyper(const fm_map *fm)
/* Set the chunk index */
new_chunk_info->index=chunk_index;
+ /* store chunk selection information */
+ if(IS_H5FD_MPI(dset->oloc.file))
+ fm->select_chunk[chunk_index] = TRUE;
+
/* Set the file chunk dataspace */
new_chunk_info->fspace=tmp_fchunk;
@@ -3140,96 +3082,6 @@ done:
#ifdef H5_HAVE_PARALLEL
/*-------------------------------------------------------------------------
- * Function: H5D_ioinfo_make_ind
- *
- * Purpose: Switch to MPI independent I/O
- *
- * Return: Non-negative on success/Negative on failure
- *
- * Programmer: Quincey Koziol
- * Friday, August 12, 2005
- *
- *-------------------------------------------------------------------------
- */
-static herr_t
-H5D_ioinfo_make_ind(H5D_io_info_t *io_info)
-{
- H5P_genplist_t *dx_plist; /* Data transer property list */
- herr_t ret_value = SUCCEED; /*return value */
-
- FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_ind)
-
- /* Get the dataset transfer property list */
- if (NULL == (dx_plist = H5I_object(io_info->dxpl_id)))
- HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list")
-
- /* Change the xfer_mode to independent, handle the request,
- * then set xfer_mode before return.
- */
- io_info->dxpl_cache->xfer_mode = H5FD_MPIO_INDEPENDENT;
- if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0)
- HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
-
- /* Set the pointers to the non-MPI-specific routines */
- io_info->ops.read = H5D_select_read;
- io_info->ops.write = H5D_select_write;
-
- /* Indicate that the transfer mode should be restored before returning
- * to user.
- */
- io_info->xfer_mode_changed=TRUE;
-
-done:
- FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5D_ioinfo_make_ind() */
-
-
-/*-------------------------------------------------------------------------
- * Function: H5D_ioinfo_make_coll
- *
- * Purpose: Switch to MPI collective I/O
- *
- * Return: Non-negative on success/Negative on failure
- *
- * Programmer: Quincey Koziol
- * Friday, August 12, 2005
- *
- *-------------------------------------------------------------------------
- */
-static herr_t
-H5D_ioinfo_make_coll(H5D_io_info_t *io_info)
-{
- H5P_genplist_t *dx_plist; /* Data transer property list */
- herr_t ret_value = SUCCEED; /*return value */
-
- FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_coll)
-
- /* Get the dataset transfer property list */
- if (NULL == (dx_plist = H5I_object(io_info->dxpl_id)))
- HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list")
-
- /* Change the xfer_mode to independent, handle the request,
- * then set xfer_mode before return.
- */
- io_info->dxpl_cache->xfer_mode = H5FD_MPIO_COLLECTIVE;
- if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0)
- HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
-
- /* Set the pointers to the MPI-specific routines */
- io_info->ops.read = H5D_mpio_select_read;
- io_info->ops.write = H5D_mpio_select_write;
-
- /* Indicate that the transfer mode should _NOT_ be restored before returning
- * to user.
- */
- io_info->xfer_mode_changed=FALSE;
-
-done:
- FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5D_ioinfo_make_coll() */
-
-
-/*-------------------------------------------------------------------------
* Function: H5D_ioinfo_term
*
* Purpose: Common logic for terminating an I/O info object
@@ -3267,38 +3119,4 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D_ioinfo_term() */
-
-/*-------------------------------------------------------------------------
- * Function: H5D_mpio_get_min_chunk
- *
- * Purpose: Routine for obtaining minimum number of chunks to cover
- * hyperslab selection selected by all processors.
- *
- * Return: Non-negative on success/Negative on failure
- *
- * Programmer:
- *
- *-------------------------------------------------------------------------
- */
-static herr_t
-H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info,
- const fm_map *fm, int *min_chunkf)
-{
- int num_chunkf; /* Number of chunks to iterate over */
- int mpi_code; /* MPI return code */
- herr_t ret_value = SUCCEED;
-
- FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_min_chunk);
-
- /* Get the number of chunks to perform I/O on */
- num_chunkf = H5SL_count(fm->fsel);
-
- /* Determine the minimum # of chunks for all processes */
- if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, min_chunkf, 1, MPI_INT, MPI_MIN, io_info->comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
-
-done:
- FUNC_LEAVE_NOAPI(ret_value);
-} /* end H5D_mpio_get_min_chunk() */
-#endif /*H5_HAVE_PARALLEL*/
-
+#endif
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index 9d8821a..2bec70d 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -26,19 +26,23 @@
/****************/
#define H5D_PACKAGE /*suppress error about including H5Dpkg */
+/*#define KENT */
/***********/
/* Headers */
/***********/
#include "H5private.h" /* Generic Functions */
+#include "H5Iprivate.h"
#include "H5Dpkg.h" /* Datasets */
#include "H5Eprivate.h" /* Error handling */
#include "H5Fprivate.h" /* File access */
#include "H5FDprivate.h" /* File drivers */
+#include "H5MMprivate.h"
#include "H5Oprivate.h" /* Object headers */
#include "H5Pprivate.h" /* Property lists */
#include "H5Sprivate.h" /* Dataspaces */
+#include "H5Vprivate.h" /* Vector */
#ifdef H5_HAVE_PARALLEL
@@ -46,20 +50,109 @@
/* Local Macros */
/****************/
+/* Macros to represent different IO options */
+#define H5D_ONE_LINK_CHUNK_IO 0
+#define H5D_MULTI_CHUNK_IO 1
+
+/***** Macros for One linked collective IO case. *****/
+/* The default value to do one linked collective IO for all chunks.
+ If the average number of chunks per process is greater than this value,
+ the library will create an MPI derived datatype to link all chunks to do collective IO.
+ The user can set this value through an API. */
+#define H5D_ONE_LINK_CHUNK_IO_THRESHOLD 0 /* always set this option with value 0*/
+
+/* Macros to represent options on how to obtain chunk address for one linked-chunk IO case */
+#define H5D_OBTAIN_ONE_CHUNK_ADDR_IND 0
+#define H5D_OBTAIN_ALL_CHUNK_ADDR_IND 1
+#define H5D_OBTAIN_ALL_CHUNK_ADDR_COL 2
+
+/* Macros to define the default ratio of obtaining all chunk addresses for one linked-chunk IO case */
+#define H5D_ALL_CHUNK_ADDR_THRES_IND 10
+#define H5D_ALL_CHUNK_ADDR_THRES_COL 20
+
+/***** Macros for multi-chunk collective IO case. *****/
+/* The default value of the threshold to do collective IO for this chunk.
+ If the average number of processes per chunk is greater than the default value,
+ collective IO is done for this chunk.
+*/
+#define H5D_MULTI_CHUNK_IO_COL_THRESHOLD 50
+
+/* Macros to represent different IO modes(NONE, Independent or collective)for multiple chunk IO case */
+#define H5D_CHUNK_IO_MODE_IND 0
+#define H5D_CHUNK_IO_MODE_COL 1
+
+/* Macros to represent the regularity of the selection for multiple chunk IO case. */
+#define H5D_CHUNK_SELECT_REG 1
+#define H5D_CHUNK_SELECT_IRREG 2
+#define H5D_CHUNK_SELECT_NONE 0
+
/******************/
/* Local Typedefs */
/******************/
+/* Combine chunk address and chunk info into a struct for better performance. */
+typedef struct H5D_chunk_addr_info_t {
+ haddr_t chunk_addr;
+ H5D_chunk_info_t chunk_info;
+} H5D_chunk_addr_info_t;
+
+/* Combine all information that needs to know for collective MPI-IO of this selection. */
+typedef struct H5D_common_coll_info_t {
+ hbool_t mbt_is_derived;
+ hbool_t mft_is_derived;
+ size_t mpi_buf_count;
+ haddr_t chunk_addr;
+} H5D_common_coll_info_t;
+
/********************/
/* Local Prototypes */
/********************/
-/* For regular hyperslab selection. */
+static herr_t
+H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf,
+ hbool_t do_write);
+
+static herr_t
+H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf,
+ hbool_t do_write,int sum_chunk);
+
+static herr_t
+H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space,
+ const H5S_t *mem_space,haddr_t addr,
+ void *buf, hbool_t do_write );
+
+static herr_t
+H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,
+ MPI_Datatype *mpi_buf_type,
+ H5D_common_coll_info_t* coll_info,
+ void *buf, hbool_t do_write);
+#ifdef OLD_WAY
static herr_t
-H5D_mpio_spaces_xfer(H5D_io_info_t *io_info, size_t elmt_size,
- const H5S_t *file_space, const H5S_t *mem_space,
- void *buf/*out*/,
- hbool_t do_write);
+H5D_pre_sort_chunk(H5D_io_info_t *io_info,int total_chunks,
+ haddr_t total_chunk_addr_array[]);
+#endif
+
+static herr_t
+H5D_sort_chunk(H5D_io_info_t * io_info,
+ fm_map *fm,
+ H5D_chunk_addr_info_t chunk_addr_info_array[],
+ int many_chunk_opt);
+
+static herr_t
+H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
+ fm_map *fm,
+ uint8_t assign_io_mode[],
+ haddr_t chunk_addr[]);
+
+static herr_t H5D_ioinfo_make_ind(H5D_io_info_t *io_info);
+static herr_t H5D_ioinfo_make_coll(H5D_io_info_t *io_info);
+static herr_t H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info,
+ const fm_map *fm, int *min_chunkf);
+static int H5D_cmp_chunk_addr(const void *addr1, const void *addr2);
+static herr_t
+H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info,
+ const fm_map *fm, int *sum_chunkf);
+
/*********************/
/* Package Variables */
@@ -104,7 +197,7 @@ H5D_mpio_opt_possible( const H5D_io_info_t *io_info,
if (io_info->dxpl_cache->xfer_mode==H5FD_MPIO_INDEPENDENT)
HGOTO_DONE(FALSE);
- /* Optimized MPI types flag must be set and it is must be collective IO */
+ /* Optimized MPI types flag must be set and it must be collective IO */
/* (Don't allow parallel I/O for the MPI-posix driver, since it doesn't do real collective I/O) */
if (!(H5S_mpi_opt_types_g && io_info->dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE && !IS_H5FD_MPIPOSIX(io_info->dset->oloc.file))) {
local_opinion = FALSE;
@@ -131,14 +224,14 @@ H5D_mpio_opt_possible( const H5D_io_info_t *io_info,
goto broadcast;
} /* end if */
- /*The handling of memory space is different for chunking
+ /* The handling of memory space is different for chunking
and contiguous storage,
For contigous storage, mem_space and file_space won't
change when it it is doing disk IO.
For chunking storage, mem_space will change for different
chunks. So for chunking storage, whether we can use
- collective IO will defer until the each chunk IO is reached.
- For contiguous storage, if we find the MPI-IO cannot
+ collective IO will defer until each chunk IO is reached.
+ For contiguous storage, if we find MPI-IO cannot
support complicated MPI derived data type, we will
set use_par_opt_io = FALSE.
*/
@@ -184,124 +277,53 @@ done:
/*-------------------------------------------------------------------------
- * Function: H5D_mpio_spaces_xfer
+ * Function: H5D_mpio_chunk_adjust_iomode
*
- * Purpose: Use MPI-IO to transfer data efficiently
- * directly between app buffer and file.
- *
- * Return: non-negative on success, negative on failure.
+ * Purpose: Checks if it is possible to do collective IO
*
- * Programmer: rky 980813
- *
- * Notes:
- * For collective data transfer only since this would eventually call
- * H5FD_mpio_setup to do setup to eveually call MPI_File_set_view in
- * H5FD_mpio_read or H5FD_mpio_write. MPI_File_set_view is a collective
- * call. Letting independent data transfer use this route would result in
- * hanging.
+ * Return: Success: Non-negative: TRUE or FALSE
+ * Failure: Negative
*
- * The preconditions for calling this routine are located in the
- * H5S_mpio_opt_possible() routine, which determines whether this routine
- * can be called for a given dataset transfer.
+ * Programmer: Muqun Yang
+ * Monday, Feb. 13th, 2006
*
*-------------------------------------------------------------------------
*/
-static herr_t
-H5D_mpio_spaces_xfer(H5D_io_info_t *io_info, size_t elmt_size,
- const H5S_t *file_space, const H5S_t *mem_space,
- void *_buf /*out*/, hbool_t do_write )
-{
- haddr_t addr; /* Address of dataset (or selection) within file */
- size_t mpi_buf_count, mpi_file_count; /* Number of "objects" to transfer */
- hsize_t mpi_buf_offset, mpi_file_offset; /* Offset within dataset where selection (ie. MPI type) begins */
- MPI_Datatype mpi_buf_type, mpi_file_type; /* MPI types for buffer (memory) and file */
- hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */
- mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */
- hbool_t plist_is_setup=0; /* Whether the dxpl has been customized */
- uint8_t *buf=(uint8_t *)_buf; /* Alias for pointer arithmetic */
- int mpi_code; /* MPI return code */
- herr_t ret_value = SUCCEED; /* Return value */
-
- FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_spaces_xfer);
+herr_t
+H5D_mpio_chunk_adjust_iomode(H5D_io_info_t *io_info, const fm_map *fm) {
- /* Check args */
- assert (io_info);
- assert (io_info->dset);
- assert (file_space);
- assert (mem_space);
- assert (buf);
- assert (IS_H5FD_MPIO(io_info->dset->oloc.file));
- /* Make certain we have the correct type of property list */
- assert(TRUE==H5P_isa_class(io_info->dxpl_id,H5P_DATASET_XFER));
+ int min_chunk;
+ herr_t ret_value = SUCCEED;
- /* create the MPI buffer type */
- if (H5S_mpio_space_type( mem_space, elmt_size,
- /* out: */
- &mpi_buf_type,
- &mpi_buf_count,
- &mpi_buf_offset,
- &mbt_is_derived )<0)
- HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buf type");
-
- /* create the MPI file type */
- if ( H5S_mpio_space_type( file_space, elmt_size,
- /* out: */
- &mpi_file_type,
- &mpi_file_count,
- &mpi_file_offset,
- &mft_is_derived )<0)
- HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type");
-
- /* Get the base address of the contiguous dataset or the chunk */
- if(io_info->dset->shared->layout.type == H5D_CONTIGUOUS)
- addr = H5D_contig_get_addr(io_info->dset) + mpi_file_offset;
- else {
- haddr_t chunk_addr; /* for collective chunk IO */
+ FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_chunk_adjust_iomode)
- assert(io_info->dset->shared->layout.type == H5D_CHUNKED);
- chunk_addr=H5D_istore_get_addr(io_info,NULL);
- addr = H5F_BASE_ADDR(io_info->dset->oloc.file) + chunk_addr + mpi_file_offset;
- }
+#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS
+ if(H5D_mpio_get_min_chunk(io_info,fm,&min_chunk)<0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes");
+ if(min_chunk == 0) {
+ H5P_genplist_t *dx_plist; /* Data transer property list */
- /*
- * Pass buf type, file type to the file driver. Request an MPI type
- * transfer (instead of an elementary byteblock transfer).
- */
- if(H5FD_mpi_setup_collective(io_info->dxpl_id, mpi_buf_type, mpi_file_type)<0)
- HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties");
- plist_is_setup=1;
-
- /* Adjust the buffer pointer to the beginning of the selection */
- buf+=mpi_buf_offset;
-
- /* transfer the data */
- if (do_write) {
- if (H5F_block_write(io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf) <0)
- HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL,"MPI write failed");
- } else {
- if (H5F_block_read (io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf) <0)
- HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL,"MPI read failed");
- }
+ /* Get the dataset transfer property list */
+ if (NULL == (dx_plist = H5I_object(io_info->dxpl_id)))
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset creation property list")
-done:
- /* Reset the dxpl settings */
- if(plist_is_setup) {
- if(H5FD_mpi_teardown_collective(io_info->dxpl_id)<0)
- HDONE_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "unable to reset dxpl values");
- } /* end if */
+ /* Change the xfer_mode to independent for handling the I/O */
+ io_info->dxpl_cache->xfer_mode = H5FD_MPIO_INDEPENDENT;
+ if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
- /* free the MPI buf and file types */
- if (mbt_is_derived) {
- if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &mpi_buf_type )))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
- }
- if (mft_is_derived) {
- if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &mpi_file_type )))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
- }
+ /* Indicate that the transfer mode should be restored before returning
+ * to user.
+ */
+ io_info->xfer_mode_changed = TRUE;
+ io_info->ops.read = H5D_select_read;
+ io_info->ops.write = H5D_select_write;
+ } /* end if */
+#endif
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+}
- FUNC_LEAVE_NOAPI(ret_value);
-} /* end H5D_mpio_spaces_xfer() */
/*-------------------------------------------------------------------------
@@ -317,17 +339,20 @@ done:
*/
herr_t
H5D_mpio_select_read(H5D_io_info_t *io_info,
- size_t UNUSED nelmts, size_t elmt_size,
- const H5S_t *file_space, const H5S_t *mem_space,
- void *buf/*out*/)
+ size_t mpi_buf_count,
+ size_t elmt_size,
+ const H5S_t UNUSED *file_space,
+ const H5S_t UNUSED *mem_space,
+ haddr_t addr,
+ void *buf/*out*/)
{
- herr_t ret_value;
+ herr_t ret_value = SUCCEED;
- FUNC_ENTER_NOAPI_NOFUNC(H5D_mpio_select_read);
-
- ret_value = H5D_mpio_spaces_xfer(io_info, elmt_size, file_space,
- mem_space, buf, 0/*read*/);
+ FUNC_ENTER_NOAPI(H5D_mpio_select_read,FAIL);
+ if(H5F_block_read (io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf) < 0)
+ HGOTO_ERROR(H5E_IO,H5E_READERROR,FAIL,"can't finish collective parallel read");
+done:
FUNC_LEAVE_NOAPI(ret_value);
} /* end H5D_mpio_select_read() */
@@ -345,19 +370,1382 @@ H5D_mpio_select_read(H5D_io_info_t *io_info,
*/
herr_t
H5D_mpio_select_write(H5D_io_info_t *io_info,
- size_t UNUSED nelmts, size_t elmt_size,
- const H5S_t *file_space, const H5S_t *mem_space,
- const void *buf)
+ size_t mpi_buf_count,
+ size_t elmt_size,
+ const H5S_t UNUSED *file_space,
+ const H5S_t UNUSED *mem_space,
+ haddr_t addr,
+ const void *buf)
{
herr_t ret_value;
- FUNC_ENTER_NOAPI_NOFUNC(H5D_mpio_select_write);
+ FUNC_ENTER_NOAPI(H5D_mpio_select_write,FAIL);
/*OKAY: CAST DISCARDS CONST QUALIFIER*/
- ret_value = H5D_mpio_spaces_xfer(io_info, elmt_size, file_space,
- mem_space, (void*)buf, 1/*write*/);
+ if(H5F_block_write (io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf)<0)
+ HGOTO_ERROR(H5E_IO,H5E_WRITEERROR,FAIL,"can't finish collective parallel write");
+done:
+ FUNC_LEAVE_NOAPI(ret_value);
+} /* end H5D_mpio_select_write() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_ioinfo_make_ind
+ *
+ * Purpose: Switch to MPI independent I/O
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Quincey Koziol
+ * Friday, August 12, 2005
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D_ioinfo_make_ind(H5D_io_info_t *io_info)
+{
+ H5P_genplist_t *dx_plist; /* Data transer property list */
+ herr_t ret_value = SUCCEED; /*return value */
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_ind)
+
+ /* Get the dataset transfer property list */
+ if (NULL == (dx_plist = H5I_object(io_info->dxpl_id)))
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list")
+
+ /* Change the xfer_mode to independent, handle the request,
+ * then set xfer_mode before return.
+ */
+ io_info->dxpl_cache->xfer_mode = H5FD_MPIO_INDEPENDENT;
+ if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
+
+ /* Set the pointers to the non-MPI-specific routines */
+ io_info->ops.read = H5D_select_read;
+ io_info->ops.write = H5D_select_write;
+
+ /* Indicate that the transfer mode should be restored before returning
+ * to user.
+ */
+ io_info->xfer_mode_changed=TRUE;
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_ioinfo_make_ind() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_ioinfo_make_coll
+ *
+ * Purpose: Switch to MPI collective I/O
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Quincey Koziol
+ * Friday, August 12, 2005
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D_ioinfo_make_coll(H5D_io_info_t *io_info)
+{
+ H5P_genplist_t *dx_plist; /* Data transer property list */
+ herr_t ret_value = SUCCEED; /*return value */
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_coll)
+
+ /* Get the dataset transfer property list */
+ if (NULL == (dx_plist = H5I_object(io_info->dxpl_id)))
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list")
+
+ /* Change the xfer_mode to independent, handle the request,
+ * then set xfer_mode before return.
+ */
+ io_info->dxpl_cache->xfer_mode = H5FD_MPIO_COLLECTIVE;
+ if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
+
+ /* Set the pointers to the MPI-specific routines */
+ io_info->ops.read = H5D_mpio_select_read;
+ io_info->ops.write = H5D_mpio_select_write;
+
+ /* Indicate that the transfer mode should _NOT_ be restored before returning
+ * to user.
+ */
+ io_info->xfer_mode_changed=FALSE;
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_ioinfo_make_coll() */
+
+
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_mpio_get_min_chunk
+ *
+ * Purpose: Routine for obtaining minimum number of chunks to cover
+ * hyperslab selection selected by all processors.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info,
+ const fm_map *fm, int *min_chunkf)
+{
+ int num_chunkf; /* Number of chunks to iterate over */
+ int mpi_code; /* MPI return code */
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_min_chunk);
+
+ /* Get the number of chunks to perform I/O on */
+ num_chunkf = H5SL_count(fm->fsel);
+
+ /* Determine the minimum # of chunks for all processes */
+ if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, min_chunkf, 1, MPI_INT, MPI_MIN, io_info->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
+
+done:
FUNC_LEAVE_NOAPI(ret_value);
-} /* end H5D_mpio_spaces_write() */
+} /* end H5D_mpio_get_min_chunk() */
+/*-------------------------------------------------------------------------
+ * Function: H5D_mpio_get_sum_chunk
+ *
+ * Purpose: Routine for obtaining total number of chunks to cover
+ * hyperslab selection selected by all processors.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info,
+ const fm_map *fm, int *sum_chunkf)
+{
+ int num_chunkf; /* Number of chunks to iterate over */
+ size_t ori_num_chunkf;
+ int mpi_code; /* MPI return code */
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_sum_chunk);
+
+ /* Get the number of chunks to perform I/O on */
+ num_chunkf = 0;
+ ori_num_chunkf = H5SL_count(fm->fsel);
+ H5_ASSIGN_OVERFLOW(num_chunkf,ori_num_chunkf,size_t,int);
+
+ /* Determine the minimum # of chunks for all processes */
+ if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, sum_chunkf, 1, MPI_INT, MPI_SUM, io_info->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value);
+} /* end H5D_mpio_get_sum_chunk() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_contig_collective_io
+ *
+ * Purpose: Routine for
+ * 1) building up MPI derived datatype
+ * 2) setting up collective IO property list
+ * 3) Do IO
+ *
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5D_contig_collective_io(H5D_io_info_t *io_info,
+ const H5S_t *file_space,
+ const H5S_t *mem_space,
+ void *buf,
+ hbool_t do_write)
+{
+
+
+ haddr_t addr = HADDR_UNDEF; /* Address of dataset (or selection) within file */
+ hbool_t select_valid;
+ herr_t ret_value = SUCCEED; /* return value */
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_contig_collective_io)
+ assert (IS_H5FD_MPIO(io_info->dset->oloc.file));
+
+ /* Make certain we have the correct type of property list */
+ assert(TRUE==H5P_isa_class(io_info->dxpl_id,H5P_DATASET_XFER));
+
+ /* Get the base address of the contiguous dataset */
+ if(io_info->dset->shared->layout.type == H5D_CONTIGUOUS)
+ addr = H5D_contig_get_addr(io_info->dset);
+
+ if(H5D_inter_collective_io(io_info,file_space,mem_space,addr,buf,do_write)<0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
+
+ done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_contig_collective_io */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_chunk_collective_io
+ *
+ * Purpose: Routine for
+ 1) choose an IO option:
+ a) One collective IO defined by one MPI derived datatype to link through all chunks
+ or b) multiple chunk IOs,to do MPI-IO for each chunk, the IO mode may be adjusted
+ due to the selection pattern for each chunk.
+ * For option a)
+ 1. Sort the chunk address, obtain chunk info according to the sorted chunk address
+ 2. Build up MPI derived datatype for each chunk
+ 3. Build up the final MPI derived datatype
+ 4. Set up collective IO property list
+ 5. Do IO
+ * For option b)
+ 1. Use MPI_gather and MPI_Bcast to obtain information of *collective/independent/none*
+ IO mode for each chunk of the data space
+ 2. Depending on whether the IO mode is collective or independent or none,
+ Create either MPI derived datatype for each chunk to do collective IO or just do independent IO
+ 3. Set up collective IO property list for collective mode
+ 4. DO IO
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf, hbool_t do_write)
+{
+
+ int io_option = H5D_MULTI_CHUNK_IO;
+ int min_chunk,sum_chunk,mpi_size;
+ int one_link_chunk_io_threshold;
+ herr_t ret_value = SUCCEED;
+
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_collective_io)
+
+ assert (IS_H5FD_MPIO(io_info->dset->oloc.file));
+
+ if(H5D_mpio_get_sum_chunk(io_info,fm,&sum_chunk)<0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes");
+ if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
+ HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
+ one_link_chunk_io_threshold = H5D_ONE_LINK_CHUNK_IO_THRESHOLD;/*This should be replaced by the user inputting value from API. */
+
+ /* step 1: choose an IO option */
+ /* If the average number of chunk per process is greater than a threshold, we will do one link chunked IO. */
+ if(sum_chunk/mpi_size >= one_link_chunk_io_threshold) io_option = H5D_ONE_LINK_CHUNK_IO;
+
+/* If this MPI-IO package doesn't support collective IO when no IO is done for one or more processes,
+ use MULTIPLE CHUNK IO */
+/*
+#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS
+ if(H5D_mpio_get_min_chunk(io_info,fm,&min_chunk)<0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes");
+ if(min_chunk == 0) io_option = H5D_MULTI_CHUNK_IO;
+#endif
+*/
+
+#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+ if(io_option == H5D_ONE_LINK_CHUNK_IO ) io_option = H5D_MULTI_CHUNK_IO ;/* We can not do this with one chunk IO. */
+#endif
+
+ /* step 2: Go ahead to do IO.*/
+ if(io_option == H5D_ONE_LINK_CHUNK_IO) {
+ if(H5D_link_chunk_collective_io(io_info,fm,buf,do_write,sum_chunk)<0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish linked chunk MPI-IO");
+ }
+ else {/* multiple chunk IOs */
+ printf("coming into multiple chunk \n");
+ if(H5D_multi_chunk_collective_io(io_info,fm,buf,do_write)<0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish multiple chunk MPI-IO");
+ }
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_chunk_collective_io */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_link_chunk_collective_io
+ *
+ * Purpose: Routine for one collective IO with one MPI derived datatype to link with all chunks
+
+ 1. Sort the chunk address and chunk info
+ 2. Build up MPI derived datatype for each chunk
+ 3. Build up the final MPI derived datatype
+ 4. Use common collective IO routine to do MPI-IO
+
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+
+static herr_t
+H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf, hbool_t do_write,int sum_chunk)
+{
+
+
+ size_t src_type_size; /*size of source type */
+ size_t dst_type_size; /*size of destination type*/
+ hsize_t mpi_buf_extra_offset;
+ hsize_t mpi_file_extra_offset;
+ size_t mpi_buf_count;
+ size_t mpi_file_count;
+ hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */
+ mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */
+
+ int mpi_size,mpi_code; /* MPI return code */
+
+ int i,num_chunk,total_chunks;
+ size_t ori_num_chunk;
+ hsize_t ori_total_chunks;
+ haddr_t chunk_base_addr;
+ haddr_t* total_chunk_addr_array;
+ MPI_Datatype *chunk_mtype;
+ MPI_Datatype *chunk_ftype;
+ MPI_Datatype chunk_final_mtype;
+ MPI_Datatype chunk_final_ftype;
+ MPI_Aint *chunk_disp_array;
+ MPI_Aint *chunk_mem_disp_array;
+ int *blocklen;
+ int blocklen_value;
+ int actual_bsearch_coll_chunk_threshold;
+ int bsearch_coll_chunk_threshold;
+ int bsearch_chunk_ratio;
+ int bsearch_chunk_threshold;
+ int many_chunk_opt = 0;
+
+ H5D_common_coll_info_t coll_info;
+ H5D_chunk_addr_info_t* chunk_addr_info_array;
+
+#ifdef CC_PERF
+ char *bc_percent = NULL;
+ char *bcc_percent = NULL;
+#endif
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_link_chunk_collective_io)
+ ori_total_chunks = fm->total_chunks;
+ H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int);
+
+ /* Handle with a special case when only one chunk is covered by all processes */
+ if(total_chunks == 1){
+ H5SL_node_t *chunk_node;
+ H5D_chunk_info_t *chunk_info;
+ H5D_storage_t store;
+
+ chunk_node = H5SL_first(fm->fsel);
+ if(chunk_node == NULL) {
+ if(H5D_istore_chunkmap(io_info,total_chunks,&chunk_base_addr,fm->down_chunks)<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
+ if(H5D_inter_collective_io(io_info,NULL,NULL,chunk_base_addr,buf,do_write)<0)
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
+ }
+ else {
+ if(NULL ==(chunk_info = H5SL_item(chunk_node)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
+ io_info->store = &store;
+ store.chunk.offset = chunk_info->coords;
+ store.chunk.index = chunk_info->index;
+
+ if(HADDR_UNDEF==(chunk_base_addr = H5D_istore_get_addr(io_info,NULL)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
+
+#ifdef KENT
+printf("before inter_collective_io for total chunk = 1 \n");
+#endif
+ if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,chunk_base_addr,buf,do_write)<0)
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
+ }
+ goto done;
+ }
+
+ /* Allocate chunking information */
+
+ ori_num_chunk = H5SL_count(fm->fsel);
+ H5_ASSIGN_OVERFLOW(num_chunk,ori_num_chunk,size_t,int);
+
+
+ if(num_chunk == 0) total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks);
+ else
+ {
+ chunk_addr_info_array= H5MM_malloc(num_chunk*sizeof(H5D_chunk_addr_info_t));
+ chunk_mtype = H5MM_malloc(num_chunk*sizeof(MPI_Datatype));
+ chunk_ftype = H5MM_malloc(num_chunk*sizeof(MPI_Datatype));
+ chunk_disp_array = H5MM_malloc(num_chunk*sizeof(MPI_Aint));
+ chunk_mem_disp_array = H5MM_calloc(num_chunk*sizeof(MPI_Aint));
+ blocklen = H5MM_malloc(num_chunk*sizeof(int));
+ }
+
+ /* Obtain information to do collective IO,
+ in order to do collective IO, no datatype conversion should happen. */
+ if((src_type_size = H5T_get_size(io_info->dset->shared->type))==0)
+ HGOTO_ERROR(H5E_DATATYPE, H5E_BADSIZE, FAIL, "datatype size invalid");
+ dst_type_size = src_type_size;
+
+
+#ifdef CC_PERF
+ /* "bcc" means 'b-tree iterately obtain all chunk addresses collectively',
+ "bc" means 'b-tree iterately obtain all chunk addresses individually',
+ the default one means 'obtaining the chunk address individually',
+ */
+
+ if(bcc_percent=getenv("BCC_PERCENT")){
+ bsearch_coll_chunk_threshold = atoi(bcc_percent);
+ assert((bsearch_coll_chunk_threshold >=0) &&(bsearch_coll_chunk_threshold <=100));
+ }
+ else
+ bsearch_coll_chunk_threshold = H5D_ALL_CHUNK_ADDR_THRES_COL;
+#else
+ bsearch_coll_chunk_threshold = H5D_ALL_CHUNK_ADDR_THRES_COL; /*This number may be changed according to the performance study */
+#endif
+
+ if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
+ HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
+
+ /* Calculate the actual threshold to obtain all chunk addresses collectively
+ The bigger this number is, the more possible the use of obtaining chunk address collectively. */
+ actual_bsearch_coll_chunk_threshold = sum_chunk*100/(total_chunks*mpi_size);
+
+ if(actual_bsearch_coll_chunk_threshold >= bsearch_coll_chunk_threshold)
+ many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_COL;
+
+ else {
+
+#ifdef CC_PERF
+ if(bc_percent=getenv("BC_PERCENT")){
+ bsearch_chunk_ratio = atoi(bc_percent);
+ assert((bsearch_chunk_ratio<=100)&&(bsearch_chunk_ratio>=0));
+ }
+ else
+ bsearch_chunk_ratio = H5D_ALL_CHUNK_ADDR_THRES_IND;
+#else
+ bsearch_chunk_ratio = H5D_ALL_CHUNK_ADDR_THRES_IND; /*This number may be changed according to the performance study */
+#endif
+
+ /* This threshold is to check whether we can use iterator to obtain all chunk addresses.
+ The unit of the threshold is the number of chunks. The value should be at least 1.
+ It can be calculated as follows:
+
+ if(total_chunks*bsearch_chunk_ratio/100 <=1)
+ bsearch_chunk_threahold = 1;
+ else
+ bsearch_chunk_threshold = total_chunks*bsearch_chunk_ratio/100;
+ In order to make the caluculation more efficient,
+ we use the following approximate formula to calculate the threshold.
+
+ bsearch_chunk_threshold = 1+ (total_chunks*bsearch_chunk_ratio-99)/100;
+
+ The only difference is when total_chunks* besearch_chunk_ratio == 100n+99;
+ the approximate formula will give value (n+1) instead of n for threshold.
+ That shouldn't matter much from our persective.
+ */
+
+ bsearch_chunk_threshold = 1 +(total_chunks*bsearch_chunk_ratio-99)/100;
+ if(num_chunk > bsearch_chunk_threshold) many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_IND;
+ }
+
+ /* Sort the chunk address
+ when chunk optimization selection is either H5D_OBTAIN_*/
+ if(num_chunk == 0){
+ if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array,fm->down_chunks)<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
+ chunk_base_addr = total_chunk_addr_array[0];
+ }
+
+ else {
+ if(H5D_sort_chunk(io_info,fm,chunk_addr_info_array,many_chunk_opt)<0)
+ HGOTO_ERROR (H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to sort chunk address");
+ chunk_base_addr = chunk_addr_info_array[0].chunk_addr;
+ }
+
+ /* Obtain MPI derived datatype from all individual chunks */
+ for ( i = 0; i < num_chunk; i++) {
+ /* Disk MPI derived datatype */
+ if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.fspace,src_type_size,&chunk_ftype[i],
+ &mpi_file_count,&mpi_file_extra_offset,&mft_is_derived)<0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type");
+
+ /* Buffer MPI derived datatype */
+ if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.mspace,dst_type_size,&chunk_mtype[i],
+ &mpi_buf_count,&mpi_buf_extra_offset,&mbt_is_derived)<0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buf type");
+
+ /* Chunk address relative to the first chunk */
+ chunk_addr_info_array[i].chunk_addr -= chunk_base_addr;
+ H5_ASSIGN_OVERFLOW(chunk_disp_array[i],chunk_addr_info_array[i].chunk_addr,haddr_t,MPI_Aint);
+ /*chunk_disp_array[i] = (MPI_Aint)chunk_addr_array[i];*/
+ }
+
+ blocklen_value = 1;
+ if(num_chunk){
+
+ /* initialize the buffer with the constant value 1 */
+ H5V_array_fill(blocklen,&blocklen_value,sizeof(int),(size_t)num_chunk);
+
+ /* Create final MPI derived datatype */
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_struct(num_chunk,blocklen,chunk_disp_array,chunk_ftype,&chunk_final_ftype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code);
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_ftype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
+
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_struct(num_chunk,blocklen,chunk_mem_disp_array,chunk_mtype,&chunk_final_mtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code);
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_mtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
+
+ for ( i = 0; i< num_chunk;i++){
+ if (MPI_SUCCESS != (mpi_code= MPI_Type_free( chunk_mtype+i )))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code= MPI_Type_free( chunk_ftype+i )))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+ }
+
+ /* buffer, file derived datatypes should be true */
+ coll_info.mbt_is_derived = 1;
+ coll_info.mft_is_derived = 1;
+ coll_info.mpi_buf_count = 1;
+ coll_info.chunk_addr = chunk_base_addr;
+
+ }
+
+ else {/* no selection at all for this process */
+ chunk_final_ftype = MPI_BYTE;
+ chunk_final_mtype = MPI_BYTE;
+
+ /* buffer, file derived datatypes should be true */
+ coll_info.mbt_is_derived = 0;
+ coll_info.mft_is_derived = 0;
+ coll_info.mpi_buf_count = 0;
+ coll_info.chunk_addr = chunk_base_addr;
+ }
+ if(H5D_final_collective_io(io_info,&chunk_final_ftype,&chunk_final_mtype,&coll_info,buf,do_write)<0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish MPI-IO");
+
+ done:
+ if(num_chunk == 0) HDfree(total_chunk_addr_array);
+ else {
+ if (fm->total_chunks != 1) {
+ HDfree(chunk_addr_info_array);
+ HDfree(chunk_mtype);
+ HDfree(chunk_ftype);
+ HDfree(chunk_disp_array);
+ HDfree(chunk_mem_disp_array);
+ HDfree(blocklen);
+ }
+ }
+
+ FUNC_LEAVE_NOAPI(ret_value)
+}
+/* end H5D_link_chunk_collective_io */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_multi_chunk_collective_io
+ *
+ * Purpose: To do IO per chunk according to IO mode(collective/independent/none)
+
+ 1. Use MPI_gather and MPI_Bcast to obtain IO mode in each chunk(collective/independent/none)
+ 2. Depending on whether the IO mode is collective or independent or none,
+ Create either MPI derived datatype for each chunk or just do independent IO
+ 3. Use common collective IO routine to do MPI-IO
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf, hbool_t do_write)
+{
+
+ int i,total_chunk;
+ int mpi_rank;
+ hsize_t ori_total_chunk;
+ uint8_t *chunk_io_option;
+
+ H5SL_node_t *chunk_node; /* Current node in chunk skip list */
+ H5D_chunk_info_t *chunk_info;
+ haddr_t *chunk_addr;
+ H5D_storage_t store; /* union of EFL and chunk pointer in file space */
+ hbool_t select_chunk;
+ hbool_t last_io_mode_coll = TRUE;
+ herr_t ret_value = SUCCEED;
+
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_multi_chunk_collective_io)
+#ifdef KENT
+ mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file);
+#endif
+ /* Allocate memories */
+ ori_total_chunk = fm->total_chunks;
+ H5_ASSIGN_OVERFLOW(total_chunk,ori_total_chunk,hsize_t,int);
+ HDassert(total_chunk!=0);
+ chunk_io_option = (uint8_t *)H5MM_calloc(total_chunk*sizeof(MPI_BYTE));
+ chunk_addr = (haddr_t *)H5MM_calloc(total_chunk*sizeof(haddr_t));
+#ifdef KENT
+ printf("total_chunk %d\n",total_chunk);
+#endif
+
+ /* obtain IO option for each chunk */
+ if(H5D_obtain_mpio_mode(io_info,fm,chunk_io_option,chunk_addr)<0)
+ HGOTO_ERROR (H5E_DATASET, H5E_CANTRECV, FAIL, "unable to obtain MPIO mode");
+
+ for( i = 0; i<total_chunk;i++){
+
+#ifdef KENT
+printf("mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
+#endif
+ select_chunk = fm->select_chunk[i];
+ if(select_chunk == 1){/* Have selection elements in this chunk. Find the chunk info. */
+#ifdef NEW_WAY
+ if(NULL ==(chunk_info = H5SL_item(chunk_node)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
+ HDassert(chunk_info->index == i);
+
+ /* Set dataset storage for I/O info */
+ io_info->store=&store;
+ /* Pass in chunk's coordinates in a union. */
+ store.chunk.offset = chunk_info->coords;
+ store.chunk.index = chunk_info->index;
+ if(NULL ==(chunk_node = H5SL_first(fm->fsel)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list");
+#else
+
+ if(NULL ==(chunk_node = H5SL_first(fm->fsel)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list");
+ while(chunk_node){
+
+ if(NULL ==(chunk_info = H5SL_item(chunk_node)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
+ if(chunk_info->index == i) {
+ /* Set dataset storage for I/O info */
+ io_info->store=&store;
+ /* Pass in chunk's coordinates in a union. */
+ store.chunk.offset = chunk_info->coords;
+ store.chunk.index = chunk_info->index;
+ break;
+ }
+
+ chunk_node = H5SL_next(chunk_node);
+ }
+#endif
+ }
+
+ if(chunk_io_option[i] == 1){ /*collective IO for this chunk,
+ note: even there is no selection for this process,
+ the process still needs to contribute MPI NONE TYPE.*/
+#ifdef KENT
+printf("inside collective chunk IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
+#endif
+
+ if(!last_io_mode_coll)
+ /* Switch back to collective I/O */
+ if(H5D_ioinfo_make_coll(io_info) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
+
+ if(select_chunk){
+ if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,
+ chunk_addr[i],buf,do_write )<0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
+
+ }
+ else{
+ if(H5D_inter_collective_io(io_info,NULL,NULL,
+ chunk_addr[i],buf,do_write )<0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
+
+ }
+
+
+ last_io_mode_coll = TRUE;
+
+ }
+ else {/*possible independent IO for this chunk*/
+#ifdef KENT
+printf("inside independent IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
+#endif
+
+ HDassert(chunk_io_option[i] == 0);
+ if(!select_chunk) continue; /* this process has nothing to do with this chunk, continue! */
+ if(last_io_mode_coll)
+ /* Switch to independent I/O */
+ if(H5D_ioinfo_make_ind(io_info) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
+
+ if(do_write) {
+ ret_value = (io_info->ops.write)(io_info,
+ chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type),
+ chunk_info->fspace,chunk_info->mspace,0,
+ buf);
+ /* Check return value of the write */
+ if (ret_value<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+ }
+ else {
+ ret_value = (io_info->ops.read)(io_info,
+ chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type),
+ chunk_info->fspace,chunk_info->mspace,0,
+ buf);
+ /* Check return value from optimized write */
+ if (ret_value<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+ }
+
+ last_io_mode_coll = FALSE;
+ }
+ }
+
+ if(!last_io_mode_coll)
+ /* Switch back to collective I/O */
+ if(H5D_ioinfo_make_coll(io_info) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
+
+ done:
+ HDfree(chunk_io_option);
+ HDfree(chunk_addr);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_multi_chunk_collective_io */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_inter_collective_io
+ *
+ * Purpose: Routine for the shared part of collective IO between multiple chunk
+ collective IO and contiguous collective IO
+
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space,const H5S_t *mem_space,
+ haddr_t addr, void *buf, hbool_t do_write )
+{
+
+ size_t mpi_buf_count, mpi_file_count; /* Number of "objects" to transfer */
+ MPI_Datatype mpi_file_type,mpi_buf_type;
+ hsize_t mpi_buf_offset, mpi_file_offset; /* Offset within dataset where selection (ie. MPI type) begins */
+ hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */
+ mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */
+ H5D_common_coll_info_t coll_info;
+ herr_t ret_value = SUCCEED; /* return value */
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_inter_collective_io)
+ if((file_space!=NULL) && (mem_space != NULL)) {
+ /*Obtain disk and memory MPI derived datatype */
+ if(H5S_mpio_space_type(file_space,H5T_get_size(io_info->dset->shared->type),
+ &mpi_file_type,&mpi_file_count,&mpi_file_offset,&mft_is_derived)<0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type");
+
+ if(H5S_mpio_space_type(mem_space,H5T_get_size(io_info->dset->shared->type),
+ &mpi_buf_type,&mpi_buf_count,&mpi_buf_offset,&mbt_is_derived)<0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buffer type");
+
+ }
+ else {
+ /* For non-selection, participate with a none MPI derived datatype, the count is 0. */
+ mpi_buf_type = MPI_BYTE;
+ mpi_file_type = MPI_BYTE;
+ mpi_file_count = 0;
+ mpi_buf_count = 0;
+ }
+
+ coll_info.mbt_is_derived = mbt_is_derived;
+ coll_info.mft_is_derived = mft_is_derived;
+ coll_info.mpi_buf_count = mpi_buf_count;
+ coll_info.chunk_addr = addr;
+
+#ifdef KENT
+printf("before final collective IO\n");
+#endif
+ if(H5D_final_collective_io(io_info,&mpi_file_type,&mpi_buf_type,&coll_info,buf,do_write)<0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish collective MPI-IO");
+ done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_inter_collective_io */
+/*-------------------------------------------------------------------------
+ * Function: H5D_final_collective_io
+ *
+ * Purpose: Routine for the common part of collective IO with different storages.
+
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Datatype *mpi_buf_type,
+ H5D_common_coll_info_t* coll_info, void *buf, hbool_t do_write)
+{
+
+
+ int mpi_code; /* MPI return code */
+ hbool_t plist_is_setup=0; /* Whether the dxpl has been customized */
+ herr_t ret_value = SUCCEED;
+
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_final_collective_io)
+
+ /*
+ * Pass buf type, file type to the file driver.
+ */
+
+ if(H5FD_mpi_setup_collective(io_info->dxpl_id, *mpi_buf_type, *mpi_file_type)<0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties");
+
+ plist_is_setup=1;
+ /*HDfprintf(stdout,"chunk addr %Hu\n",coll_info->chunk_addr);
+ printf("mpi_buf_count %d\n",coll_info->mpi_buf_count); */
+ if(do_write) {
+ ret_value = (io_info->ops.write)(io_info,
+ coll_info->mpi_buf_count,0,NULL,NULL,coll_info->chunk_addr,
+ buf);
+ /* Check return value from optimized write */
+ if (ret_value<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+ }
+ else {
+ ret_value = (io_info->ops.read)(io_info,
+ coll_info->mpi_buf_count,0,NULL,NULL,coll_info->chunk_addr,
+ buf);
+ /* Check return value from optimized write */
+ if (ret_value<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+ }
+ done:
+
+ /* Reset the dxpl settings */
+ if(plist_is_setup) {
+ if(H5FD_mpi_teardown_collective(io_info->dxpl_id)<0)
+ HDONE_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "unable to reset dxpl values");
+ } /* end if */
+
+ /* free the MPI buf and file types */
+ if (coll_info->mbt_is_derived) {
+ if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_buf_type )))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+
+ }
+ if (coll_info->mft_is_derived) {
+ if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_file_type )))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+ }
+
+ FUNC_LEAVE_NOAPI(ret_value)
+}/* end H5D_final_collective_io */
+
+#ifdef OLD_WAY
+/*-------------------------------------------------------------------------
+ * Function: H5D_pre_sort_chunk
+ *
+ * Purpose: Routine to obtain addresses of all chunks for all processes
+
+ Description:
+ root will collective all chunk addresses and broadcast towards other processes.
+
+ Parameters:
+
+ Input: H5D_io_info_t* io_info,
+ int total_chunks,
+ Output: haddr_t total_chunk_addr_array[], : array to store sorted total chunk addresses
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D_pre_sort_chunk(H5D_io_info_t *io_info,int total_chunks,haddr_t total_chunk_addr_array[]){
+
+ int root, mpi_rank;
+ MPI_Comm comm;
+ int mpi_code;
+ MPI_Datatype chunk_addrtype;
+ int mpi_type_cleanup = 0;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_pre_sort_chunk)
+
+ root = 0;
+ comm = io_info->comm;
+ if(MPI_SUCCESS !=(mpi_code = MPI_Comm_rank(comm,&mpi_rank)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+
+ /*Create received MPI derived datatype */
+ if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(sizeof(haddr_t)*total_chunks,MPI_BYTE,&chunk_addrtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code);
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_addrtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
+
+
+ mpi_type_cleanup = 1;
+
+ if(mpi_rank == root) {
+ if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array)<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
+
+ }
+ /* Broadcasting the MPI_IO option info. and chunk address info. */
+ if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(total_chunk_addr_array,1,chunk_addrtype,root,comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code);
+
+done:
+
+ if(mpi_type_cleanup){
+ if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &chunk_addrtype )))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+ }
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_pre_sort_chunk() */
+
+#endif
+/*-------------------------------------------------------------------------
+ * Function: H5D_sort_chunk
+ *
+ * Purpose: Routine to sort chunks in increasing order of chunk address
+ Each chunk address is also obtained.
+
+ Description:
+ For most cases, the chunk address has already been sorted in increasing order.
+ The special sorting flag is used to optimize this common case.
+ quick sort is used for necessary sorting.
+
+ Parameters:
+ Input: H5D_io_info_t* io_info,
+ fm_map *fm(global chunk map struct)
+ Input/Output: H5D_chunk_addr_info_t chunk_addr_info_array[] : array to store chunk address and information
+ many_chunk_opt : flag to optimize the way to obtain chunk addresses
+ for many chunks
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+
+static herr_t
+H5D_sort_chunk(H5D_io_info_t * io_info,
+ fm_map *fm,
+ H5D_chunk_addr_info_t chunk_addr_info_array[],
+ int many_chunk_opt)
+{
+
+
+ H5SL_node_t *chunk_node; /* Current node in chunk skip list */
+ H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */
+ haddr_t chunk_addr; /* Current chunking address of this node */
+ haddr_t *total_chunk_addr_array; /* The array of chunk address for the total number of chunk */
+ int i,j,k,mpi_code;
+ int total_chunks;
+ size_t num_chunks;
+ int mpi_type_cleanup = 0;
+ int tchunk_addr_cleanup = 0;
+ MPI_Datatype chunk_addrtype;
+ H5D_storage_t store; /*union of EFL and chunk pointer in file space */
+ hbool_t do_sort = FALSE;
+ herr_t ret_value = SUCCEED; /*return value */
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_sort_chunk)
+
+ num_chunks = H5SL_count(fm->fsel);
+ /* If we need to optimize the way to obtain the chunk address */
+ if(many_chunk_opt != H5D_OBTAIN_ONE_CHUNK_ADDR_IND){
+
+ total_chunks = (int)fm->total_chunks;
+ total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks);
+ tchunk_addr_cleanup = 1;
+
+ if(many_chunk_opt == H5D_OBTAIN_ALL_CHUNK_ADDR_COL) {/* We will broadcast the array from the root process */
+
+ int mpi_rank, root,mpi_code;
+ root = 0;
+ if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file))<0)
+ HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank");
+
+ /*Create received MPI derived datatype */
+ if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(sizeof(haddr_t)*total_chunks,MPI_BYTE,&chunk_addrtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code);
+ if(MPI_SUCCESS !=(mpi_code = MPI_Type_commit(&chunk_addrtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
+
+ mpi_type_cleanup = 1;
+
+ if(mpi_rank == root) {
+ if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array,fm->down_chunks)<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
+ }
+ /* Broadcasting the MPI_IO option info. and chunk address info. */
+ if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(total_chunk_addr_array,1,chunk_addrtype,root,io_info->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code);
+ }
+
+ else { /* Obtain all chunk addresses independently */
+ if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array,fm->down_chunks)<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
+ }
+ }
+
+ /* Get first node in chunk skip list */
+ if(NULL ==(chunk_node = H5SL_first(fm->fsel)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list");
+ /* Set dataset storage for I/O info */
+ io_info->store = &store;
+ if(NULL ==(chunk_info = H5SL_item(chunk_node)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
+ store.chunk.offset = chunk_info->coords;
+ store.chunk.index = chunk_info->index;
+ i = 0;
+ if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){
+ if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
+ }
+ else
+ chunk_addr = total_chunk_addr_array[chunk_info->index];
+ chunk_addr_info_array[i].chunk_addr = chunk_addr;
+ chunk_addr_info_array[i].chunk_info = *chunk_info;
+
+ chunk_node = H5SL_next(chunk_node);
+ while(chunk_node) {
+
+ chunk_info = H5SL_item(chunk_node);
+ store.chunk.offset = chunk_info->coords;
+ store.chunk.index = chunk_info->index;
+
+ if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){
+ if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
+ }
+ else
+ chunk_addr = total_chunk_addr_array[chunk_info->index];
+
+ if(chunk_addr < chunk_addr_info_array[i].chunk_addr) do_sort = TRUE;
+ chunk_addr_info_array[i+1].chunk_addr = chunk_addr;
+ chunk_addr_info_array[i+1].chunk_info =*chunk_info;
+ i++;
+ chunk_node = H5SL_next(chunk_node);
+ }
+
+ if(do_sort)
+ HDqsort(chunk_addr_info_array,num_chunks,sizeof(chunk_addr_info_array),H5D_cmp_chunk_addr);
+
+done:
+
+ if(tchunk_addr_cleanup)
+ HDfree(total_chunk_addr_array);
+ if(mpi_type_cleanup) {
+ if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &chunk_addrtype )))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+ }
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_sort_chunk() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_obtain_mpio_mode
+ *
+ * Purpose: Routine to obtain each io mode(collective,independent or none) for each chunk;
+ Each chunk address is also obtained.
+
+ Description:
+
+ 1) Each process provides two piece of information for all chunks with selection
+ a) chunk index
+ b) wheather this chunk is regular(for MPI derived datatype not working case)
+
+ 2) Gather all the information to the root process
+
+ 3) Root process will do the following:
+ a) Obtain chunk address for all chunks in this data space
+ b) With the consideration of the user option, calculate IO mode for each chunk
+ c) Build MPI derived datatype to combine "chunk address" and "assign_io" information
+ in order to do MPI Bcast only once
+ d) MPI Bcast the IO mode and chunk address information for each chunk.
+ 4) Each process then retrieves IO mode and chunk address information to assign_io_mode and chunk_addr.
+
+ Parameters:
+
+ Input: H5D_io_info_t* io_info,
+ fm_map *fm,(global chunk map struct)
+ Output: uint8_t assign_io_mode[], : IO mode, collective, independent or none
+ haddr_t chunk_addr[], : chunk address array for each chunk
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+
+static herr_t
+H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
+ fm_map *fm,
+ uint8_t assign_io_mode[],
+ haddr_t chunk_addr[])
+{
+
+ int total_chunks;
+ hsize_t ori_total_chunks;
+ int percent_nproc_per_chunk,threshold_nproc_per_chunk;
+ uint8_t* io_mode_info;
+ uint8_t* recv_io_mode_info;
+ uint8_t* mergebuf;
+ uint8_t* tempbuf;
+
+ H5SL_node_t* chunk_node;
+ H5D_chunk_info_t* chunk_info;
+
+ MPI_Datatype bastype[2];
+ MPI_Datatype chunk_addrtype;
+ int bascount;
+ int basblock[2];
+ MPI_Aint basdisp[2];
+ MPI_Datatype rtype;
+ MPI_Datatype stype;
+ int mpi_size,mpi_rank;
+ MPI_Comm comm;
+ int root;
+ int mpi_code;
+ int multi_chunk_io_col_threshold;
+ int mem_cleanup = 0,
+ mpi_type_cleanup = 0;
+
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_obtain_mpio_mode)
+
+ /* Assign the rank 0 to the root */
+ root = 0;
+ comm = io_info->comm;
+
+ /* Obtain the number of process and the current rank of the process */
+ if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file))<0)
+ HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank");
+ if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
+ HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
+ multi_chunk_io_col_threshold = H5D_MULTI_CHUNK_IO_COL_THRESHOLD; /* May replace by user-input */
+ percent_nproc_per_chunk = multi_chunk_io_col_threshold;/* For example, above 50%, do collective IO */
+ threshold_nproc_per_chunk = mpi_size * percent_nproc_per_chunk/100;
+
+ /* Allocate memory */
+ ori_total_chunks = fm->total_chunks;
+ H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int);
+
+ io_mode_info = (uint8_t *)H5MM_calloc(total_chunks*sizeof(MPI_BYTE));
+ mergebuf = H5MM_malloc((sizeof(haddr_t)+sizeof(MPI_BYTE))*total_chunks);
+ tempbuf = mergebuf + sizeof(MPI_BYTE)*total_chunks;
+ if(mpi_rank == root)
+ recv_io_mode_info = (uint8_t *)H5MM_malloc(total_chunks*sizeof(MPI_BYTE)*mpi_size);
+
+
+ mem_cleanup = 1;
+
+ chunk_node = H5SL_first(fm->fsel);
+
+ /*Obtain the regularity and selection information for all chunks in this process. */
+ while(chunk_node){
+
+ chunk_info = H5SL_item(chunk_node);
+
+#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+ /* regularity information: 1, selection information: 2 */
+ if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE &&
+ H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE)
+#endif
+ io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_REG; /* this chunk is selected and is "regular" without defining H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS. */
+#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+ else
+ io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_IRREG; /* this chunk is selected and is irregular*/
+#endif
+
+ chunk_node = H5SL_next(chunk_node);
+ }
+
+ /*Create sent MPI derived datatype */
+ if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(total_chunks,MPI_BYTE,&stype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&stype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
+
+ /*Create received basic MPI derived datatype */
+ bascount = 2;
+ basblock[0] = total_chunks;
+ basblock[1] = total_chunks;
+ basdisp[0] = 0;
+ basdisp[1] = (MPI_Aint)(sizeof(MPI_BYTE)*total_chunks);/* may need to check overflow */
+ bastype[0] = MPI_BYTE;
+
+ if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(sizeof(haddr_t),MPI_BYTE,&chunk_addrtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code);
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_addrtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
+ bastype[1] = chunk_addrtype;
+
+ if(MPI_SUCCESS !=(mpi_code = MPI_Type_struct(bascount,basblock,basdisp,bastype,&rtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code);
+ if(MPI_SUCCESS !=(mpi_code = MPI_Type_commit(&rtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
+
+ /* Set up a flag to clean up the MPI derived datatype later */
+ mpi_type_cleanup = 1;
+
+ /*Gather all the information */
+ if(MPI_SUCCESS !=(mpi_code = MPI_Gather(io_mode_info,1,stype,recv_io_mode_info,1,stype,root,comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code);
+
+ /* Calculate the mode for IO(collective, independent or none) at root process */
+ if(mpi_rank == root) {
+
+ int ic,nproc;
+ int* nproc_per_chunk;
+#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
+ int* ind_this_chunk;
+#endif
+
+ /* pre-computing: calculate number of processes and
+ regularity of the selection occupied in each chunk */
+ nproc_per_chunk = (int*)H5MM_calloc(total_chunks*sizeof(int));
+#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
+ ind_this_chunk = (int*)H5MM_calloc(total_chunks*sizeof(int));
+#endif
+
+ /* calculating the chunk address */
+ if(H5D_istore_chunkmap(io_info,total_chunks,chunk_addr,fm->down_chunks)<0){
+ HDfree(nproc_per_chunk);
+#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
+ HDfree(ind_this_chunk);
+#endif
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
+ }
+
+ /* checking for number of process per chunk and regularity of the selection*/
+ for (nproc = 0;nproc <mpi_size;nproc++){
+ uint8_t *tmp_recv_io_mode_info = recv_io_mode_info + nproc*total_chunks;
+ /* calculate the number of process per chunk and adding irregular selection option */
+ for(ic = 0; ic < total_chunks; ic++, tmp_recv_io_mode_info++){
+ if(*tmp_recv_io_mode_info != 0) {
+ nproc_per_chunk[ic]++;
+#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+ if(*tmp_recv_io_mode_info == H5D_CHUNK_SELECT_IRREG)
+ ind_this_chunk[ic] = 1;
+#endif
+ }
+#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS
+ else {
+ /*checking whether we have a selection in this chunk */
+ ind_this_chunk[ic] = 1;
+ }
+#endif
+ }
+
+ }
+
+ /* Calculating MPIO mode for each chunk (collective, independent, none) */
+ for(ic = 0; ic < total_chunks; ic++){
+ if(nproc_per_chunk[ic]>=MAX(2,threshold_nproc_per_chunk)){
+#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
+ if(!ind_this_chunk[ic]) assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL;
+#else
+ assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL;
+#endif
+ }
+
+ }
+
+ /* merge buffer io_mode info and chunk addr into one */
+ HDmemcpy(mergebuf,assign_io_mode,sizeof(MPI_BYTE)*total_chunks);
+ HDmemcpy(tempbuf,chunk_addr,sizeof(haddr_t)*total_chunks);
+
+ HDfree(nproc_per_chunk);
+#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
+ HDfree(ind_this_chunk);
+#endif
+ }
+
+ /* Broadcasting the MPI_IO option info. and chunk address info. */
+ if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(mergebuf,1,rtype,root,comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code);
+
+ HDmemcpy(assign_io_mode,mergebuf,sizeof(MPI_BYTE)*total_chunks);
+ HDmemcpy(chunk_addr,tempbuf,sizeof(haddr_t)*total_chunks);
+
+done:
+
+ if(mpi_type_cleanup) {
+ if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &chunk_addrtype )))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &stype )))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &rtype )))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+ }
+
+ if(mem_cleanup){
+ HDfree(io_mode_info);
+ HDfree(mergebuf);
+ if(mpi_rank == root)
+ HDfree(recv_io_mode_info);
+ }
+
+ FUNC_LEAVE_NOAPI(ret_value)
+ }/* end H5D_obtain_mpio_mode*/
+
+static int
+H5D_cmp_chunk_addr(const void *chunk_addr_info1, const void *chunk_addr_info2)
+{
+ haddr_t addr1, addr2;
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_cmp_chunk_addr)
+
+ addr1 = ((const H5D_chunk_addr_info_t *)chunk_addr_info1)->chunk_addr;
+ addr2 = ((const H5D_chunk_addr_info_t *)chunk_addr_info2)->chunk_addr;
+
+ FUNC_LEAVE_NOAPI(H5F_addr_cmp(addr1, addr2))
+
+ }
#endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5Dpkg.h b/src/H5Dpkg.h
index 97e7763..df30800 100644
--- a/src/H5Dpkg.h
+++ b/src/H5Dpkg.h
@@ -34,7 +34,8 @@
#include "H5Gprivate.h" /* Groups */
#include "H5Oprivate.h" /* Object headers */
#include "H5Sprivate.h" /* Dataspaces */
-#include "H5Tprivate.h" /* Datatypes */
+#include "H5Tprivate.h" /* Datatype functions */
+#include "H5SLprivate.h" /* Skip lists */
/**************************/
/* Package Private Macros */
@@ -67,14 +68,14 @@
struct H5D_io_info_t;
typedef herr_t (*H5D_io_read_func_t)(struct H5D_io_info_t *io_info,
size_t nelmts, size_t elmt_size,
- const H5S_t *file_space, const H5S_t *mem_space,
+ const H5S_t *file_space, const H5S_t *mem_space,haddr_t addr,
void *buf/*out*/);
/* Write directly from app buffer to file */
typedef herr_t (*H5D_io_write_func_t)(struct H5D_io_info_t *io_info,
size_t nelmts, size_t elmt_size,
- const H5S_t *file_space, const H5S_t *mem_space,
+ const H5S_t *file_space, const H5S_t *mem_space,haddr_t addr,
const void *buf);
/* Function pointers for I/O on particular types of dataset layouts */
@@ -187,6 +188,39 @@ typedef enum {
H5D_ALLOC_WRITE /* Dataset is being extended */
} H5D_time_alloc_t;
+
+/* Structure holding information about a chunk's selection for mapping */
+typedef struct H5D_chunk_info_t {
+ hsize_t index; /* "Index" of chunk in dataset */
+ size_t chunk_points; /* Number of elements selected in chunk */
+ H5S_t *fspace; /* Dataspace describing chunk & selection in it */
+ hsize_t coords[H5O_LAYOUT_NDIMS]; /* Coordinates of chunk in file dataset's dataspace */
+ H5S_t *mspace; /* Dataspace describing selection in memory corresponding to this chunk */
+ unsigned mspace_shared; /* Indicate that the memory space for a chunk is shared and shouldn't be freed */
+} H5D_chunk_info_t;
+
+/* Main structure holding the mapping between file chunks and memory */
+typedef struct fm_map {
+ H5SL_t *fsel; /* Skip list containing file dataspaces for all chunks */
+ hsize_t last_index; /* Index of last chunk operated on */
+ H5D_chunk_info_t *last_chunk_info; /* Pointer to last chunk's info */
+ const H5S_t *file_space; /* Pointer to the file dataspace */
+ const H5S_t *mem_space; /* Pointer to the memory dataspace */
+ unsigned mem_space_copy; /* Flag to indicate that the memory dataspace must be copied */
+ hsize_t f_dims[H5O_LAYOUT_NDIMS]; /* File dataspace dimensions */
+ H5S_t *mchunk_tmpl; /* Dataspace template for new memory chunks */
+ unsigned f_ndims; /* Number of dimensions for file dataspace */
+ H5S_sel_iter_t mem_iter; /* Iterator for elements in memory selection */
+ unsigned m_ndims; /* Number of dimensions for memory dataspace */
+ hsize_t chunks[H5O_LAYOUT_NDIMS]; /* Number of chunks in each dimension */
+ hsize_t chunk_dim[H5O_LAYOUT_NDIMS]; /* Size of chunk in each dimension */
+ hsize_t down_chunks[H5O_LAYOUT_NDIMS]; /* "down" size of number of chunks in each dimension */
+ H5O_layout_t *layout; /* Dataset layout information*/
+ H5S_sel_type msel_type; /* Selection type in memory */
+ hsize_t total_chunks; /* Number of total chunks */
+ hbool_t *select_chunk; /* store the information about whether this chunk is selected or not */
+} fm_map;
+
/*****************************/
/* Package Private Variables */
/*****************************/
@@ -215,10 +249,12 @@ H5_DLL size_t H5D_select_mgath (const void *_buf,
H5_DLL herr_t H5D_select_read(H5D_io_info_t *io_info,
size_t nelmts, size_t elmt_size,
const H5S_t *file_space, const H5S_t *mem_space,
+ haddr_t addr,
void *buf/*out*/);
H5_DLL herr_t H5D_select_write(H5D_io_info_t *io_info,
size_t nelmts, size_t elmt_size,
const H5S_t *file_space, const H5S_t *mem_space,
+ haddr_t addr,
const void *buf/*out*/);
/* Functions that operate on contiguous storage */
@@ -296,42 +332,31 @@ H5_DLL ssize_t H5D_efl_writevv(const H5D_io_info_t *io_info,
H5_DLL herr_t H5D_mpio_select_read(H5D_io_info_t *io_info,
size_t nelmts, size_t elmt_size,
const struct H5S_t *file_space, const struct H5S_t *mem_space,
- void *buf/*out*/);
+ haddr_t addr,void *buf/*out*/);
/* MPI-IO function to read , it will select either regular or irregular read */
H5_DLL herr_t H5D_mpio_select_write(H5D_io_info_t *io_info,
size_t nelmts, size_t elmt_size,
const struct H5S_t *file_space, const struct H5S_t *mem_space,
- const void *buf);
-
-/* MPI-IO function to read directly from app buffer to file rky980813 */
-H5_DLL herr_t H5D_mpio_spaces_read(H5D_io_info_t *io_info,
- size_t nelmts, size_t elmt_size,
- const struct H5S_t *file_space, const struct H5S_t *mem_space,
- void *buf/*out*/);
-
-/* MPI-IO function to write directly from app buffer to file rky980813 */
-H5_DLL herr_t H5D_mpio_spaces_write(H5D_io_info_t *io_info,
- size_t nelmts, size_t elmt_size,
- const struct H5S_t *file_space, const struct H5S_t *mem_space,
- const void *buf);
-
-/* MPI-IO function to read directly from app buffer to file rky980813 */
-H5_DLL herr_t H5D_mpio_spaces_span_read(H5D_io_info_t *io_info,
- size_t nelmts, size_t elmt_size,
- const struct H5S_t *file_space, const struct H5S_t *mem_space,
- void *buf/*out*/);
-
-/* MPI-IO function to write directly from app buffer to file rky980813 */
-H5_DLL herr_t H5D_mpio_spaces_span_write(H5D_io_info_t *io_info,
- size_t nelmts, size_t elmt_size,
- const struct H5S_t *file_space, const struct H5S_t *mem_space,
- const void *buf);
-
+ haddr_t addr,const void *buf);
+
+/* MPI-IO function to handle contiguous collective IO */
+H5_DLL herr_t
+H5D_contig_collective_io(H5D_io_info_t *io_info,
+ const H5S_t *file_space,const H5S_t *mem_space,
+ void *_buf,hbool_t do_write);
+
+/* MPI-IO function to handle chunked collective IO */
+H5_DLL herr_t
+H5D_chunk_collective_io(H5D_io_info_t * io_info,fm_map *fm, void*buf,
+ hbool_t do_write);
/* MPI-IO function to check if a direct I/O transfer is possible between
* memory and the file */
H5_DLL htri_t H5D_mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *mem_space,
const H5S_t *file_space, const H5T_path_t *tpath);
+
+H5_DLL herr_t H5D_mpio_chunk_adjust_iomode(H5D_io_info_t *io_info,const fm_map *fm);
+
#endif /* H5_HAVE_PARALLEL */
/* Testing functions */
diff --git a/src/H5Dselect.c b/src/H5Dselect.c
index 9f98d9e..df9d32f 100644
--- a/src/H5Dselect.c
+++ b/src/H5Dselect.c
@@ -432,6 +432,7 @@ herr_t
H5D_select_read(H5D_io_info_t *io_info,
size_t nelmts, size_t elmt_size,
const H5S_t *file_space, const H5S_t *mem_space,
+ const haddr_t UNUSED addr,
void *buf/*out*/)
{
H5S_sel_iter_t mem_iter; /* Memory selection iteration info */
@@ -575,6 +576,7 @@ herr_t
H5D_select_write(H5D_io_info_t *io_info,
size_t nelmts, size_t elmt_size,
const H5S_t *file_space, const H5S_t *mem_space,
+ const haddr_t UNUSED addr,
const void *buf/*out*/)
{
H5S_sel_iter_t mem_iter; /* Memory selection iteration info */