author    rjzamora <rzamora@anl.gov>  2018-12-14 23:18:32 (GMT)
committer rjzamora <rzamora@anl.gov>  2018-12-14 23:18:32 (GMT)
commit    b5120771b1076ff32421b5d3cf71f76a25b2f8c7 (patch)
tree      4457f7972e1735fcfe405827b1047dffbaa24e8f
parent    7670f112caea0748574b6b1b9752543a168f2538 (diff)
First commit of the ccio branch, adding changes for the topology-aware MPIO VFD. The topology-aware VFD is part of the custom-collective I/O (CCIO) version of the MPIO VFD. All CCIO modifications rely on a flattened-buffer (flatbuf) representation of the data in the file and memory spaces. The CCIO options are enabled by environment variables and work by rerouting the usual collective-I/O calls into new select_read and select_write VFD callback functions. The flatbuf selections are performed using existing HDF5 functionality for dataspace selections.
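As a usage note (not part of the patch): below is a minimal C sketch of how an application might enable the CCIO write path, assuming a parallel HDF5 build. The HDF5_CCIO_* environment variable names come from this patch; the file name, dataset shape, and aggregator count are hypothetical, and the variables could just as well be exported from the job script instead of set with setenv().

#include <stdlib.h>
#include <mpi.h>
#include <hdf5.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    /* Enable the CCIO write path before the file is opened
     * (the MPIO VFD reads these variables when the file is opened). */
    setenv("HDF5_CCIO_WR", "yes", 1);
    setenv("HDF5_CCIO_CB_NODES", "4", 1);   /* number of aggregators (hypothetical) */

    hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);
    hid_t file = H5Fcreate("ccio_demo.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl);

    hsize_t dims[1] = {1024};
    hid_t space = H5Screate_simple(1, dims, NULL);
    hid_t dset  = H5Dcreate2(file, "data", H5T_NATIVE_INT, space,
                             H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);

    /* CCIO only reroutes *collective* contiguous I/O, so request it explicitly */
    hid_t dxpl = H5Pcreate(H5P_DATASET_XFER);
    H5Pset_dxpl_mpio(dxpl, H5FD_MPIO_COLLECTIVE);

    /* Every rank writes the whole (small) dataset here; a real code would
     * select per-rank hyperslabs in the file and memory spaces. */
    int buf[1024] = {0};
    H5Dwrite(dset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, dxpl, buf);

    H5Pclose(dxpl); H5Dclose(dset); H5Sclose(space);
    H5Fclose(file); H5Pclose(fapl);
    MPI_Finalize();
    return 0;
}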
-rw-r--r--  src/H5Dmpio.c              103
-rw-r--r--  src/H5FDcore.c               2
-rw-r--r--  src/H5FDdirect.c             2
-rw-r--r--  src/H5FDfamily.c             2
-rw-r--r--  src/H5FDint.c               75
-rw-r--r--  src/H5FDlog.c                2
-rw-r--r--  src/H5FDmpio.c            5107
-rw-r--r--  src/H5FDmpio_topology.h    922
-rw-r--r--  src/H5FDmulti.c              2
-rw-r--r--  src/H5FDprivate.h            3
-rw-r--r--  src/H5FDpublic.h            30
-rw-r--r--  src/H5FDsec2.c               2
-rw-r--r--  src/H5FDstdio.c              2
-rw-r--r--  src/H5Fio.c                128
-rw-r--r--  src/H5Fprivate.h            16
-rw-r--r--  src/H5Sall.c                 9
-rw-r--r--  src/H5Shyper.c              29
-rw-r--r--  src/H5Smpio.c              122
-rw-r--r--  src/H5Spoint.c              15
-rw-r--r--  src/H5Sprivate.h            29
20 files changed, 6448 insertions, 154 deletions
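The diff below refers repeatedly to the flattened-buffer (flatbuf) representation; the H5S_flatbuf_t fields are documented in a comment inside the H5FDmpio.c hunk further down. The following self-contained sketch is not part of the patch: it only illustrates, with hypothetical values and a local stand-in struct, how a 2x4-element hyperslab out of a 4x8 row-major dataset of 4-byte elements would be described as offset/length pairs.

#include <stdio.h>
#include <stdlib.h>

/* Local mirror of the H5S_flatbuf_t fields described in H5FDmpio.c below
 * (plain C types used here so the sketch builds without HDF5 headers). */
typedef struct {
    size_t         count;      /* number of contiguous blocks                 */
    size_t        *blocklens;  /* length of each block, in bytes              */
    unsigned long *indices;    /* byte offset of each block within the extent */
    unsigned long  extent;     /* offset range for one instance of the buffer */
    unsigned long  size;       /* total number of selected bytes              */
} flatbuf_sketch;

int main(void)
{
    /* A 2x4 hyperslab starting at (1,2) in a 4x8 dataset of 4-byte elements:
     * each selected row is one contiguous run of 4*4 = 16 bytes. */
    flatbuf_sketch fb;
    fb.count     = 2;
    fb.blocklens = malloc(fb.count * sizeof(*fb.blocklens));
    fb.indices   = malloc(fb.count * sizeof(*fb.indices));
    fb.blocklens[0] = 16;  fb.indices[0] = (1*8 + 2) * 4;   /* row 1, col 2 */
    fb.blocklens[1] = 16;  fb.indices[1] = (2*8 + 2) * 4;   /* row 2, col 2 */
    fb.extent = 4 * 8 * 4;                                  /* whole dataset */
    fb.size   = fb.blocklens[0] + fb.blocklens[1];

    for (size_t i = 0; i < fb.count; i++)
        printf("block %zu: offset %lu, length %zu bytes\n",
               i, fb.indices[i], fb.blocklens[i]);

    free(fb.blocklens);
    free(fb.indices);
    return 0;
}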
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index 2c06800..7aebbbb 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -618,6 +618,10 @@ H5D__contig_collective_read(H5D_io_info_t *io_info, const H5D_type_info_t *type_
H5D_chunk_map_t H5_ATTR_UNUSED *fm)
{
H5D_mpio_actual_io_mode_t actual_io_mode = H5D_MPIO_CONTIGUOUS_COLLECTIVE;
+ char *do_custom_agg; /* CCIO-read env variable */
+ hid_t file_space_hid; /* file space for CCIO */
+ hid_t mem_space_hid; /* memory space for CCIO */
+ const H5D_contig_storage_t *store_contig; /* Storage structure for CCIO */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_PACKAGE
@@ -625,9 +629,32 @@ H5D__contig_collective_read(H5D_io_info_t *io_info, const H5D_type_info_t *type_
/* Sanity check */
HDassert(H5FD_MPIO == H5F_DRIVER_ID(io_info->dset->oloc.file));
- /* Call generic internal collective I/O routine */
- if(H5D__inter_collective_io(io_info, type_info, file_space, mem_space) < 0)
- HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't finish shared collective MPI-IO")
+ /* Check for CCIO-read option */
+ do_custom_agg = HDgetenv("HDF5_CCIO_RD");
+ if (do_custom_agg && (strcmp(do_custom_agg,"yes") == 0)) {
+
+ /* Call select_read (rather than `inter_collective`) when using CCIO */
+ file_space_hid = H5I_register(H5I_DATASPACE, file_space, TRUE);
+ mem_space_hid = H5I_register(H5I_DATASPACE, mem_space, TRUE);
+
+ /* Contiguous storage info for this I/O operation: */
+ store_contig = &(io_info->store->contig);
+
+ if(H5F_select_read(io_info->dset->oloc.file, H5FD_MEM_DRAW, file_space_hid, mem_space_hid, (size_t)type_info->src_type_size, store_contig->dset_addr, io_info->u.rbuf) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "can't finish collective parallel read")
+
+ if(NULL != ((H5S_t *)H5I_object_verify(file_space_hid, H5I_DATASPACE)))
+ H5Sclose(file_space_hid);
+ if(NULL != ((H5S_t *)H5I_object_verify(mem_space_hid, H5I_DATASPACE)))
+ H5Sclose(mem_space_hid);
+
+ } else {
+
+ /* Call generic internal collective I/O routine */
+ if(H5D__inter_collective_io(io_info, type_info, file_space, mem_space) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't finish shared collective MPI-IO")
+
+ }
/* Set the actual I/O mode property. internal_collective_io will not break to
* independent I/O, so we set it here.
@@ -658,6 +685,10 @@ H5D__contig_collective_write(H5D_io_info_t *io_info, const H5D_type_info_t *type
H5D_chunk_map_t H5_ATTR_UNUSED *fm)
{
H5D_mpio_actual_io_mode_t actual_io_mode = H5D_MPIO_CONTIGUOUS_COLLECTIVE;
+ char *do_custom_agg; /* CCIO-write env variable */
+ hid_t file_space_hid; /* file space for CCIO */
+ hid_t mem_space_hid; /* memory space for CCIO */
+ const H5D_contig_storage_t *store_contig; /* Storage structure for CCIO */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_PACKAGE
@@ -665,9 +696,32 @@ H5D__contig_collective_write(H5D_io_info_t *io_info, const H5D_type_info_t *type
/* Sanity check */
HDassert(H5FD_MPIO == H5F_DRIVER_ID(io_info->dset->oloc.file));
- /* Call generic internal collective I/O routine */
- if(H5D__inter_collective_io(io_info, type_info, file_space, mem_space) < 0)
- HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "couldn't finish shared collective MPI-IO")
+ /* Check for CCIO-write option */
+ do_custom_agg = HDgetenv("HDF5_CCIO_WR");
+ if (do_custom_agg && (strcmp(do_custom_agg,"yes") == 0)) {
+
+ /* Call select_write (rather than `inter_collective`) when using CCIO */
+ file_space_hid = H5I_register(H5I_DATASPACE, file_space, TRUE);
+ mem_space_hid = H5I_register(H5I_DATASPACE, mem_space, TRUE);
+
+ /* Contiguous storage info for this I/O operation: */
+ store_contig = &(io_info->store->contig);
+
+ if(H5F_select_write(io_info->dset->oloc.file, H5FD_MEM_DRAW, file_space_hid, mem_space_hid, (size_t)type_info->src_type_size, store_contig->dset_addr, io_info->u.wbuf) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "can't finish collective parallel write")
+
+ if(NULL != ((H5S_t *)H5I_object_verify(file_space_hid, H5I_DATASPACE)))
+ H5Sclose(file_space_hid);
+ if(NULL != ((H5S_t *)H5I_object_verify(mem_space_hid, H5I_DATASPACE)))
+ H5Sclose(mem_space_hid);
+
+ } else {
+
+ /* Call generic internal collective I/O routine */
+ if(H5D__inter_collective_io(io_info, type_info, file_space, mem_space) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "couldn't finish shared collective MPI-IO")
+
+ }
/* Set the actual I/O mode property. internal_collective_io will not break to
* independent I/O, so we set it here.
@@ -743,7 +797,7 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
io_option = H5D_ONE_LINK_CHUNK_IO; /*no opt*/
/* direct request to multi-chunk-io */
else if(H5FD_MPIO_CHUNK_MULTI_IO == chunk_opt_mode)
- io_option = H5D_MULTI_CHUNK_IO;
+ io_option = H5D_MULTI_CHUNK_IO;
/* via default path. branch by num threshold */
else {
unsigned one_link_chunk_io_threshold; /* Threshold to use single collective I/O for all chunks */
@@ -1054,9 +1108,9 @@ if(H5DEBUG(D))
/* Obtain MPI derived datatype from all individual chunks */
for(u = 0; u < num_chunk; u++) {
- hsize_t *permute_map = NULL; /* array that holds the mapping from the old,
- out-of-order displacements to the in-order
- displacements of the MPI datatypes of the
+ hsize_t *permute_map = NULL; /* array that holds the mapping from the old,
+ out-of-order displacements to the in-order
+ displacements of the MPI datatypes of the
point selection of the file space */
hbool_t is_permuted = FALSE;
@@ -1066,8 +1120,8 @@ if(H5DEBUG(D))
* where it will be freed.
*/
if(H5S_mpio_space_type(chunk_addr_info_array[u].chunk_info.fspace,
- type_info->src_type_size,
- &chunk_ftype[u], /* OUT: datatype created */
+ type_info->src_type_size,
+ &chunk_ftype[u], /* OUT: datatype created */
&chunk_mpi_file_counts[u], /* OUT */
&(chunk_mft_is_derived_array[u]), /* OUT */
TRUE, /* this is a file space,
@@ -1085,9 +1139,9 @@ if(H5DEBUG(D))
if(is_permuted)
HDassert(permute_map);
if(H5S_mpio_space_type(chunk_addr_info_array[u].chunk_info.mspace,
- type_info->dst_type_size, &chunk_mtype[u],
- &chunk_mpi_mem_counts[u],
- &(chunk_mbt_is_derived_array[u]),
+ type_info->dst_type_size, &chunk_mtype[u],
+ &chunk_mpi_mem_counts[u],
+ &(chunk_mbt_is_derived_array[u]),
FALSE, /* this is a memory
space, so if the file
space is not
@@ -1947,9 +2001,9 @@ H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
if((file_space != NULL) && (mem_space != NULL)) {
int mpi_file_count; /* Number of file "objects" to transfer */
- hsize_t *permute_map = NULL; /* array that holds the mapping from the old,
- out-of-order displacements to the in-order
- displacements of the MPI datatypes of the
+ hsize_t *permute_map = NULL; /* array that holds the mapping from the old,
+ out-of-order displacements to the in-order
+ displacements of the MPI datatypes of the
point selection of the file space */
hbool_t is_permuted = FALSE;
@@ -1958,8 +2012,8 @@ H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
* and will be fed into the next call to H5S_mpio_space_type
* where it will be freed.
*/
- if(H5S_mpio_space_type(file_space, type_info->src_type_size,
- &mpi_file_type, &mpi_file_count, &mft_is_derived, /* OUT: datatype created */
+ if(H5S_mpio_space_type(file_space, type_info->src_type_size,
+ &mpi_file_type, &mpi_file_count, &mft_is_derived, /* OUT: datatype created */
TRUE, /* this is a file space, so
permute the datatype if the
point selection is out of
@@ -1968,13 +2022,13 @@ H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
the permutation of
points selected in
case they are out of
- order */
+ order */
&is_permuted /* OUT */) < 0)
HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI file type")
/* Sanity check */
if(is_permuted)
HDassert(permute_map);
- if(H5S_mpio_space_type(mem_space, type_info->src_type_size,
+ if(H5S_mpio_space_type(mem_space, type_info->src_type_size,
&mpi_buf_type, &mpi_buf_count, &mbt_is_derived, /* OUT: datatype created */
FALSE, /* this is a memory space, so if
the file space is not
@@ -1986,7 +2040,7 @@ H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
generated by the
file_space selection
and applied to the
- memory selection */,
+ memory selection */,
&is_permuted /* IN */) < 0)
HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI buffer type")
/* Sanity check */
@@ -2556,7 +2610,7 @@ H5D__construct_filtered_io_info_list(const H5D_io_info_t *io_info, const H5D_typ
local_info_array[i].num_writers = 0;
local_info_array[i].owners.original_owner = local_info_array[i].owners.new_owner = mpi_rank;
local_info_array[i].buf = NULL;
-
+
local_info_array[i].async_info.num_receive_requests = 0;
local_info_array[i].async_info.receive_buffer_array = NULL;
local_info_array[i].async_info.receive_requests_array = NULL;
@@ -3246,4 +3300,3 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D__filtered_collective_chunk_entry_io() */
#endif /* H5_HAVE_PARALLEL */
-
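One practical detail worth noting from the H5Dmpio.c hunk above: the read and write paths are gated independently by HDF5_CCIO_RD and HDF5_CCIO_WR, and the gate is an exact, case-sensitive comparison against the string "yes" (so, for example, "1" or "YES" would leave the generic collective path in place). The trivial standalone check below mirrors that gate; the real code uses HDgetenv, HDF5's portability wrapper around getenv.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Mirrors the activation test in H5D__contig_collective_read/_write above */
static int ccio_enabled(const char *env_name)
{
    const char *val = getenv(env_name);
    return (val != NULL && strcmp(val, "yes") == 0);
}

int main(void)
{
    printf("CCIO reads:  %s\n", ccio_enabled("HDF5_CCIO_RD") ? "enabled" : "disabled");
    printf("CCIO writes: %s\n", ccio_enabled("HDF5_CCIO_WR") ? "enabled" : "disabled");
    return 0;
}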
diff --git a/src/H5FDcore.c b/src/H5FDcore.c
index 2ab04dc..ef346f7 100644
--- a/src/H5FDcore.c
+++ b/src/H5FDcore.c
@@ -170,6 +170,8 @@ static const H5FD_class_t H5FD_core_g = {
H5FD__core_get_handle, /* get_handle */
H5FD__core_read, /* read */
H5FD__core_write, /* write */
+ NULL, /* select_read */
+ NULL, /* select_write */
H5FD__core_flush, /* flush */
H5FD__core_truncate, /* truncate */
H5FD_core_lock, /* lock */
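As in H5FDcore.c above, every driver class gains two new callback slots; drivers that do not implement selection-based I/O register NULL, and the H5FD_select_read/H5FD_select_write dispatchers added in H5FDint.c (below) assert that the slot is non-NULL before calling it. The sketch below only approximates the shape of the new callback prototypes, inferred from the dispatch calls in this patch; the authoritative declarations are in the H5FDpublic.h hunk, which is not shown in full in this excerpt, and the typedef names here are illustrative.

#include "hdf5.h"   /* for herr_t, hid_t, haddr_t, H5FD_t, H5FD_mem_t */

/* Approximate shape of the two new H5FD_class_t members (names are illustrative) */
typedef herr_t (*H5FD_select_read_func_t)(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id,
    hid_t file_space, hid_t mem_space, size_t elmt_size, haddr_t addr, void *buf /*out*/);
typedef herr_t (*H5FD_select_write_func_t)(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id,
    hid_t file_space, hid_t mem_space, size_t elmt_size, haddr_t addr, const void *buf);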
diff --git a/src/H5FDdirect.c b/src/H5FDdirect.c
index 906ec28..b39acf6 100644
--- a/src/H5FDdirect.c
+++ b/src/H5FDdirect.c
@@ -167,6 +167,8 @@ static const H5FD_class_t H5FD_direct_g = {
H5FD_direct_get_handle, /*get_handle */
H5FD_direct_read, /*read */
H5FD_direct_write, /*write */
+ NULL, /* select_read */
+ NULL, /* select_write */
NULL, /*flush */
H5FD_direct_truncate, /*truncate */
H5FD_direct_lock, /*lock */
diff --git a/src/H5FDfamily.c b/src/H5FDfamily.c
index e52a71a..bd15cd8 100644
--- a/src/H5FDfamily.c
+++ b/src/H5FDfamily.c
@@ -135,6 +135,8 @@ static const H5FD_class_t H5FD_family_g = {
H5FD_family_get_handle, /*get_handle */
H5FD_family_read, /*read */
H5FD_family_write, /*write */
+ NULL, /*select_read */
+ NULL, /*select_write */
H5FD_family_flush, /*flush */
H5FD_family_truncate, /*truncate */
H5FD_family_lock, /*lock */
diff --git a/src/H5FDint.c b/src/H5FDint.c
index ea8c4d8..22292b4 100644
--- a/src/H5FDint.c
+++ b/src/H5FDint.c
@@ -252,6 +252,81 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_write() */
+/*-------------------------------------------------------------------------
+ * Function: H5FD_select_read
+ *
+ * Purpose: Private version of H5FDselect_read()
+ *
+ * Return: Success: Non-negative
+ * Failure: Negative
+ *
+ * Programmer: Quincey Koziol
+ * Saturday, November 4, 2017
+ *
+ * Rick Zamora (Revised October 24, 2018)
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5FD_select_read(H5FD_t *file, H5FD_mem_t type,
+ hid_t file_space, hid_t mem_space, size_t elmt_size, haddr_t addr, void *buf/*out*/)
+{
+ hid_t dxpl_id = H5I_INVALID_HID; /* DXPL for operation */
+ herr_t ret_value = SUCCEED; /* Return value */
+ FUNC_ENTER_NOAPI(FAIL)
+ {}
+ /* Sanity checks */
+ HDassert(file);
+ HDassert(file->cls);
+ HDassert(file->cls->select_read);
+ HDassert(buf);
+ /* Get proper DXPL for I/O */
+ dxpl_id = H5CX_get_dxpl();
+ /* Dispatch to driver */
+ if((file->cls->select_read)(file, type, dxpl_id, file_space, mem_space, elmt_size, addr + file->base_addr, buf) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "driver read request failed")
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_select_read() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_select_write
+ *
+ * Purpose: Private version of H5FDselect_write()
+ *
+ * Return: Success: Non-negative
+ * Failure: Negative
+ *
+ * Programmer: Quincey Koziol
+ * Saturday, November 4, 2017
+ *
+ * Rick Zamora (Revised October 24, 2018)
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5FD_select_write(H5FD_t *file, H5FD_mem_t type,
+ hid_t file_space, hid_t mem_space, size_t elmt_size, haddr_t addr, const void *buf)
+{
+ hid_t dxpl_id; /* DXPL for operation */
+ haddr_t eoa = HADDR_UNDEF; /* EOA for file */
+ herr_t ret_value = SUCCEED; /* Return value */
+ FUNC_ENTER_NOAPI(FAIL)
+ {}
+ /* Sanity checks */
+ HDassert(file);
+ HDassert(file->cls);
+ HDassert(file->cls->select_write);
+ HDassert(buf);
+ /* Get proper DXPL for I/O */
+ dxpl_id = H5CX_get_dxpl();
+ /* Dispatch to driver */
+ if((file->cls->select_write)(file, type, dxpl_id, file_space, mem_space, elmt_size, addr + file->base_addr, buf) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "driver write request failed")
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_select_write() */
+
/*-------------------------------------------------------------------------
* Function: H5FD_set_eoa
diff --git a/src/H5FDlog.c b/src/H5FDlog.c
index 3dcd7f5..18a61aa 100644
--- a/src/H5FDlog.c
+++ b/src/H5FDlog.c
@@ -211,6 +211,8 @@ static const H5FD_class_t H5FD_log_g = {
H5FD_log_get_handle, /*get_handle */
H5FD_log_read, /*read */
H5FD_log_write, /*write */
+ NULL, /*select_read */
+ NULL, /*select_write */
NULL, /*flush */
H5FD_log_truncate, /*truncate */
H5FD_log_lock, /*lock */
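The H5FDmpio.c changes that follow add the CCIO data structures (including the MPI windows io_buf_window and io_buf_put_amounts_window) and the one-sided aggregation entry points. The standalone MPI program below is only a generic illustration of the one-sided aggregation idea those windows support, not the patch's algorithm: non-aggregator ranks MPI_Put their data into an aggregator's collective buffer, and the aggregator then issues a single file write. The output file name and chunk size are arbitrary.

#include <mpi.h>
#include <stdlib.h>
#include <string.h>

#define CHUNK 1024   /* bytes contributed by each rank (arbitrary) */

int main(int argc, char **argv)
{
    int rank, nprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    /* Rank 0 is the lone aggregator: it exposes a collective buffer large
     * enough for everyone; the other ranks expose an empty window. */
    char dummy = 0;
    char *cb_buf = &dummy;
    MPI_Aint win_size = 0;
    if (rank == 0) {
        win_size = (MPI_Aint)nprocs * CHUNK;
        cb_buf = malloc((size_t)win_size);
    }

    MPI_Win win;
    MPI_Win_create(cb_buf, win_size, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    /* Every rank deposits its contribution at its own offset on the aggregator */
    char mybytes[CHUNK];
    memset(mybytes, 'A' + (rank % 26), CHUNK);
    MPI_Win_fence(0, win);
    MPI_Put(mybytes, CHUNK, MPI_BYTE, 0, (MPI_Aint)rank * CHUNK, CHUNK, MPI_BYTE, win);
    MPI_Win_fence(0, win);

    /* Only the aggregator touches the file system */
    MPI_File fh;
    MPI_File_open(MPI_COMM_WORLD, "aggregated.out",
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
    if (rank == 0)
        MPI_File_write_at(fh, 0, cb_buf, nprocs * CHUNK, MPI_BYTE, MPI_STATUS_IGNORE);
    MPI_File_close(&fh);

    MPI_Win_free(&win);
    if (rank == 0)
        free(cb_buf);
    MPI_Finalize();
    return 0;
}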
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index 87f8b6a..e4d7c87 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -21,7 +21,6 @@
#include "H5FDdrvr_module.h" /* This source code file is part of the H5FD driver module */
-
#include "H5private.h" /* Generic Functions */
#include "H5CXprivate.h" /* API Contexts */
#include "H5Dprivate.h" /* Dataset functions */
@@ -32,8 +31,206 @@
#include "H5Iprivate.h" /* IDs */
#include "H5MMprivate.h" /* Memory management */
#include "H5Pprivate.h" /* Property lists */
+#include "H5FDmpio_topology.h" /* Topology API */
+#include <pthread.h>
+//#define topo_timing
+//#define onesidedtrace
#ifdef H5_HAVE_PARALLEL
+#ifdef BGQ
+#define inline
+#endif
+
+/* optypes for ADIO Requests */
+#define READ_CA 26
+#define WRITE_CA 27
+
+/*******************************/
+/* CCIO Typedefs and Functions */
+/*******************************/
+
+typedef struct CustomAgg_FH_Struct_Data *CustomAgg_FH_Data;
+typedef long ADIO_Offset_CA;
+
+/*
+ * Declaration of i/o thread data structure (bgmpio_pthreadwc)
+ */
+typedef struct wcThreadFuncData_CA {
+ MPI_File fh;
+ int io_kind;
+ char *buf;
+ MPI_Offset size;
+ MPI_Offset offset;
+ int error_code;
+ int myrank;
+} ThreadFuncData;
+
+/*
+ * FSLayout determines how aggregators will be mapped to the file
+ * LUSTRE -> Aggregators will be mapped to specific LUSTRE-like stripes
+ * GPFS -> Aggregators will be each assigned to a contiguous file domain
+ */
+enum FSLayout{LUSTRE, GPFS};
+
+/*
+ * Structure holding important info for CCIO options
+ * (Must be populated at the MPI_File_open)
+ */
+typedef struct CustomAgg_FH_Struct_Data {
+ MPI_Comm comm;
+ MPI_File fh;
+ int io_buf_put_amounts;
+ char *io_buf;
+ MPI_Win io_buf_window; /* Window over the io_buf to support one-sided aggregation */
+ MPI_Win io_buf_put_amounts_window; /* Window over the io_buf_put_amounts */
+ int ccio_read;
+ int ccio_write;
+ int cb_nodes;
+ int ppn; /* Only used in topology-aware cb selection if env var is set */
+ int pps; /* Only used in topology-aware cb selection if env var is set */
+ enum AGGSelect topo_cb_select;
+ int cb_buffer_size;
+ int fs_block_count;
+ int fs_block_size;
+ int onesided_always_rmw;
+ int onesided_no_rmw;
+ int onesided_inform_rmw;
+ int onesided_write_aggmethod;
+ int onesided_read_aggmethod;
+ int *ranklist;
+ int ranklist_populated;
+ /* ------- Added for Async IO ------- */
+ int async_io_outer; /* Assume H5FD_mpio_ccio_osagg_write calls will only require 1 "inner" round */
+ int async_io_inner; /* Assume File-domain aggregation mapping */
+ char *io_buf_d; /* Duplicate for "outer" async IO */
+ int io_buf_put_amounts_d; /* Duplicate for "outer" async IO */
+ MPI_Win io_buf_window_d; /* Duplicate for "outer" async IO */
+ MPI_Win io_buf_put_amounts_window_d; /* Duplicate for "outer" async IO */
+ MPIO_Request io_Request;
+ MPIO_Request io_Request_d;
+ int check_req;
+ int check_req_d;
+ int use_dup;
+ int pthread_io;
+ /* ---------------------------------- */
+ enum FSLayout fslayout;
+} CustomAgg_FH_Struct_Data;
+
+/*
+ * This data structure holds parameters related to regulating
+ * the one-sided aggregation algorithm.
+ */
+typedef struct FS_Block_Parms {
+ int stripeSize; /* size in bytes of the "striping" unit - a size of 0 indicates to the */
+ /* onesided algorithm that we are a non-striping file system */
+ ADIO_Offset_CA segmentLen; /* size in bytes of the segment (stripeSize*number of aggs) */
+ /* up to the size of the file) */
+ int stripesPerAgg; /* the number of stripes to be packed into an agg cb for this segment */
+ int segmentIter; /* segment number for the group of stripes currently being packed into */
+ /* the agg cb - resets to 0 for each cb flush to the file system */
+ int flushCB; /* once we have fully packed the cb on an agg this flags */
+ /* tells us to now write to the file */
+ ADIO_Offset_CA stripedLastFileOffset; /* since we are now just calling the onesided algorithm */
+ /* with the offset range of segment, we still need to */
+ /* know the actual last offset of the file. */
+ int firstStripedIOCall; /* whether this is the first call in the first segment of the */
+ /* onesided algorithm. */
+ int lastStripedIOCall; /* whether this is the last call in the last segment of the */
+ /* onesided algorithm. */
+ int iWasUsedStripingAgg; /* whether this rank was ever a used agg for this striping segment */
+ int numStripesUsed; /* the number of stripes packed into an aggregator */
+ /* These 2 elements are the offset and lengths in the file corresponding to the actual stripes */
+ MPI_Offset *stripeIOoffsets;
+ int *stripeIOLens;
+ int amountOfStripedDataExpected; /* used to determine holes in this segment thereby requiring a rmw */
+ /* These 2 elements enable ADIOI_OneSidedWriteAggregation to be called multiple times but only */
+ /* perform the potentially computationally costly flattening of the source buffer just once */
+ hsize_t bufTypeExtent;
+ /* These three elements track the state of the source buffer advancement through multiple calls */
+ /* to ADIOI_OneSidedWriteAggregation */
+ ADIO_Offset_CA lastDataTypeExtent;
+ int lastFlatBufIndice;
+ ADIO_Offset_CA lastIndiceOffset;
+} FS_Block_Parms;
+
+/*
+ * This data structure holds the access state of the source buffer for target
+ * file domains within aggregators corresponding to the target data blocks.
+ * The validity of the usage of this structure relies on the requirement that
+ * only 1 aggregator can write to a given file domain.
+ */
+typedef struct FDSourceBufferState_CA {
+ ADIO_Offset_CA indiceOffset;
+ hsize_t bufTypeExtent;
+ ADIO_Offset_CA dataTypeExtent;
+ int flatBufIndice;
+ ADIO_Offset_CA sourceBufferOffset;
+} FDSourceBufferState_CA;
+
+void calc_file_domains(ADIO_Offset_CA *st_offsets, ADIO_Offset_CA *end_offsets,
+ int nprocs, int nprocs_for_coll, ADIO_Offset_CA *min_st_offset_ptr,
+ ADIO_Offset_CA **fd_start_ptr, ADIO_Offset_CA **fd_end_ptr,
+ ADIO_Offset_CA *fd_size_ptr, ADIO_Offset_CA blksize);
+
+void H5FD_mpio_ccio_write_one_sided(CustomAgg_FH_Data ca_data, const void *buf,
+ MPI_Offset mpi_off, H5S_flatbuf_t *memFlatBuf, H5S_flatbuf_t *fileFlatBuf, int *error_code);
+
+void H5FD_mpio_ccio_read_one_sided(CustomAgg_FH_Data ca_data, void *buf, MPI_Offset mpi_off,
+ H5S_flatbuf_t *memFlatBuf, H5S_flatbuf_t *fileFlatBuf, int *error_code);
+
+void H5FD_mpio_ccio_iterate_write(CustomAgg_FH_Data ca_data, const void *buf,
+ int *fs_block_info, ADIO_Offset_CA *offset_list, ADIO_Offset_CA *len_list,
+ MPI_Offset mpi_off, int contig_access_count, int currentValidDataIndex,
+ ADIO_Offset_CA start_offset, ADIO_Offset_CA end_offset,
+ ADIO_Offset_CA firstFileOffset, ADIO_Offset_CA lastFileOffset,
+ H5S_flatbuf_t *memFlatBuf, H5S_flatbuf_t *fileFlatBuf, int myrank, int *error_code);
+
+void H5FD_mpio_ccio_iterate_read(CustomAgg_FH_Data ca_data, void *buf,
+ int *fs_block_info, ADIO_Offset_CA *offset_list, ADIO_Offset_CA *len_list,
+ MPI_Offset mpi_off, int contig_access_count, int currentValidDataIndex,
+ ADIO_Offset_CA start_offset, ADIO_Offset_CA end_offset,
+ ADIO_Offset_CA firstFileOffset, ADIO_Offset_CA lastFileOffset,
+ H5S_flatbuf_t *memFlatBuf, H5S_flatbuf_t *fileFlatBuf, int myrank, int *error_code);
+
+void H5FD_mpio_ccio_osagg_write(CustomAgg_FH_Data ca_data,
+ ADIO_Offset_CA *offset_list,
+ ADIO_Offset_CA *len_list,
+ int contig_access_count,
+ const void *buf,
+ H5S_flatbuf_t *memFlatBuf,
+ int *error_code,
+ ADIO_Offset_CA firstFileOffset,
+ ADIO_Offset_CA lastFileOffset,
+ int numNonZeroDataOffsets,
+ ADIO_Offset_CA *fd_start,
+ ADIO_Offset_CA* fd_end,
+ int hole_found,
+ FS_Block_Parms *stripe_parms);
+
+void H5FD_mpio_ccio_osagg_read(CustomAgg_FH_Data ca_data,
+ ADIO_Offset_CA *offset_list,
+ ADIO_Offset_CA *len_list,
+ int contig_access_count,
+ const void *buf,
+ H5S_flatbuf_t *flatBuf,
+ int *error_code,
+ ADIO_Offset_CA firstFileOffset,
+ ADIO_Offset_CA lastFileOffset,
+ int numNonZeroDataOffsets,
+ ADIO_Offset_CA *fd_start,
+ ADIO_Offset_CA* fd_end,
+ FS_Block_Parms *stripe_parms,
+ int do_file_read);
+
+void H5FD_mpio_ccio_file_read(CustomAgg_FH_Data ca_data, int *error_code,
+ ADIO_Offset_CA firstFileOffset, ADIO_Offset_CA lastFileOffset,
+ ADIO_Offset_CA *fd_start, ADIO_Offset_CA* fd_end);
+
+void *IO_Thread_Func(void *vptr_args);
+
+/***********************************/
+/* END CCIO Typedefs and Functions */
+/***********************************/
/*
* The driver identification number, initialized at runtime if H5_HAVE_PARALLEL
@@ -68,6 +265,7 @@ typedef struct H5FD_mpio_t {
haddr_t eoa; /*end-of-address marker */
haddr_t last_eoa; /* Last known end-of-address marker */
haddr_t local_eof; /* Local end-of-file address for each process */
+ CustomAgg_FH_Struct_Data custom_agg_data;
} H5FD_mpio_t;
/* Private Prototypes */
@@ -89,12 +287,20 @@ static herr_t H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, hadd
size_t size, void *buf);
static herr_t H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
size_t size, const void *buf);
+static herr_t H5FD_mpio_custom_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id,
+ hid_t file_space, hid_t mem_space, size_t elmt_size, haddr_t addr, void *buf);
+static herr_t H5FD_mpio_custom_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id,
+ hid_t file_space, hid_t mem_space, size_t elmt_size, haddr_t addr, const void *buf);
static herr_t H5FD_mpio_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
static herr_t H5FD_mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
static int H5FD_mpio_mpi_rank(const H5FD_t *_file);
static int H5FD_mpio_mpi_size(const H5FD_t *_file);
static MPI_Comm H5FD_mpio_communicator(const H5FD_t *_file);
static herr_t H5FD_mpio_get_info(H5FD_t *_file, void** mpi_info);
+static herr_t H5FD_mpio_ccio_setup(const char *name, H5FD_mpio_t *file, MPI_File fh);
+static herr_t H5FD_mpio_ccio_cleanup(const H5FD_mpio_t *file);
+static herr_t H5FD_mpio_setup_flatbuf( H5S_sel_type space_sel_type, H5S_flatbuf_t *curflatbuf,
+ H5S_sel_iter_t *sel_iter, H5S_t *space_stype, size_t elmt_size, hbool_t is_regular);
/* The MPIO file driver information */
static const H5FD_class_mpi_t H5FD_mpio_g = {
@@ -126,6 +332,8 @@ static const H5FD_class_mpi_t H5FD_mpio_g = {
H5FD_mpio_get_handle, /*get_handle */
H5FD_mpio_read, /*read */
H5FD_mpio_write, /*write */
+ H5FD_mpio_custom_read, /*select_read */
+ H5FD_mpio_custom_write, /*select_write */
H5FD_mpio_flush, /*flush */
H5FD_mpio_truncate, /*truncate */
NULL, /*lock */
@@ -157,7 +365,6 @@ static int H5FD_mpio_Debug[256] =
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
#endif
-
/*--------------------------------------------------------------------------
NAME
@@ -1013,6 +1220,10 @@ H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id,
file->mpi_rank = mpi_rank;
file->mpi_size = mpi_size;
+ /* Setup structures for ccio optimizations */
+ /* (Optimizations used for select_<write,read> calls) */
+ H5FD_mpio_ccio_setup(name, file, fh);
+
/* Only processor p0 will get the filesize and broadcast it. */
if (mpi_rank == 0) {
if (MPI_SUCCESS != (mpi_code=MPI_File_get_size(fh, &size)))
@@ -1106,6 +1317,9 @@ H5FD_mpio_close(H5FD_t *_file)
if (MPI_SUCCESS != (mpi_code=MPI_File_close(&(file->f)/*in,out*/)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_close failed", mpi_code)
+ /* Clean structures used for ccio optimizations */
+ H5FD_mpio_ccio_cleanup( file );
+
/* Clean up other stuff */
H5FD_mpi_comm_info_free(&file->comm, &file->info);
H5MM_xfree(file);
@@ -1863,7 +2077,584 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpio_write() */
-
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_setup_flatbuf
+ *
+ * Purpose: Define the flatbuf structure needed by H5FD_mpio_custom_write.
+ * This is a helper function to avoid repeating code. The flatbuf can
+ * be a file or memory flatbuf -- and the structure depends on the type
+ * of selection.
+ *
+ * Return: Success: Non-negative
+ * Failure: Negative
+ *
+ * Programmer: Rick Zamora, 2018-07-03
+ * (Based on code originally in H5FD_mpio_custom_write)
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_mpio_setup_flatbuf( H5S_sel_type space_sel_type, H5S_flatbuf_t *curflatbuf,
+ H5S_sel_iter_t *sel_iter, H5S_t *space_stype, size_t elmt_size, hbool_t is_regular)
+{
+ herr_t ret_value = SUCCEED;
+ size_t sel_nseq, sel_nelem;
+ hsize_t flatBufSize;
+ unsigned null_flags = 0;
+ hsize_t num_points;
+ hssize_t snum_points;
+ int numSpaceDims = 0;
+ int numSelDims = 0;
+ H5S_hyper_dim_t *diminfo;
+ hsize_t numBlockEntries = 1;
+ hsize_t numElements = 1;
+ herr_t rc = 0;
+
+ if (space_sel_type == H5S_SEL_NONE) {
+ curflatbuf->indices = NULL;
+ curflatbuf->blocklens = NULL;
+ curflatbuf->count = 0;
+ curflatbuf->size = 0;
+ curflatbuf->extent = 0;
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_Debug[(int)'t']) {
+ fprintf(stdout,"space_sel_type == H5S_SEL_NONE for flatbuf - setting everything to 0\n");
+ fflush(stdout);
+ }
+#endif
+ }
+ else if (space_sel_type == H5S_SEL_ALL) {
+
+ /* For H5S_SEL_ALL there is just 1 big block */
+ curflatbuf->indices = (hsize_t *) H5MM_malloc(1 * sizeof(hsize_t));
+ curflatbuf->blocklens = (size_t *) H5MM_malloc(1 * sizeof(size_t));
+ curflatbuf->count = 1;
+
+ if(H5S__all_get_seq_list(space_stype,null_flags,sel_iter,1,sel_iter->elmt_left,&sel_nseq,&sel_nelem,curflatbuf->indices,curflatbuf->blocklens) < 0)
+ {
+ fprintf(stdout,"ERROR: H5S__all_get_seq_list failed");
+ ret_value = FAIL;
+ }
+
+ /* Compute the extent once the offset/length arrays have been filled in */
+ curflatbuf->extent = curflatbuf->indices[0] + (hsize_t)curflatbuf->blocklens[0];
+
+ flatBufSize = 0;
+ for (int j=0;j<curflatbuf->count;j++) {
+ flatBufSize += curflatbuf->blocklens[j];
+ }
+ curflatbuf->size = flatBufSize;
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_Debug[(int)'t']) {
+ fprintf(stdout,"space_sel_type == H5S_SEL_ALL for flatbuf - curflatbuf->size is %ld curflatbuf->indices[0] is %ld curflatbuf->blocklens[0] is %ld curflatbuf->extent is %ld\n",curflatbuf->size,curflatbuf->indices[0],curflatbuf->blocklens[0],curflatbuf->extent);
+ fflush(stdout);
+ }
+#endif
+ }
+ else if (space_sel_type == H5S_SEL_POINTS) {
+
+ if((snum_points = (hssize_t)H5S_GET_SELECT_NPOINTS(space_stype)) < 0)
+ {
+ fprintf(stdout,"ERROR: can't get number of elements selected");
+ ret_value = FAIL;
+ }
+ num_points = (hsize_t)snum_points;
+
+ curflatbuf->indices = (hsize_t *) H5MM_malloc(num_points * sizeof(hsize_t));
+ curflatbuf->blocklens = (size_t *) H5MM_malloc(num_points * sizeof(size_t));
+ curflatbuf->count = 1;
+
+ /* Get the extent */
+ hsize_t dims[H5O_LAYOUT_NDIMS]; /* Total size of memory buf */
+ if((numSpaceDims = H5S_get_simple_extent_dims (space_stype, dims, NULL)) < 0){
+ fprintf(stdout,"ERROR: unable to retrieve data space dimensions");
+ ret_value = FAIL;
+ }
+ curflatbuf->extent = 1;
+ for (int j=0;j<numSpaceDims;j++)
+ curflatbuf->extent *= dims[j];
+
+ if(H5S_point_get_seq_list(space_stype,null_flags,sel_iter,num_points,num_points,&sel_nseq,&sel_nelem,curflatbuf->indices,curflatbuf->blocklens) < 0)
+ {
+ fprintf(stdout,"ERROR: H5S_point_get_seq_list failed");
+ ret_value = FAIL;
+ }
+
+ flatBufSize = 0;
+ for (int j=0;j<curflatbuf->count;j++) {
+ flatBufSize += curflatbuf->blocklens[j];
+ }
+ curflatbuf->size = flatBufSize;
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_Debug[(int)'t']) {
+ fprintf(stdout,"space_sel_type == H5S_SEL_POINTS called H5S_point_get_seq_list for file flatbuf - curflatbuf->count is %ld numSpaceDims is %d curflatbuf->extent is %ld curflatbuf->size is %ld returned sel_nseq %ld sel_nelem %ld offset/len pairs for curflatbuf->count entries are:\n",curflatbuf->count,numSpaceDims,curflatbuf->extent,curflatbuf->size,sel_nseq,sel_nelem);
+ for (int j=0;j<curflatbuf->count;j++)
+ fprintf(stdout, " %d offset: %ld len: %ld\n",j,curflatbuf->indices[j],curflatbuf->blocklens[j]);
+ fflush(stdout);
+ }
+#endif
+ }
+ else if (space_sel_type == H5S_SEL_HYPERSLABS) {
+
+ if (!is_regular){
+ fprintf(stdout, "ERROR: irregular space selection not supported");
+ // rjz - commenting this check for now:
+ //ret_value = FAIL;
+ }
+
+ diminfo = sel_iter->u.hyp.diminfo;
+ HDassert(diminfo);
+
+ /* Here we need to use a function inside the space module since that is where the H5S_t structure is
+ * actually defined.
+ */
+ rc = H5S_mpio_return_space_rank_and_extent(space_stype, &numSpaceDims, &(curflatbuf->extent));
+
+ curflatbuf->extent *= elmt_size;
+
+ /* Check for flattened selection, if so use the selection iter_rank for the number of
+ * dimensions instead of the space rank.
+ */
+ if(sel_iter->u.hyp.iter_rank != 0 && sel_iter->u.hyp.iter_rank < numSpaceDims)
+ numSelDims = sel_iter->u.hyp.iter_rank;
+ else
+ numSelDims = numSpaceDims;
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_Debug[(int)'t']) {
+ fprintf(stdout,"space_sel_type == H5S_SEL_HYPERSLABS computing numBlockEntries and numElements\n");
+ fflush(stdout);
+ }
+#endif
+
+ numBlockEntries = 1, numElements = 1;
+ for(int u = 0; u < numSelDims; u++) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_Debug[(int)'t']) {
+ fprintf(stdout,"iter %d diminfo[u].count is %ld and diminfo[u].block is %ld\n",u,diminfo[u].count,diminfo[u].block);
+ fflush(stdout);
+ }
+#endif
+ if (u < (numSelDims-1)) {
+ numBlockEntries *= (diminfo[u].count * diminfo[u].block);
+ numElements *= (diminfo[u].count * diminfo[u].block);
+ }
+ else {
+ numBlockEntries *= diminfo[u].count;
+ numElements *= (diminfo[u].count * diminfo[u].block);
+ }
+ }
+
+ curflatbuf->indices = (hsize_t *) H5MM_malloc(numElements * sizeof(hsize_t));
+ curflatbuf->blocklens = (size_t *) H5MM_malloc(numElements * sizeof(size_t));
+ curflatbuf->count = numBlockEntries;
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_Debug[(int)'t']) {
+ fprintf(stdout,"calling H5S__hyper_get_seq_list for file flatbuf - numSelDims is %d numElements is %ld curflatbuf->count is %ld curflatbuf->extent is %ld\n",numSelDims,numElements, curflatbuf->count, curflatbuf->extent);
+ fflush(stdout);
+ }
+#endif
+
+ if(H5S__hyper_get_seq_list(space_stype,null_flags,sel_iter,numElements,numElements,&sel_nseq,&sel_nelem,curflatbuf->indices,curflatbuf->blocklens) < 0)
+ {
+ fprintf(stdout,"ERROR: H5S__hyper_get_seq_list failed");
+ ret_value = FAIL;
+ }
+
+ flatBufSize = 0;
+ for (int j=0;j<curflatbuf->count;j++) {
+ flatBufSize += curflatbuf->blocklens[j];
+ }
+ curflatbuf->size = flatBufSize;
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_Debug[(int)'t']) {
+ fprintf(stdout,"called H5S__hyper_get_seq_list for file flatbuf - numSelDims is %d numElements is %ld curflatbuf->count is %ld curflatbuf->extent is %ld curflatbuf->size is %ld returned sel_nseq %ld sel_nelem %ld offset/len pairs for curflatbuf->count entries are:\n",numSelDims,numElements, curflatbuf->count,curflatbuf->extent,curflatbuf->size,sel_nseq,sel_nelem);
+ for (int j=0;j<curflatbuf->count;j++)
+ fprintf(stdout, " %d offset: %ld len: %ld\n",j,curflatbuf->indices[j],curflatbuf->blocklens[j]);
+ fflush(stdout);
+ }
+#endif
+
+ }
+ else {
+ fprintf(stdout, "ERROR: In H5FD_mpio_setup_flatbuf, space selection type not recognized");
+ ret_value = FAIL;
+ }
+
+ return ret_value;
+} /* H5FD_mpio_setup_flatbuf */
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_custom_write
+ *
+ * Purpose: Writes data from a memory flatbuf into a file flatbuf. The memory
+ * and file flatbuf structures are defined using the H5S_<*>_get_seq_list
+ * functions, where <*> depends on the type of selection: all, points,
+ * hyperslab, or none.
+ * Note that this function is called from H5FD_select_write(), and is
+ * used to call optimized "write" routines defined in the "custom-collective
+ * IO virtual file layer" (CCIO) of the MPIO VFD (see the CCIO routines later in this file).
+ *
+ * Return:
+ *
+ * Programmer: Quincey Koziol and Paul Coffman
+ * Unknown (Winter), 2018
+ *
+ * Modifications:
+ * Rick Zamora, 2018-07-02
+ * cleanup and refactoring.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t H5FD_mpio_custom_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id,
+ hid_t file_space, hid_t mem_space, size_t elmt_size, haddr_t addr, const void *buf)
+{
+ H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
+ MPI_Offset mpi_off;
+ MPI_Status mpi_stat; /* Status from I/O operation */
+ H5S_t *file_space_stype;
+ int file_space_ref_count;
+ H5S_t *mem_space_stype;
+ int mem_space_ref_count;
+ int mpi_code; /* MPI return code */
+#if MPI_VERSION >= 3
+ MPI_Count bytes_written;
+ MPI_Count type_size; /* MPI datatype used for I/O's size */
+ MPI_Count io_size; /* Actual number of bytes requested */
+#else
+ int bytes_written;
+ int type_size; /* MPI datatype used for I/O's size */
+ int io_size; /* Actual number of bytes requested */
+#endif
+ int size_i;
+ H5P_genplist_t *plist = NULL; /* Property list pointer */
+ H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
+ herr_t ret_value = SUCCEED;
+ H5S_flatbuf_t file_flatbuf;
+ H5S_flatbuf_t mem_flatbuf;
+ hbool_t is_permuted = FALSE;
+ hbool_t is_regular = TRUE;
+ H5S_sel_iter_t sel_iter;
+ H5S_class_t file_space_extent_type;
+ H5S_class_t mem_space_extent_type;
+ H5S_sel_type file_space_sel_type;
+ H5S_sel_type mem_space_sel_type;
+ herr_t rc = 0;
+ hsize_t *permute_map = NULL;
+
+ /* Note: permute_map array holds the mapping from the old (out-of-order)
+ * displacements to the in-order displacements of the H5S_flatbuf_t of the
+ * point selection of the file space.
+ */
+
+ FUNC_ENTER_NOAPI_NOINIT
+
+ /* File and memory space setup */
+ file_space_stype = (H5S_t *) H5I_remove(file_space);
+ file_space_ref_count = H5I_dec_ref(file_space);
+ mem_space_stype = (H5S_t *) H5I_remove(mem_space);
+ mem_space_ref_count = H5I_dec_ref(mem_space);
+
+ /* some numeric conversions */
+ if(H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
+
+ size_i = (int)elmt_size;
+ if((hsize_t)size_i != elmt_size)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from elmt_size to size_i")
+
+ HDassert(file);
+ HDassert(H5FD_MPIO==file->pub.driver_id);
+
+ /* Make certain we have the correct type of property list */
+ HDassert(H5I_GENPROP_LST==H5I_get_type(dxpl_id));
+ HDassert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER));
+ HDassert(buf);
+
+ /* Portably initialize MPI status variable */
+ HDmemset(&mpi_stat, 0, sizeof(MPI_Status));
+
+ /*
+ * Create flatbuf for FILE space selection
+ */
+
+ if(H5S_select_iter_init(&sel_iter, file_space_stype, elmt_size) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTINIT, FAIL, "unable to initialize selection iterator")
+
+ rc = H5S_mpio_return_space_extent_and_select_type(file_space_stype, &is_permuted, &is_regular, &file_space_extent_type, &file_space_sel_type);
+
+ if(is_permuted)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "permuted space selections not supported")
+
+ /* Currently, file_space_extent_type must be H5S_NULL, H5S_SCALAR, or H5S_SIMPLE */
+ if (!((file_space_extent_type == H5S_NULL) || (file_space_extent_type == H5S_SCALAR) || (file_space_extent_type == H5S_SIMPLE)))
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "file space extent type invalid")
+
+ if(H5S_SELECT_ITER_RELEASE(&sel_iter) < 0)
+ HDONE_ERROR(H5E_DATASPACE, H5E_CANTRELEASE, FAIL, "unable to release selection iterator")
+
+ if ((file_space_sel_type == H5S_SEL_NONE) || (file_space_sel_type == H5S_SEL_ALL) ||
+ (file_space_sel_type == H5S_SEL_POINTS) || (file_space_sel_type == H5S_SEL_HYPERSLABS)) {
+ if( H5FD_mpio_setup_flatbuf( file_space_sel_type, &file_flatbuf, &sel_iter, file_space_stype, elmt_size, is_regular ) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "Call to H5FD_mpio_setup_flatbuf failed for FILE")
+ }
+ else {
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "Space selection type not recognized")
+ }
+
+#ifdef onesidedtrace
+//
+// typedef struct H5S_flatbuf_t {
+// hsize_t count; /* number of contiguous blocks */
+// size_t *blocklens; /* array of contiguous block lengths (bytes)*/
+// hsize_t *indices; /*array of byte offsets of each block */
+// hsize_t extent; /* offset range for one instance of this flatbuf */
+// hsize_t size; /* number of bytes of block data */
+// } H5S_flatbuf_t;
+//
+ printf("_______ - file_flatbuf.count = %d, file_flatbuf.extent = %d, file_flatbuf.size = %d\n",file_flatbuf.count,file_flatbuf.extent,file_flatbuf.size);
+ for (int i=0; i<file_flatbuf.count; i++) {
+ printf("_______ - file_flatbuf.blocklens[%d] = %d, file_flatbuf.indices[%d] = %d\n",i,file_flatbuf.blocklens[i],i,file_flatbuf.indices[i]);
+ }
+#endif
+
+ /*
+ * Create flatbuf for MEMORY space selection
+ */
+
+ if(H5S_select_iter_init(&sel_iter, mem_space_stype, elmt_size) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTINIT, FAIL, "unable to initialize selection iterator")
+
+ rc = H5S_mpio_return_space_extent_and_select_type(mem_space_stype, &is_permuted, &is_regular, &mem_space_extent_type, &mem_space_sel_type);
+
+ if(is_permuted)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "permuted space selections not supported")
+
+ /* Currently, mem_space_extent_type must be H5S_NULL, H5S_SCALAR, or H5S_SIMPLE */
+ if (!((mem_space_extent_type == H5S_NULL) || (mem_space_extent_type == H5S_SCALAR) || (mem_space_extent_type == H5S_SIMPLE)))
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "space extent type invalid")
+
+ if ((mem_space_sel_type == H5S_SEL_NONE) || (mem_space_sel_type == H5S_SEL_ALL) ||
+ (mem_space_sel_type == H5S_SEL_POINTS) || (mem_space_sel_type == H5S_SEL_HYPERSLABS)) {
+ if( H5FD_mpio_setup_flatbuf( mem_space_sel_type, &mem_flatbuf, &sel_iter, mem_space_stype, elmt_size, is_regular ) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "Call to H5FD_mpio_setup_flatbuf failed for MEM")
+ }
+ else {
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "Space selection type not recognized")
+ }
+
+#ifdef onesidedtrace
+ printf("_______ - mem_flatbuf.count = %d, mem_flatbuf.extent = %d, mem_flatbuf.size = %d\n",mem_flatbuf.count,mem_flatbuf.extent,mem_flatbuf.size);
+ for (int i=0; i<mem_flatbuf.count; i++) {
+ printf("_______ - mem_flatbuf.blocklens[%d] = %d, mem_flatbuf.indices[%d] = %d\n",i,mem_flatbuf.blocklens[i],i,mem_flatbuf.indices[i]);
+ }
+#endif
+
+ if(H5S_SELECT_ITER_RELEASE(&sel_iter) < 0)
+ HDONE_ERROR(H5E_DATASPACE, H5E_CANTRELEASE, FAIL, "unable to release selection iterator")
+
+ /* Obtain the data transfer properties */
+ if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id)))
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
+
+ /* get the transfer mode from the dxpl */
+ if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode)<0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
+
+ /*
+ * If using collective I/O, call the custom aggregation algorithm here.
+ */
+ if(xfer_mode == H5FD_MPIO_COLLECTIVE) {
+
+ int error_code;
+ H5FD_mpio_ccio_write_one_sided((CustomAgg_FH_Data)&(file->custom_agg_data), buf, mpi_off, &mem_flatbuf, &file_flatbuf, &error_code);
+ if (file_flatbuf.indices) H5MM_free(file_flatbuf.indices);
+ if (file_flatbuf.blocklens) H5MM_free(file_flatbuf.blocklens);
+ if (mem_flatbuf.indices) H5MM_free(mem_flatbuf.indices);
+ if (mem_flatbuf.blocklens) H5MM_free(mem_flatbuf.blocklens);
+
+ }
+ else {
+ /*
+ * Not collective IO, just do MPI_File_write_at - don't support this for now
+ */
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "H5FD_MPIO_COLLECTIVE xfer mode required for custom aggregation")
+ }
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_custom_read
+ *
+ * Purpose: Reads data from a file flatbuf into a memory flatbuf. The memory
+ * and file flatbuf structures are defined using the H5S_<*>_get_seq_list
+ * functions, where <*> depends on the type of selection: all, points,
+ * hyperslab, or none.
+ * Note that this function is called from H5FD_select_read(), and is
+ * used to call optimized "read" routines defined in the "custom-collective
+ * IO virtual file layer" (CCIO) of the MPIO VFD (see the CCIO routines later in this file).
+ *
+ * Return:
+ *
+ * Programmer: Rick Zamora, 2018-07-10
+ *
+ *
+ * Modifications:
+ * Rick Zamora, 2018-11-06
+ * cleanup and refactoring.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t H5FD_mpio_custom_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id,
+ hid_t file_space, hid_t mem_space, size_t elmt_size, haddr_t addr, void *buf)
+{
+
+ H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
+ MPI_Offset mpi_off;
+ MPI_Status mpi_stat; /* Status from I/O operation */
+ H5S_t *file_space_stype;
+ int file_space_ref_count;
+ H5S_t *mem_space_stype;
+ int mem_space_ref_count;
+ int size_i;
+ H5P_genplist_t *plist = NULL; /* Property list pointer */
+ H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
+ herr_t ret_value = SUCCEED;
+ H5S_flatbuf_t file_flatbuf;
+ H5S_flatbuf_t mem_flatbuf;
+ hbool_t is_permuted = FALSE;
+ hbool_t is_regular = TRUE;
+ H5S_sel_iter_t sel_iter;
+ H5S_class_t file_space_extent_type;
+ H5S_class_t mem_space_extent_type;
+ H5S_sel_type file_space_sel_type;
+ H5S_sel_type mem_space_sel_type;
+ herr_t rc = 0;
+ hsize_t *permute_map = NULL;
+
+ /* Note: permute_map array holds the mapping from the old (out-of-order)
+ * displacements to the in-order displacements of the H5S_flatbuf_t of the
+ * point selection of the file space.
+ */
+
+ FUNC_ENTER_NOAPI_NOINIT
+
+ /* File and memory space setup */
+ file_space_stype = (H5S_t *) H5I_remove(file_space);
+ file_space_ref_count = H5I_dec_ref(file_space);
+ mem_space_stype = (H5S_t *) H5I_remove(mem_space);
+ mem_space_ref_count = H5I_dec_ref(mem_space);
+
+ /* some numeric conversions */
+ if(H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
+
+ size_i = (int)elmt_size;
+ if((hsize_t)size_i != elmt_size)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from elmt_size to size_i")
+
+ HDassert(file);
+ HDassert(H5FD_MPIO==file->pub.driver_id);
+
+ /* Make certain we have the correct type of property list */
+ HDassert(H5I_GENPROP_LST==H5I_get_type(dxpl_id));
+ HDassert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER));
+ HDassert(buf);
+
+ /* Portably initialize MPI status variable */
+ HDmemset(&mpi_stat, 0, sizeof(MPI_Status));
+
+ /*
+ * Create flatbuf for FILE space selection
+ */
+
+ if(H5S_select_iter_init(&sel_iter, file_space_stype, elmt_size) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTINIT, FAIL, "unable to initialize selection iterator")
+
+ rc = H5S_mpio_return_space_extent_and_select_type(file_space_stype, &is_permuted, &is_regular, &file_space_extent_type, &file_space_sel_type);
+
+ if(is_permuted)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "permuted space selections not supported")
+
+ /* Currently, file_space_extent_type must be H5S_NULL, H5S_SCALAR, or H5S_SIMPLE */
+ if (!((file_space_extent_type == H5S_NULL) || (file_space_extent_type == H5S_SCALAR) || (file_space_extent_type == H5S_SIMPLE)))
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "file space extent type invalid")
+
+ if(H5S_SELECT_ITER_RELEASE(&sel_iter) < 0)
+ HDONE_ERROR(H5E_DATASPACE, H5E_CANTRELEASE, FAIL, "unable to release selection iterator")
+
+ if ((file_space_sel_type == H5S_SEL_NONE) || (file_space_sel_type == H5S_SEL_ALL) ||
+ (file_space_sel_type == H5S_SEL_POINTS) || (file_space_sel_type == H5S_SEL_HYPERSLABS)) {
+ if( H5FD_mpio_setup_flatbuf( file_space_sel_type, &file_flatbuf, &sel_iter, file_space_stype, elmt_size, is_regular ) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "Call to H5FD_mpio_setup_flatbuf failed for FILE")
+ }
+ else {
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "Space selection type not recognized")
+ }
+
+ /*
+ * Create flatbuf for MEMORY space selection
+ */
+
+ if(H5S_select_iter_init(&sel_iter, mem_space_stype, elmt_size) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTINIT, FAIL, "unable to initialize selection iterator")
+
+ rc = H5S_mpio_return_space_extent_and_select_type(mem_space_stype, &is_permuted, &is_regular, &mem_space_extent_type, &mem_space_sel_type);
+
+ if(is_permuted)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "permuted space selections not supported")
+
+ /* Currently, mem_space_extent_type must be H5S_NULL, H5S_SCALAR, or H5S_SIMPLE */
+ if (!((mem_space_extent_type == H5S_NULL) || (mem_space_extent_type == H5S_SCALAR) || (mem_space_extent_type == H5S_SIMPLE)))
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "space extent type invalid")
+
+ if ((mem_space_sel_type == H5S_SEL_NONE) || (mem_space_sel_type == H5S_SEL_ALL) ||
+ (mem_space_sel_type == H5S_SEL_POINTS) || (mem_space_sel_type == H5S_SEL_HYPERSLABS)) {
+ if( H5FD_mpio_setup_flatbuf( mem_space_sel_type, &mem_flatbuf, &sel_iter, mem_space_stype, elmt_size, is_regular ) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "Call to H5FD_mpio_setup_flatbuf failed for MEM")
+ }
+ else {
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "Space selection type not recognized")
+ }
+
+ if(H5S_SELECT_ITER_RELEASE(&sel_iter) < 0)
+ HDONE_ERROR(H5E_DATASPACE, H5E_CANTRELEASE, FAIL, "unable to release selection iterator")
+
+ /* Obtain the data transfer properties */
+ if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id)))
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
+
+ /* get the transfer mode from the dxpl */
+ if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode)<0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
+
+ /*
+ * If using collective I/O, call the custom aggregation algorithm here.
+ */
+ if(xfer_mode == H5FD_MPIO_COLLECTIVE) {
+
+ int error_code;
+ H5FD_mpio_ccio_read_one_sided((CustomAgg_FH_Data)&(file->custom_agg_data), buf, mpi_off, &mem_flatbuf, &file_flatbuf, &error_code);
+ if (file_flatbuf.indices) H5MM_free(file_flatbuf.indices);
+ if (file_flatbuf.blocklens) H5MM_free(file_flatbuf.blocklens);
+ if (mem_flatbuf.indices) H5MM_free(mem_flatbuf.indices);
+ if (mem_flatbuf.blocklens) H5MM_free(mem_flatbuf.blocklens);
+
+ }
+ else {
+ /*
+ * Not collective IO, just do MPI_File_read_at - don't support this for now
+ */
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "H5FD_MPIO_COLLECTIVE xfer mode required for custom aggregation")
+ }
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+}
+
/*-------------------------------------------------------------------------
* Function: H5FD_mpio_flush
*
@@ -1908,20 +2699,19 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpio_flush() */
-
/*-------------------------------------------------------------------------
* Function: H5FD_mpio_truncate
*
* Purpose: Make certain the file's size matches it's allocated size
*
- * This is a little sticky in the mpio case, as it is not
+ * This is a little sticky in the mpio case, as it is not
* easy for us to track the current EOF by extracting it from
- * write calls.
+ * write calls.
*
* Instead, we first check to see if the eoa has changed since
- * the last call to this function. If it has, we call
- * MPI_File_get_size() to determine the current EOF, and
- * only call MPI_File_set_size() if this value disagrees
+ * the last call to this function. If it has, we call
+ * MPI_File_get_size() to determine the current EOF, and
+ * only call MPI_File_set_size() if this value disagrees
* with the current eoa.
*
* Return: Success: Non-negative
@@ -1932,11 +2722,11 @@ done:
*
* Changes: Heavily reworked to avoid unnecessary MPI_File_set_size()
* calls. The hope is that these calls are superfluous in the
- * typical case, allowing us to avoid truncates most of the
+ * typical case, allowing us to avoid truncates most of the
* time.
*
- * The basic idea is to query the file system to get the
- * current eof, and only truncate if the file systems
+ * The basic idea is to query the file system to get the
+ * current eof, and only truncate if the file systems
* conception of the eof disagrees with our eoa.
*
* JRM -- 10/27/17
@@ -1963,13 +2753,13 @@ H5FD_mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_
MPI_Offset size;
MPI_Offset needed_eof;
- /* In principle, it is possible for the size returned by the
- * call to MPI_File_get_size() to depend on whether writes from
+ /* In principle, it is possible for the size returned by the
+ * call to MPI_File_get_size() to depend on whether writes from
* all proceeses have completed at the time process 0 makes the
- * call.
+ * call.
*
* In practice, most (all?) truncate calls will come after a barrier
- * and with no interviening writes to the file (with the possible
+ * and with no interviening writes to the file (with the possible
* exception of sueprblock / superblock extension message updates).
*
* Check the "MPI file closing" flag in the API context to determine
@@ -2000,13 +2790,13 @@ H5FD_mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_
if(MPI_SUCCESS != (mpi_code = MPI_File_set_size(file->f, needed_eof)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_size failed", mpi_code)
- /* In general, we must wait until all processes have finished
- * the truncate before any process can continue, since it is
- * possible that a process would write at the end of the
+ /* In general, we must wait until all processes have finished
+ * the truncate before any process can continue, since it is
+ * possible that a process would write at the end of the
* file, and this write would be discarded by the truncate.
*
- * While this is an issue for a user initiated flush, it may
- * not be an issue at file close. If so, we may be able to
+ * While this is an issue for a user initiated flush, it may
+ * not be an issue at file close. If so, we may be able to
* optimize out the following barrier in that case.
*/
if(MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm)))
@@ -2114,5 +2904,4278 @@ H5FD_mpio_communicator(const H5FD_t *_file)
FUNC_LEAVE_NOAPI(file->comm)
} /* end H5FD_mpio_communicator() */
-#endif /* H5_HAVE_PARALLEL */
+/*-------------------------------------------------------------------------
+ * Function: HDF5_ccio_win_setup
+ *
+ * Purpose: Function to setup one-sided communication structures.
+ *
+ * Return: MPI_SUCCESS on success.
+ *
+ * Note: This function must be called in mpio_open
+ *
+ *-------------------------------------------------------------------------
+ */
+int HDF5_ccio_win_setup(CustomAgg_FH_Data ca_data, int procs) {
+
+ int ret = MPI_SUCCESS;
+ ret = MPI_Win_create(ca_data->io_buf,ca_data->cb_buffer_size,1,MPI_INFO_NULL,ca_data->comm, &(ca_data->io_buf_window));
+#ifdef onesidedtrace
+ printf("CREATING ca_data->io_buf_window %016lx - ret = %d.\n",ca_data->io_buf_window,ret);
+#endif
+ if (ret != MPI_SUCCESS) goto fn_exit;
+ ca_data->io_buf_put_amounts = 0;
+ ret =MPI_Win_create(&(ca_data->io_buf_put_amounts),sizeof(int),sizeof(int),MPI_INFO_NULL,ca_data->comm, &(ca_data->io_buf_put_amounts_window));
+
+ if (ca_data->async_io_outer == 1) {
+ ret = MPI_Win_create(ca_data->io_buf_d,ca_data->cb_buffer_size,1,MPI_INFO_NULL,ca_data->comm, &(ca_data->io_buf_window_d));
+ if (ret != MPI_SUCCESS) goto fn_exit;
+ ca_data->io_buf_put_amounts_d = 0;
+ ret = MPI_Win_create(&(ca_data->io_buf_put_amounts_d),sizeof(int),sizeof(int),MPI_INFO_NULL,ca_data->comm, &(ca_data->io_buf_put_amounts_window_d));
+ }
+
+fn_exit:
+ return ret;
+}
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_ccio_setup
+ *
+ * Purpose: Checks if CCIO VFD options are desired, and populates
+ * necessary data structures.
+ *
+ * Return: Success:
+ * Failure: NULL
+ *
+ * Programmer: Paul Coffman & Rick Zamora
+ * June 13, 2018
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t H5FD_mpio_ccio_setup(const char *name, H5FD_mpio_t *file, MPI_File fh)
+{
+ char *do_custom_agg_rd = HDgetenv("HDF5_CCIO_RD");
+ char *do_custom_agg_wr = HDgetenv("HDF5_CCIO_WR");
+ char *cb_buffer_size = HDgetenv("HDF5_CCIO_CB_SIZE");
+ char *cb_nodes = HDgetenv("HDF5_CCIO_CB_NODES");
+ char *fs_block_size = HDgetenv("HDF5_CCIO_FS_BLOCK_SIZE");
+ char *fs_block_count = HDgetenv("HDF5_CCIO_FS_BLOCK_COUNT");
+ char *custom_agg_debug_str = HDgetenv("HDF5_CCIO_DEBUG");
+ char *ccio_wr_method = HDgetenv("HDF5_CCIO_WR_METHOD");
+ char *ccio_rd_method = HDgetenv("HDF5_CCIO_RD_METHOD");
+ char *do_async_io = HDgetenv("HDF5_CCIO_ASYNC");
+ char *set_cb_nodes_stride = HDgetenv("HDF5_CCIO_CB_STRIDE");
+ char *use_file_system = HDgetenv("HDF5_CCIO_FS");
+ char *do_topo_select = HDgetenv("HDF5_CCIO_TOPO_CB_SELECT");
+ char *set_ppn = HDgetenv("HDF5_CCIO_TOPO_PPN");
+ char *set_pps = HDgetenv("HDF5_CCIO_TOPO_PPS");
+ char *use_fd_agg = HDgetenv("HDF5_CCIO_FD_AGG");
+ int custom_agg_debug = 0;
+ int mpi_rank = file->mpi_rank; /* MPI rank of this process */
+ int mpi_size = file->mpi_size; /* Total number of MPI processes */
+ int i, rc;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI_NOINIT
+ {}
+
+ if (custom_agg_debug_str && (strcmp(custom_agg_debug_str,"yes") == 0))
+ custom_agg_debug = 1;
+
+ if (use_fd_agg && (strcmp(use_fd_agg,"yes") == 0))
+ file->custom_agg_data.fslayout = GPFS;
+ else
+ file->custom_agg_data.fslayout = LUSTRE;
+
+ /* Set some defaults for the one-sided agg algorithm */
+ file->custom_agg_data.ccio_read = 0;
+ file->custom_agg_data.ccio_write = 0;
+ file->custom_agg_data.cb_nodes = 1;
+ file->custom_agg_data.ppn = 0;
+ file->custom_agg_data.pps = 0;
+ file->custom_agg_data.cb_buffer_size = 1048576;
+ file->custom_agg_data.fs_block_count = 1;
+ file->custom_agg_data.fs_block_size = 1048576;
+ file->custom_agg_data.onesided_always_rmw = 0;
+ file->custom_agg_data.onesided_no_rmw = 1;
+ file->custom_agg_data.onesided_inform_rmw = 0;
+ file->custom_agg_data.onesided_write_aggmethod = 1;
+ file->custom_agg_data.onesided_read_aggmethod = 1;
+ file->custom_agg_data.topo_cb_select = DEFAULT;
+ file->custom_agg_data.ranklist_populated = 0;
+
+ if (do_custom_agg_wr && (strcmp(do_custom_agg_wr,"yes") == 0)) {
+ file->custom_agg_data.ccio_write = 1;
+ }
+ if (do_custom_agg_rd && (strcmp(do_custom_agg_rd,"yes") == 0)) {
+ file->custom_agg_data.ccio_read = 1;
+ }
+
+    /* Check if we are using CCIO options */
+ if ( (file->custom_agg_data.ccio_read) || (file->custom_agg_data.ccio_write) ) {
+
+ /* By default, use env variables for agg settings */
+ if ( cb_nodes ) {
+ file->custom_agg_data.cb_nodes = atoi( cb_nodes );
+ }
+ if ( set_ppn ) {
+ file->custom_agg_data.ppn = atoi( set_ppn );
+ }
+ if ( set_pps ) {
+ file->custom_agg_data.pps = atoi( set_pps );
+ }
+ if ( cb_buffer_size ) {
+ file->custom_agg_data.cb_buffer_size = atoi( cb_buffer_size );
+ }
+ if ( fs_block_count ) {
+ file->custom_agg_data.fs_block_count = atoi( fs_block_count );
+ }
+ if ( fs_block_size ) {
+ file->custom_agg_data.fs_block_size = atoi( fs_block_size );
+ }
+ file->custom_agg_data.comm = file->comm;
+ file->custom_agg_data.fh = fh;
+
+ /* TODO: Can we handle multiple stripes per aggregator?
+ * For now, just pretend like the stripe size is the same as the buffer size...
+ */
+ int stripes_per_cb_buf = file->custom_agg_data.cb_buffer_size / file->custom_agg_data.fs_block_size;
+ if ( stripes_per_cb_buf > 1 ) {
+ file->custom_agg_data.fs_block_size = file->custom_agg_data.cb_buffer_size;
+ file->custom_agg_data.fs_block_count /= stripes_per_cb_buf;
+ }
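+        /* Example (illustrative numbers): cb_buffer_size = 16 MiB and
+         * fs_block_size = 1 MiB gives stripes_per_cb_buf = 16, so the effective
+         * fs_block_size becomes 16 MiB and fs_block_count is divided by 16.
+         */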
+
+ int tot_cb_bufsize = (int)(file->custom_agg_data.cb_buffer_size);
+ file->custom_agg_data.io_buf_put_amounts = 0;
+ file->custom_agg_data.io_buf_window = MPI_WIN_NULL;
+ file->custom_agg_data.io_buf_put_amounts_window = MPI_WIN_NULL;
+
+ /* Determine IF and HOW asynchronous I/O will be performed */
+ file->custom_agg_data.async_io_inner = 0;
+ file->custom_agg_data.async_io_outer = 0;
+ file->custom_agg_data.check_req = 0;
+ file->custom_agg_data.pthread_io = 0;
+ if (do_async_io && (strcmp(do_async_io,"yes") == 0)) {
+ /* Allow 'outer' pipelining if this is LUSTRE-like mapping */
+ if(file->custom_agg_data.fslayout == LUSTRE) {
+ file->custom_agg_data.async_io_outer = 1;
+ file->custom_agg_data.io_buf_d = (char *) H5MM_malloc(tot_cb_bufsize*sizeof(char));
+ }
+ /* Allow 'inner' pipelining if this is GPFS-like mapping */
+ else {
+ file->custom_agg_data.cb_buffer_size *= 2;
+ tot_cb_bufsize = (int)(file->custom_agg_data.cb_buffer_size);
+ file->custom_agg_data.async_io_inner = 1;
+ file->custom_agg_data.pthread_io = 1; /* pthreads needed for current 'inner' approach */
+ }
+ }
+ file->custom_agg_data.io_buf = (char *) H5MM_malloc(tot_cb_bufsize*sizeof(char));
+ file->custom_agg_data.io_buf_put_amounts_d = 0;
+ file->custom_agg_data.io_buf_window_d = MPI_WIN_NULL;
+ file->custom_agg_data.io_buf_put_amounts_window_d = MPI_WIN_NULL;
+ file->custom_agg_data.use_dup = 0;
+
+ if ( ccio_wr_method ) {
+ file->custom_agg_data.onesided_write_aggmethod = atoi( ccio_wr_method );
+ if (file->custom_agg_data.onesided_write_aggmethod < 1)
+ file->custom_agg_data.onesided_write_aggmethod = 1;
+            if (file->custom_agg_data.onesided_write_aggmethod > 2)
+                file->custom_agg_data.onesided_write_aggmethod = 2;
+ }
+ if ( ccio_rd_method ) {
+ file->custom_agg_data.onesided_read_aggmethod = atoi( ccio_rd_method );
+ if (file->custom_agg_data.onesided_read_aggmethod < 1)
+ file->custom_agg_data.onesided_read_aggmethod = 1;
+            if (file->custom_agg_data.onesided_read_aggmethod > 2)
+                file->custom_agg_data.onesided_read_aggmethod = 2;
+ }
+
+ if (custom_agg_debug && (mpi_rank == 0)) {
+ fprintf(stdout,"Custom aggregation info on mpio_open: MPI_MAX_INFO_VAL is %d H5FD_mpio_open fh is %016lx cb_buffer_size is %d cb_nodes is %d fs_block_count is %d fs_block_size is %d\n",MPI_MAX_INFO_VAL,fh,file->custom_agg_data.cb_buffer_size,file->custom_agg_data.cb_nodes,file->custom_agg_data.fs_block_count,file->custom_agg_data.fs_block_size);
+ fflush(stdout);
+ }
+
+ /* Generate the initial ranklist using a constant stride between ranks */
+ file->custom_agg_data.ranklist = (int *) H5MM_malloc(mpi_size * sizeof(int));
+ for (i=0;i<mpi_size;i++)
+ file->custom_agg_data.ranklist[i] = i;
+ int cb_nodes_stride = mpi_size / file->custom_agg_data.cb_nodes;
+
+ /* If HDF5_CCIO_CB_STRIDE is set to a reasonable value, use it */
+ if (set_cb_nodes_stride) {
+ int set_stride_val = atoi( set_cb_nodes_stride );
+ if ((set_stride_val > 0) && (set_stride_val <= cb_nodes_stride)) {
+ cb_nodes_stride = set_stride_val;
+ }
+ }
+ for (i=0;i<(file->custom_agg_data.cb_nodes);i++) {
+ file->custom_agg_data.ranklist[i] = i*cb_nodes_stride;
+ }
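+        /* Example (illustrative): with mpi_size = 16 and cb_nodes = 4 the
+         * default stride is 4, so the aggregator ranklist begins {0, 4, 8, 12}.
+         */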
+
+        /*
+         * Check the HDF5_CCIO_TOPO_CB_SELECT env variable and use the string
+         * value to set the AGGSelect field in custom_agg_data.
+         */
+ if (do_topo_select) {
+ if (strcmp(do_topo_select,"data") == 0) {
+ file->custom_agg_data.topo_cb_select = DATA;
+ } else if (strcmp(do_topo_select,"spread") == 0) {
+ file->custom_agg_data.topo_cb_select = SPREAD;
+ } else if (strcmp(do_topo_select,"strided") == 0) {
+ /* Stride not really supported through topology API,
+ * Just use the strided rank list created above.
+ */
+ file->custom_agg_data.topo_cb_select = DEFAULT;
+ } else if (strcmp(do_topo_select,"random") == 0) {
+ file->custom_agg_data.topo_cb_select = RANDOM;
+ }
+ }
+
+ /* Show the aggregator ranks if we are in debug mode */
+ if (custom_agg_debug && (mpi_rank == 0)) {
+ fprintf(stdout,"DEBUG: file->custom_agg_data.cb_nodes is now set to %d romio_aggregator_list is:", file->custom_agg_data.cb_nodes);
+ for (i=0;i<file->custom_agg_data.cb_nodes;i++)
+ fprintf(stdout," %d",file->custom_agg_data.ranklist[i]);
+ fprintf(stdout,"\n");
+ fflush(stdout);
+ }
+
+ }
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5FD_mpio_ccio_setup */
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_ccio_cleanup
+ *
+ * Purpose: Cleans data structures used for CCIO VFD options
+ *
+ * Return:      Success:        SUCCEED
+ *              Failure:        FAIL
+ *
+ * Programmer: Rick Zamora
+ * October 25, 2018
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t H5FD_mpio_ccio_cleanup(const H5FD_mpio_t *file)
+{
+ herr_t ret_value = SUCCEED;
+ FUNC_ENTER_NOAPI_NOINIT
+ {}
+
+ /*
+ * If doing custom aggregation, clean it up.
+ */
+ char *do_custom_agg_wr = HDgetenv("HDF5_CCIO_WR");
+ char *do_custom_agg_rd = HDgetenv("HDF5_CCIO_RD");
+ if ( (do_custom_agg_wr && (strcmp(do_custom_agg_wr,"yes") == 0)) ||
+ (do_custom_agg_rd && (strcmp(do_custom_agg_rd,"yes") == 0)) ) {
+
+ CustomAgg_FH_Data ca_data = (CustomAgg_FH_Data)&(file->custom_agg_data);
+ if (ca_data->io_buf_window != MPI_WIN_NULL)
+ ret_value = MPI_Win_free(&ca_data->io_buf_window);
+ if (ca_data->io_buf_put_amounts_window != MPI_WIN_NULL)
+ ret_value = MPI_Win_free(&ca_data->io_buf_put_amounts_window);
+ if (ca_data->io_buf_window_d != MPI_WIN_NULL)
+ ret_value = MPI_Win_free(&ca_data->io_buf_window_d);
+ if (ca_data->io_buf_put_amounts_window_d != MPI_WIN_NULL)
+ ret_value = MPI_Win_free(&ca_data->io_buf_put_amounts_window_d);
+
+ H5MM_free(file->custom_agg_data.io_buf);
+ H5MM_free(file->custom_agg_data.ranklist);
+ if(file->custom_agg_data.async_io_outer)
+ H5MM_free(file->custom_agg_data.io_buf_d);
+
+ }
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5FD_mpio_ccio_cleanup */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_calc_offset_list
+ *
+ * Purpose: Function to get the offset list of absolute file offsets
+ * and associated lengths.
+ *
+ * Return: Void.
+ *
+ *-------------------------------------------------------------------------
+ */
+void H5FD_mpio_calc_offset_list(ADIO_Offset_CA memFlatBufSize,
+    H5S_flatbuf_t *fileFlatBuf, MPI_Offset mpi_off,
+    ADIO_Offset_CA **offset_list_ptr, ADIO_Offset_CA **len_list_ptr,
+    ADIO_Offset_CA *start_offset_ptr, ADIO_Offset_CA *end_offset_ptr,
+    int *contig_access_count_ptr)
+{
+    int contig_access_count;
+ ADIO_Offset_CA *len_list;
+ ADIO_Offset_CA *offset_list;
+
+ /* For the data in the memFlatBuf for this process, calculate the list of offsets and
+ lengths in the file and determine the start and end offsets using the fileFlatBuf */
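+    /* Worked example (hypothetical numbers): if memFlatBufSize is 150 and the
+     * fileFlatBuf blocks are {100, 100} bytes at indices {0, 4096}, the result
+     * is offset_list = {mpi_off+0, mpi_off+4096}, len_list = {100, 50},
+     * start_offset = mpi_off and end_offset = mpi_off+4145.
+     */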
+
+ if ( !(fileFlatBuf->size) || (memFlatBufSize == 0) ) {
+ *contig_access_count_ptr = 0;
+ *offset_list_ptr = (ADIO_Offset_CA *) H5MM_malloc(sizeof(ADIO_Offset_CA));
+ *len_list_ptr = (ADIO_Offset_CA *) H5MM_malloc(sizeof(ADIO_Offset_CA));
+
+ offset_list = *offset_list_ptr;
+ len_list = *len_list_ptr;
+ offset_list[0] = 0;
+ len_list[0] = 0;
+ *start_offset_ptr = 0;
+ *end_offset_ptr = -1;
+ return;
+ }
+ else {
+ /* first count how many entries we will need to malloc correct amount of memory*/
+ ADIO_Offset_CA bytesRemaining = memFlatBufSize;
+ int fbindex = 0;
+        contig_access_count = 0;
+ while (bytesRemaining > 0) {
+ contig_access_count++;
+ bytesRemaining -= fileFlatBuf->blocklens[fbindex++];
+ }
+#ifdef onesidedtrace
+ printf("memFlatBufSize is %ld contig_access_count is %d\n",memFlatBufSize,contig_access_count);
+ fflush(stdout);
+#endif
+ *offset_list_ptr = (ADIO_Offset_CA *) H5MM_malloc(contig_access_count*sizeof(ADIO_Offset_CA));
+ *len_list_ptr = (ADIO_Offset_CA *) H5MM_malloc(contig_access_count*sizeof(ADIO_Offset_CA));
+ offset_list = *offset_list_ptr;
+ len_list = *len_list_ptr;
+
+ /* now set the offset and len list */
+ bytesRemaining = memFlatBufSize;
+ fbindex = 0;
+ int offlenindex = 0;
+ while (bytesRemaining > 0) {
+ if (fileFlatBuf->blocklens[fbindex] <= bytesRemaining) {
+ offset_list[offlenindex] = fileFlatBuf->indices[fbindex] + mpi_off;
+ len_list[offlenindex] = fileFlatBuf->blocklens[fbindex];
+ }
+ else {
+ offset_list[offlenindex] = fileFlatBuf->indices[fbindex] + mpi_off;
+ len_list[offlenindex] = bytesRemaining;
+ }
+ bytesRemaining -= fileFlatBuf->blocklens[fbindex];
+ fbindex++;
+ offlenindex++;
+
+ }
+ *contig_access_count_ptr = contig_access_count;
+ *start_offset_ptr = offset_list[0];
+ *end_offset_ptr = offset_list[offlenindex-1] + len_list[offlenindex-1] - (ADIO_Offset_CA)1;
+ }
+} /* H5FD_mpio_calc_offset_list */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_ccio_write_one_sided
+ *
+ * Purpose: Generic One-sided Collective Write Implementation
+ *
+ * Return: Void.
+ *
+ *-------------------------------------------------------------------------
+ */
+void H5FD_mpio_ccio_write_one_sided(CustomAgg_FH_Data ca_data, const void *buf, MPI_Offset mpi_off,
+ H5S_flatbuf_t *memFlatBuf, H5S_flatbuf_t *fileFlatBuf, int *error_code)
+{
+ /*
+ * This function writes a memFlatBuf into a fileFlatBuf
+ */
+
+ int i, nprocs, myrank;
+ int contig_access_count = 0;
+ ADIO_Offset_CA start_offset, end_offset, fd_size, min_st_offset, off;
+ ADIO_Offset_CA *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL, *fd_end = NULL, *end_offsets = NULL;
+ ADIO_Offset_CA *len_list = NULL;
+ int *fs_block_info = NULL;
+ ADIO_Offset_CA **buf_idx = NULL;
+ int old_error, tmp_error;
+ ADIO_Offset_CA *fs_offsets0, *fs_offsets, *count_sizes;
+
+ MPI_Comm_size(ca_data->comm, &nprocs);
+ MPI_Comm_rank(ca_data->comm, &myrank);
+
+#ifdef topo_timing
+ double endTimeTopo = 0.0;
+ double startTimeTopo = 0.0;
+ double endTime = 0.0;
+ double startTime = 0.0;
+ startTime = MPI_Wtime();
+#endif
+
+#ifdef onesidedtrace
+ printf("Rank %d - H5FD_mpio_ccio_write_one_sided - ca_data->cb_nodes is %d\n",myrank,ca_data->cb_nodes);
+ fflush(stdout);
+ // dump the flatbufs
+ printf("Rank %d - memFlatBuf->size is %ld fileFlatBuf->size is %ld mpi_off is %ld\n",myrank,memFlatBuf->size,fileFlatBuf->size,mpi_off);
+ int flatbufCount = memFlatBuf->count;
+ for (i=0;i<flatbufCount;i++) {
+ printf("Rank %d - memFlatBuf->indices[%d] is %ld memFlatBuf->blocklens[%d] is %ld\n",myrank,i,memFlatBuf->indices[i],i,memFlatBuf->blocklens[i]);
+ }
+ flatbufCount = fileFlatBuf->count;
+ for (i=0;i<flatbufCount;i++) {
+ printf("Rank %d - fileFlatBuf->indices[%d] is %ld fileFlatBuf->blocklens[%d] is %ld\n",myrank,i,fileFlatBuf->indices[i],i,fileFlatBuf->blocklens[i]);
+ }
+ fflush(stdout);
+#endif
+
+ /* For this process's request, calculate the list of offsets and
+ * lengths in the file and determine the start and end offsets.
+ * Note: end_offset points to the last byte-offset that will be accessed.
+ * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99
+ */
+
+ H5FD_mpio_calc_offset_list((ADIO_Offset_CA)memFlatBuf->size, fileFlatBuf, mpi_off,
+ &offset_list, &len_list, &start_offset, &end_offset, &contig_access_count);
+
+#ifdef onesidedtrace
+ printf("Rank %d - contig_access_count = %d\n",myrank,contig_access_count);
+#endif
+
+    /* Each process communicates its start and end offsets to the other
+     * processes. The result is a pair of arrays (start and end offsets),
+     * ordered by process rank.
+     */
+ st_offsets = (ADIO_Offset_CA *) H5MM_malloc(nprocs * sizeof(ADIO_Offset_CA));
+ end_offsets = (ADIO_Offset_CA *) H5MM_malloc(nprocs * sizeof(ADIO_Offset_CA));
+
+    /* One-sided aggregation also needs the amount of data per rank, because a
+     * 1-byte access and a 0-byte access both have a start/end offset difference
+     * of 0 and cannot otherwise be distinguished.
+     */
+ count_sizes = (ADIO_Offset_CA *) H5MM_malloc(nprocs*sizeof(ADIO_Offset_CA));
+ fs_offsets0 = (ADIO_Offset_CA *) H5MM_malloc(3*nprocs*sizeof(ADIO_Offset_CA));
+ fs_offsets = (ADIO_Offset_CA *) H5MM_malloc(3*nprocs*sizeof(ADIO_Offset_CA));
+ for (i=0; i<nprocs; i++) {
+ fs_offsets0[i*3] = 0;
+ fs_offsets0[i*3+1] = 0;
+ fs_offsets0[i*3+2] = 0;
+ }
+ fs_offsets0[myrank*3] = start_offset;
+ fs_offsets0[myrank*3+1] = end_offset;
+ fs_offsets0[myrank*3+2] = (ADIO_Offset_CA) memFlatBuf->size;
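+    /* Each rank fills in only its own 3-tuple (start, end, size) and leaves the
+     * rest zeroed, so the MPI_MAX allreduce below effectively performs an
+     * allgather of every rank's tuple in a single collective call.
+     */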
+ MPI_Allreduce( fs_offsets0, fs_offsets, nprocs*3, MPI_LONG, MPI_MAX, ca_data->comm );
+ for (i=0; i<nprocs; i++) {
+ st_offsets [i] = fs_offsets[i*3] ;
+ end_offsets[i] = fs_offsets[i*3+1];
+ count_sizes[i] = fs_offsets[i*3+2];
+ }
+
+ H5MM_free( fs_offsets0 );
+ H5MM_free( fs_offsets );
+
+ ADIO_Offset_CA lastFileOffset = 0, firstFileOffset = -1;
+ int currentValidDataIndex = 0;
+ /* Take out the 0-data offsets by shifting the indexes with data to the front
+ * and keeping track of the valid data index for use as the length.
+ */
+ for (i=0; i<nprocs; i++) {
+ if (count_sizes[i] > 0) {
+ st_offsets[currentValidDataIndex] = st_offsets[i];
+ end_offsets[currentValidDataIndex] = end_offsets[i];
+
+ lastFileOffset = MAX(lastFileOffset,end_offsets[currentValidDataIndex]);
+ if (firstFileOffset == -1)
+ firstFileOffset = st_offsets[currentValidDataIndex];
+ else
+ firstFileOffset = MIN(firstFileOffset,st_offsets[currentValidDataIndex]);
+
+ currentValidDataIndex++;
+ }
+ }
+#ifdef onesidedtrace
+ printf("Rank %d - H5FD_mpio_calc_offset_list results:\n",myrank);
+ for (i=0;i<contig_access_count;i++) {
+ printf("Rank %d - offset_list[%d] is %ld len_list[%d] is %ld start_offset is %ld end_offset is %ld firstFileOffset is %ld lastFileOffset is %ld\n",myrank,i,offset_list[i],i,len_list[i],start_offset,end_offset,firstFileOffset,lastFileOffset);
+ }
+#endif
+
+ /* Select Topology-aware list of cb_nodes if desired */
+ if ((ca_data->topo_cb_select != DEFAULT) && (ca_data->ranklist_populated==0)) {
+#ifdef topo_timing
+ startTimeTopo = MPI_Wtime();
+#endif
+
+ topology_aware_ranklist ( fileFlatBuf->blocklens, fileFlatBuf->indices, fileFlatBuf->count, &(ca_data->ranklist[0]), ca_data->cb_buffer_size, ca_data->cb_nodes, ca_data->ppn, ca_data->pps, 0, ca_data->comm, ca_data->topo_cb_select, (int)(ca_data->fslayout == GPFS) );
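+        /* topology_aware_ranklist overwrites ca_data->ranklist in place, using
+         * the file-space layout (blocklens/indices) and the node topology hints
+         * (ppn/pps) to choose the cb_nodes aggregator ranks.
+         */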
+
+ /* Only populating ranklist when necessary */
+ ca_data->ranklist_populated = 1;
+
+#ifdef onesidedtrace
+ if (myrank == 0) {
+ fprintf(stdout,"Topology-aware CB Selection (type %d): ca_data->cb_nodes is %d, and ranklist is:", ca_data->topo_cb_select, ca_data->cb_nodes);
+ for (i=0;i<ca_data->cb_nodes;i++)
+ fprintf(stdout," %d",ca_data->ranklist[i]);
+ fprintf(stdout,"\n");
+ }
+ MPI_Barrier(ca_data->comm);
+#endif
+
+#ifdef topo_timing
+ endTimeTopo = MPI_Wtime();
+#endif
+ }
+
+ /* Use GPFS-like mapping of aggregators to file data */
+ if (ca_data->fslayout == GPFS) {
+
+ calc_file_domains(st_offsets, end_offsets,
+ currentValidDataIndex, ca_data->cb_nodes, &min_st_offset, &fd_start,
+ &fd_end, &fd_size, ca_data->fs_block_size);
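+        /* calc_file_domains is expected to partition the aggregate offset range
+         * into one contiguous file domain per aggregator (fd_start[i]..fd_end[i]),
+         * nominally aligned to fs_block_size.
+         */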
+
+ /*
+ * Pass this datastructure to indicate we are a non-striping filesystem
+ * (by setting stripe size to 0).
+ * That is, we are NOT using the LUSTRE approach here...
+ */
+
+ FS_Block_Parms noStripeParms;
+ noStripeParms.stripeSize = 0;
+ noStripeParms.segmentLen = 0;
+ noStripeParms.stripesPerAgg = 0;
+ noStripeParms.segmentIter = 0;
+ noStripeParms.flushCB = 1;
+ noStripeParms.stripedLastFileOffset = 0;
+ noStripeParms.firstStripedIOCall = 0;
+ noStripeParms.lastStripedIOCall = 0;
+ noStripeParms.iWasUsedStripingAgg = 0;
+ noStripeParms.numStripesUsed = 0;
+ noStripeParms.amountOfStripedDataExpected = 0;
+ noStripeParms.bufTypeExtent = 0;
+ noStripeParms.lastDataTypeExtent = 0;
+ noStripeParms.lastFlatBufIndice = 0;
+ noStripeParms.lastIndiceOffset = 0;
+ int holeFound = 0;
+
+ H5FD_mpio_ccio_osagg_write(ca_data, offset_list, len_list, contig_access_count,
+ buf, memFlatBuf, error_code, firstFileOffset, lastFileOffset,
+ currentValidDataIndex, fd_start, fd_end, &holeFound, &noStripeParms);
+
+ int anyHolesFound = 0;
+ if (!(ca_data->onesided_no_rmw))
+ MPI_Allreduce(&holeFound, &anyHolesFound, 1, MPI_INT, MPI_MAX, ca_data->comm);
+ if (anyHolesFound == 0) {
+ H5MM_free(offset_list);
+ H5MM_free(len_list);
+ H5MM_free(st_offsets);
+ H5MM_free(end_offsets);
+ H5MM_free(fd_start);
+ H5MM_free(fd_end);
+ H5MM_free(count_sizes);
+ }
+ else {
+            /* Holes were found in the data and the user has not set
+             * onesided_no_rmw --- set onesided_always_rmw to 1,
+             * re-call H5FD_mpio_ccio_osagg_write, and if the user has
+             * onesided_inform_rmw set then inform them of this condition
+             * and behavior.
+             */
+            if (ca_data->onesided_inform_rmw && (myrank ==0)) {
+                fprintf(stderr,"Information: Holes found during one-sided "
+                        "write aggregation algorithm --- re-running one-sided "
+                        "write aggregation with onesided_always_rmw set to 1.\n");
+            }
+            ca_data->onesided_always_rmw = 1;
+            int prev_onesided_no_rmw = ca_data->onesided_no_rmw;
+            ca_data->onesided_no_rmw = 1;
+            H5FD_mpio_ccio_osagg_write(ca_data, offset_list, len_list, contig_access_count,
+                                        buf, memFlatBuf, error_code, firstFileOffset, lastFileOffset,
+                                        currentValidDataIndex, fd_start, fd_end, &holeFound, &noStripeParms);
+            ca_data->onesided_no_rmw = prev_onesided_no_rmw;
+            H5MM_free(offset_list);
+            H5MM_free(len_list);
+            H5MM_free(st_offsets);
+            H5MM_free(end_offsets);
+            H5MM_free(fd_start);
+            H5MM_free(fd_end);
+            H5MM_free(count_sizes);
+ }
+
+ }
+ /* Use LUSTRE-like mapping of aggregators to file data */
+ else {
+
+ /* Rewriting the ca_data as 'fs_block_info' (probably NOT necessary) */
+ fs_block_info = (int *) H5MM_malloc(3 * sizeof(int));
+ fs_block_info[0] = ca_data->fs_block_size;
+ fs_block_info[1] = ca_data->fs_block_count;
+ fs_block_info[2] = ca_data->cb_nodes;
+#ifdef onesidedtrace
+ printf("Rank %d - ca_data->cb_buffer_size is %lu fs_block_info[0] is %d fs_block_info[1] is %d fs_block_info[2] is %d\n",myrank,ca_data->cb_buffer_size,fs_block_info[0],fs_block_info[1],fs_block_info[2]);
+ fflush(stdout);
+#endif
+
+ /* Async I/O - Make sure we are starting with the main buffer */
+ ca_data->use_dup = 0;
+
+ /* Iterate over 1+ aggregation rounds and write to FS when buffers are full */
+ H5FD_mpio_ccio_iterate_write(ca_data, buf, fs_block_info, offset_list, len_list, mpi_off, contig_access_count, currentValidDataIndex, start_offset, end_offset, firstFileOffset, lastFileOffset, memFlatBuf, fileFlatBuf, myrank, error_code);
+
+ /* Async I/O - Wait for any outstanding requests (we are done with this I/O call) */
+ ca_data->use_dup = 0;
+ if (ca_data->check_req == 1) {
+ MPIO_Wait(&ca_data->io_Request, error_code);
+ ca_data->check_req = 0;
+ }
+
+ H5MM_free(offset_list);
+ H5MM_free(len_list);
+ H5MM_free(st_offsets);
+ H5MM_free(end_offsets);
+ H5MM_free(count_sizes);
+ H5MM_free(fs_block_info);
+
+ }
+
+#ifdef topo_timing
+ endTime = MPI_Wtime();
+ double max_frac;
+ double l_frac = (endTimeTopo - startTimeTopo) / (endTime - startTime);
+ MPI_Allreduce ( &l_frac, &max_frac, 1, MPI_DOUBLE, MPI_MAX, ca_data->comm );
+ if ((myrank == 0)&& (ca_data->topo_cb_select != DEFAULT)) {
+ printf("WRITE: Aggregator Selection Fraction = %f\n", max_frac);
+ fflush(stdout);
+ }
+ MPI_Barrier(ca_data->comm);
+#endif
+
+} /* H5FD_mpio_ccio_write_one_sided */
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_ccio_read_one_sided
+ *
+ * Purpose: Generic One-sided Collective Read Implementation
+ *
+ * Return: Void.
+ *
+ *-------------------------------------------------------------------------
+ */
+void H5FD_mpio_ccio_read_one_sided(CustomAgg_FH_Data ca_data, void *buf, MPI_Offset mpi_off,
+ H5S_flatbuf_t *memFlatBuf, H5S_flatbuf_t *fileFlatBuf,
+ int *error_code)
+{
+ /*
+ * This function reads a fileFlatBuf into a memFlatBuf
+ */
+
+ int i, ii, nprocs, nprocs_for_coll, myrank;
+ int contig_access_count=0;
+ ADIO_Offset_CA start_offset, end_offset, fd_size, min_st_offset, off;
+ ADIO_Offset_CA *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
+ *fd_end = NULL, *end_offsets = NULL;
+ ADIO_Offset_CA *len_list = NULL;
+ ADIO_Offset_CA *fs_offsets0 = NULL, *fs_offsets = NULL;
+ ADIO_Offset_CA *count_sizes;
+ int *fs_block_info = NULL;
+
+ MPI_Comm_size(ca_data->comm, &nprocs);
+ MPI_Comm_rank(ca_data->comm, &myrank);
+
+#ifdef topo_timing
+ double endTimeTopo = 0.0;
+ double startTimeTopo = 0.0;
+ double endTime = 0.0;
+ double startTime = 0.0;
+ startTime = MPI_Wtime();
+#endif
+
+#ifdef onesidedtrace
+ printf("Rank %d - H5FD_mpio_ccio_read_one_sided - ca_data->cb_nodes is %d\n",myrank,ca_data->cb_nodes);
+ fflush(stdout);
+
+ /* dump the flatbufs */
+ printf("Rank %d - memFlatBuf->size is %ld fileFlatBuf->size is %ld mpi_off is %ld\n",myrank,memFlatBuf->size,fileFlatBuf->size,mpi_off);
+ int flatbufCount = memFlatBuf->count;
+ for (i=0;i<flatbufCount;i++) {
+ printf("Rank %d - memFlatBuf->indices[%d] is %ld memFlatBuf->blocklens[%d] is %ld\n",myrank,i,memFlatBuf->indices[i],i,memFlatBuf->blocklens[i]);
+ }
+ flatbufCount = fileFlatBuf->count;
+ for (i=0;i<flatbufCount;i++) {
+ printf("Rank %d - fileFlatBuf->indices[%d] is %ld fileFlatBuf->blocklens[%d] is %ld\n",myrank,i,fileFlatBuf->indices[i],i,fileFlatBuf->blocklens[i]);
+ }
+ fflush(stdout);
+#endif
+
+ /* For this process's request, calculate the list of offsets and
+ * lengths in the file and determine the start and end offsets.
+ * Note: end_offset points to the last byte-offset that will be accessed.
+ * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99
+ */
+
+ H5FD_mpio_calc_offset_list((ADIO_Offset_CA)memFlatBuf->size, fileFlatBuf, mpi_off,
+ &offset_list, &len_list, &start_offset, &end_offset, &contig_access_count);
+
+#ifdef onesidedtrace
+ printf("Rank %d - contig_access_count = %d\n",myrank,contig_access_count);
+#endif
+
+    /* Each process communicates its start and end offsets to the other
+     * processes. The result is a pair of arrays (start and end offsets),
+     * ordered by process rank.
+     */
+ st_offsets = (ADIO_Offset_CA *) H5MM_malloc(nprocs*sizeof(ADIO_Offset_CA));
+ end_offsets = (ADIO_Offset_CA *) H5MM_malloc(nprocs*sizeof(ADIO_Offset_CA));
+
+    /* One-sided aggregation also needs the amount of data per rank, because a
+     * 1-byte access and a 0-byte access both have a start/end offset difference
+     * of 0 and cannot otherwise be distinguished.
+     */
+ count_sizes = (ADIO_Offset_CA *) H5MM_malloc(nprocs*sizeof(ADIO_Offset_CA));
+ fs_offsets0 = (ADIO_Offset_CA *) H5MM_malloc(3*nprocs*sizeof(ADIO_Offset_CA));
+ fs_offsets = (ADIO_Offset_CA *) H5MM_malloc(3*nprocs*sizeof(ADIO_Offset_CA));
+ for (ii=0; ii<nprocs; ii++) {
+ fs_offsets0[ii*3] = 0;
+ fs_offsets0[ii*3+1] = 0;
+ fs_offsets0[ii*3+2] = 0;
+ }
+ fs_offsets0[myrank*3] = start_offset;
+ fs_offsets0[myrank*3+1] = end_offset;
+ fs_offsets0[myrank*3+2] = (ADIO_Offset_CA) memFlatBuf->size;
+ MPI_Allreduce( fs_offsets0, fs_offsets, nprocs*3, MPI_LONG, MPI_MAX, ca_data->comm );
+ for (ii=0; ii<nprocs; ii++) {
+ st_offsets [ii] = fs_offsets[ii*3] ;
+ end_offsets[ii] = fs_offsets[ii*3+1];
+ count_sizes[ii] = fs_offsets[ii*3+2];
+ }
+
+ H5MM_free( fs_offsets0 );
+ H5MM_free( fs_offsets );
+
+ ADIO_Offset_CA lastFileOffset = 0, firstFileOffset = -1;
+ int currentNonZeroDataIndex = 0;
+ /* Take out the 0-data offsets by shifting the indexes with data to the front
+ * and keeping track of the valid data index for use as the length.
+ */
+ for (i=0; i<nprocs; i++) {
+ if (count_sizes[i] > 0) {
+ st_offsets[currentNonZeroDataIndex] = st_offsets[i];
+ end_offsets[currentNonZeroDataIndex] = end_offsets[i];
+ lastFileOffset = MAX(lastFileOffset,end_offsets[currentNonZeroDataIndex]);
+ if (firstFileOffset == -1)
+ firstFileOffset = st_offsets[currentNonZeroDataIndex];
+ else
+ firstFileOffset = MIN(firstFileOffset,st_offsets[currentNonZeroDataIndex]);
+ currentNonZeroDataIndex++;
+ }
+ }
+
+ /* Select Topology-aware list of cb_nodes if desired */
+ if ((ca_data->topo_cb_select != DEFAULT) && (ca_data->ranklist_populated==0)) {
+#ifdef topo_timing
+ startTimeTopo = MPI_Wtime();
+#endif
+
+ topology_aware_ranklist ( fileFlatBuf->blocklens, fileFlatBuf->indices, fileFlatBuf->count, &(ca_data->ranklist[0]), ca_data->cb_buffer_size, ca_data->cb_nodes, ca_data->ppn, ca_data->pps, 0, ca_data->comm, ca_data->topo_cb_select, (int)(ca_data->fslayout == GPFS) );
+
+ /* Only populating ranklist when necessary */
+ ca_data->ranklist_populated = 1;
+
+#ifdef onesidedtrace
+ if (myrank == 0) {
+ fprintf(stdout,"Topology-aware CB Selection: ca_data->cb_nodes is %d, and ranklist is:", ca_data->cb_nodes);
+ for (i=0;i<ca_data->cb_nodes;i++)
+ fprintf(stdout," %d",ca_data->ranklist[i]);
+ fprintf(stdout,"\n");
+ }
+ MPI_Barrier(ca_data->comm);
+#endif
+
+#ifdef topo_timing
+ endTimeTopo = MPI_Wtime();
+#endif
+ }
+
+ /* Use LUSTRE-style data mapping to aggs */
+ if (ca_data->fslayout == LUSTRE) {
+
+ /* Rewriting the ca_data as 'fs_block_info' (probably NOT necessary) */
+ fs_block_info = (int *) H5MM_malloc(3 * sizeof(int));
+ fs_block_info[0] = ca_data->fs_block_size;
+ fs_block_info[1] = ca_data->fs_block_count;
+ fs_block_info[2] = ca_data->cb_nodes;
+#ifdef onesidedtrace
+ printf("Rank %d - ca_data->cb_buffer_size is %lu fs_block_info[0] is %d fs_block_info[1] is %d fs_block_info[2] is %d\n",myrank,ca_data->cb_buffer_size,fs_block_info[0],fs_block_info[1],fs_block_info[2]);
+ fflush(stdout);
+#endif
+
+ /* Async I/O - Make sure we are starting with the main buffer */
+ if (ca_data->check_req == 1) {
+ MPIO_Wait(&ca_data->io_Request, error_code);
+ ca_data->check_req = 0;
+ }
+ if (ca_data->check_req_d == 1) {
+ MPIO_Wait(&ca_data->io_Request_d, error_code);
+ ca_data->check_req_d = 0;
+ }
+ ca_data->use_dup = 0;
+
+ /* Iterate over 1+ aggregation rounds and read to mem when buffers are full */
+ H5FD_mpio_ccio_iterate_read(ca_data, buf, fs_block_info, offset_list, len_list, mpi_off, contig_access_count, currentNonZeroDataIndex, start_offset, end_offset, firstFileOffset, lastFileOffset, memFlatBuf, fileFlatBuf, myrank, error_code);
+
+ /* Async I/O - Wait for any outstanding requests (we are done with this I/O call) */
+ if (ca_data->check_req == 1) {
+ MPIO_Wait(&ca_data->io_Request, error_code);
+ ca_data->check_req = 0;
+ }
+ if (ca_data->check_req_d == 1) {
+ MPIO_Wait(&ca_data->io_Request_d, error_code);
+ ca_data->check_req_d = 0;
+ }
+ ca_data->use_dup = 0;
+
+ H5MM_free(offset_list);
+ H5MM_free(len_list);
+ H5MM_free(st_offsets);
+ H5MM_free(end_offsets);
+ H5MM_free(count_sizes);
+ H5MM_free(fs_block_info);
+
+ }
+ /* Use GPFS-style data mapping to aggs */
+ else {
+
+ calc_file_domains(st_offsets, end_offsets, currentNonZeroDataIndex, ca_data->cb_nodes, &min_st_offset, &fd_start, &fd_end, &fd_size, ca_data->fs_block_size);
+
+        /* Indicate that this is NOT a striped file system. */
+ FS_Block_Parms noStripeParms;
+ noStripeParms.stripeSize = 0;
+ noStripeParms.segmentLen = 0;
+ noStripeParms.stripesPerAgg = 0;
+ noStripeParms.segmentIter = 0;
+ noStripeParms.flushCB = 1;
+ noStripeParms.stripedLastFileOffset = 0;
+ noStripeParms.firstStripedIOCall = 0;
+ noStripeParms.lastStripedIOCall = 0;
+ noStripeParms.iWasUsedStripingAgg = 0;
+ noStripeParms.numStripesUsed = 0;
+ noStripeParms.amountOfStripedDataExpected = 0;
+ noStripeParms.bufTypeExtent = 0;
+ noStripeParms.lastDataTypeExtent = 0;
+ noStripeParms.lastFlatBufIndice = 0;
+ noStripeParms.lastIndiceOffset = 0;
+
+ H5FD_mpio_ccio_osagg_read(ca_data, offset_list, len_list, contig_access_count, buf, memFlatBuf, error_code, firstFileOffset, lastFileOffset, currentNonZeroDataIndex, fd_start, fd_end, &noStripeParms, 1);
+ // last '1' means you SHOULD be reading in H5FD_mpio_ccio_osagg_read.
+
+ H5MM_free(offset_list);
+ H5MM_free(len_list);
+ H5MM_free(st_offsets);
+ H5MM_free(end_offsets);
+ H5MM_free(fd_start);
+ H5MM_free(fd_end);
+ H5MM_free(count_sizes);
+
+ }
+
+#ifdef topo_timing
+ endTime = MPI_Wtime();
+ double max_frac;
+ double l_frac = (endTimeTopo - startTimeTopo)/(endTime - startTime);
+ MPI_Allreduce ( &l_frac, &max_frac, 1, MPI_DOUBLE, MPI_MAX, ca_data->comm );
+ if ((myrank == 0)&& (ca_data->topo_cb_select != DEFAULT)) {
+ printf("READ: Aggregator Selection Fraction = %f\n", max_frac);
+ fflush(stdout);
+ }
+ MPI_Barrier(ca_data->comm);
+#endif
+
+} /* H5FD_mpio_ccio_read_one_sided */
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_ccio_iterate_write
+ *
+ * Purpose: This function calls H5FD_mpio_ccio_osagg_write
+ * iteratively to essentially pack stripes of data into the
+ * collective buffer and then flushes the buffer to the file when
+ * fully packed (repeating this process until all the data is
+ * completely written to the file).
+ *
+ * Return: Void.
+ *
+ *-------------------------------------------------------------------------
+ */
+void H5FD_mpio_ccio_iterate_write(CustomAgg_FH_Data ca_data, const void *buf,
+ int *fs_block_info, ADIO_Offset_CA *offset_list, ADIO_Offset_CA *len_list,
+ MPI_Offset mpi_off, int contig_access_count, int currentValidDataIndex,
+ ADIO_Offset_CA start_offset, ADIO_Offset_CA end_offset,
+ ADIO_Offset_CA firstFileOffset, ADIO_Offset_CA lastFileOffset,
+ H5S_flatbuf_t *memFlatBuf, H5S_flatbuf_t *fileFlatBuf, int myrank, int *error_code)
+{
+
+ int i;
+ int stripesPerAgg = ca_data->cb_buffer_size / fs_block_info[0];
+ int numStripedAggs = ca_data->cb_nodes;
+ if (stripesPerAgg == 0) {
+ /* The striping unit is larger than the collective buffer size
+ * therefore we must abort since the buffer has already been
+ * allocated during the open.
+ */
+ fprintf(stderr,"Error: The collective buffer size %d is less "
+ "than the fs_block_size %d - This collective I/O implementation "
+ "cannot continue.\n",ca_data->cb_buffer_size,fs_block_info[0]);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ MPI_Comm_rank(ca_data->comm, &myrank);
+
+    /* Declare FS_Block_Parms here - these parameters will be locally managed
+     * for this invocation of H5FD_mpio_ccio_iterate_write. This allows for
+     * concurrent one-sided collective writes via multi-threading as well as
+     * multiple communicators.
+     */
+ FS_Block_Parms stripeParms;
+ stripeParms.stripeSize = fs_block_info[0];
+ stripeParms.stripedLastFileOffset = lastFileOffset;
+ stripeParms.iWasUsedStripingAgg = 0;
+ stripeParms.numStripesUsed = 0;
+ stripeParms.amountOfStripedDataExpected = 0;
+ stripeParms.bufTypeExtent = 0;
+ stripeParms.lastDataTypeExtent = 0;
+ stripeParms.lastFlatBufIndice = 0;
+ stripeParms.lastIndiceOffset = 0;
+
+    /* The general algorithm here is to divide the file up into segments, a segment
+ * being defined as a contiguous region of the file which has up to one occurrence
+ * of each stripe - the data for each stripe being written out by a particular
+ * aggregator. The segmentLen is the maximum size in bytes of each segment
+ * (stripeSize*number of aggs). Iteratively call H5FD_mpio_ccio_osagg_write
+ * for each segment to aggregate the data to the collective buffers, but only do
+ * the actual write (via flushCB stripe parm) once stripesPerAgg stripes
+ * have been packed or the aggregation for all the data is complete, minimizing
+ * synchronization.
+ */
+ stripeParms.segmentLen = ((ADIO_Offset_CA)numStripedAggs)*((ADIO_Offset_CA)(fs_block_info[0]));
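+    /* Example (illustrative): with 4 aggregators and a 1 MiB stripe, each
+     * segment covers 4 MiB of file space; with a 16 MiB collective buffer
+     * (stripesPerAgg = 16) the aggregators flush to the file system once every
+     * 16 segments.
+     */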
+
+    /* These arrays define the file offsets of the stripes for a given segment -
+     * similar to the concept of file domains in GPFS, essentially file domains
+     * for the segment.
+     */
+ ADIO_Offset_CA *segment_stripe_start = (ADIO_Offset_CA *) H5MM_malloc(numStripedAggs*sizeof(ADIO_Offset_CA));
+ ADIO_Offset_CA *segment_stripe_end = (ADIO_Offset_CA *) H5MM_malloc(numStripedAggs*sizeof(ADIO_Offset_CA));
+
+ /* Find the actual range of stripes in the file that have data in the offset
+ * ranges being written -- skip holes at the front and back of the file.
+ */
+ int currentOffsetListIndex = 0;
+ int fileSegmentIter = 0;
+ int startingStripeWithData = 0;
+ int foundStartingStripeWithData = 0;
+ while (!foundStartingStripeWithData) {
+ if ( ((startingStripeWithData+1) * (ADIO_Offset_CA)(fs_block_info[0])) > firstFileOffset)
+ foundStartingStripeWithData = 1;
+ else
+ startingStripeWithData++;
+ }
+
+ ADIO_Offset_CA currentSegementOffset = (ADIO_Offset_CA)startingStripeWithData * (ADIO_Offset_CA)(fs_block_info[0]);
+
+ int numSegments = (int) ((lastFileOffset+(ADIO_Offset_CA)1 - currentSegementOffset)/stripeParms.segmentLen);
+ if ((lastFileOffset+(ADIO_Offset_CA)1 - currentSegementOffset)%stripeParms.segmentLen > 0)
+ numSegments++;
+
+#ifdef onesidedtrace
+ printf("Rank %d - H5FD_mpio_ccio_iterate_write ca_data->cb_nodes is %d numStripedAggs is %d numSegments is %d start_offset is %ld end_offset is %ld firstFileOffset is %ld lastFileOffset is %ld\n",myrank,ca_data->cb_nodes,numStripedAggs,numSegments,start_offset,end_offset,firstFileOffset,lastFileOffset);
+ fflush(stdout);
+#endif
+
+ /* To support read-modify-write use a while-loop to redo the aggregation if necessary
+ * to fill in the holes.
+ */
+ int doAggregation = 1;
+ int holeFound = 0;
+
+    /* Remember the onesided_no_rmw setting in case we have to re-do
+     * the aggregation because holes are found.
+     */
+ int prev_onesided_no_rmw = ca_data->onesided_no_rmw;
+
+ while (doAggregation) {
+
+ int totalDataWrittenLastRound = 0;
+
+ /* This variable tracks how many segment stripes we have packed into the agg
+ * buffers so we know when to flush to the file system.
+ */
+ stripeParms.segmentIter = 0;
+
+ /* stripeParms.stripesPerAgg is the number of stripes to aggregate before doing a flush.
+ */
+ stripeParms.stripesPerAgg = stripesPerAgg;
+ if (stripeParms.stripesPerAgg > numSegments)
+ stripeParms.stripesPerAgg = numSegments;
+
+ for (fileSegmentIter=0;fileSegmentIter < numSegments;fileSegmentIter++) {
+
+ int dataWrittenThisRound = 0;
+
+ /* Define the segment range in terms of file offsets.
+ */
+ ADIO_Offset_CA segmentFirstFileOffset = currentSegementOffset;
+ if ((currentSegementOffset+stripeParms.segmentLen-(ADIO_Offset_CA)1) > lastFileOffset)
+ currentSegementOffset = lastFileOffset;
+ else
+ currentSegementOffset += (stripeParms.segmentLen-(ADIO_Offset_CA)1);
+ ADIO_Offset_CA segmentLastFileOffset = currentSegementOffset;
+ currentSegementOffset++;
+
+ ADIO_Offset_CA segment_stripe_offset = segmentFirstFileOffset;
+ for (i=0;i<numStripedAggs;i++) {
+ if (firstFileOffset > segment_stripe_offset)
+ segment_stripe_start[i] = firstFileOffset;
+ else
+ segment_stripe_start[i] = segment_stripe_offset;
+ if ((segment_stripe_offset + (ADIO_Offset_CA)(fs_block_info[0])) > lastFileOffset)
+ segment_stripe_end[i] = lastFileOffset;
+ else
+ segment_stripe_end[i] = segment_stripe_offset + (ADIO_Offset_CA)(fs_block_info[0]) - (ADIO_Offset_CA)1;
+ segment_stripe_offset += (ADIO_Offset_CA)(fs_block_info[0]);
+ }
+
+ /* In the interest of performance for non-contiguous data with large offset lists
+ * essentially modify the given offset and length list appropriately for this segment
+ * and then pass pointers to the sections of the lists being used for this segment
+ * to H5FD_mpio_ccio_osagg_write. Remember how we have modified the list for this
+ * segment, and then restore it appropriately after processing for this segment has
+ * concluded, so it is ready for the next segment.
+ */
+ int segmentContigAccessCount = 0;
+ int startingOffsetListIndex = -1;
+ int endingOffsetListIndex = -1;
+ ADIO_Offset_CA startingOffsetAdvancement = 0;
+ ADIO_Offset_CA startingLenTrim = 0;
+ ADIO_Offset_CA endingLenTrim = 0;
+
+ while ( ( ( offset_list[currentOffsetListIndex] + ((ADIO_Offset_CA)(len_list[currentOffsetListIndex]))-(ADIO_Offset_CA)1 ) < segmentFirstFileOffset)
+ && (currentOffsetListIndex < (contig_access_count-1) ) )
+ currentOffsetListIndex++;
+ startingOffsetListIndex = currentOffsetListIndex;
+ endingOffsetListIndex = currentOffsetListIndex;
+ int offsetInSegment = 0;
+ ADIO_Offset_CA offsetStart = offset_list[currentOffsetListIndex];
+ ADIO_Offset_CA offsetEnd = (offset_list[currentOffsetListIndex] + ((ADIO_Offset_CA)(len_list[currentOffsetListIndex]))-(ADIO_Offset_CA)1);
+
+ if (len_list[currentOffsetListIndex] == 0)
+ offsetInSegment = 0;
+ else if ((offsetStart >= segmentFirstFileOffset) && (offsetStart <= segmentLastFileOffset)) {
+ offsetInSegment = 1;
+ }
+ else if ((offsetEnd >= segmentFirstFileOffset) && (offsetEnd <= segmentLastFileOffset)) {
+ offsetInSegment = 1;
+ }
+ else if ((offsetStart <= segmentFirstFileOffset) && (offsetEnd >= segmentLastFileOffset)) {
+ offsetInSegment = 1;
+ }
+
+ if (!offsetInSegment) {
+ segmentContigAccessCount = 0;
+ }
+ else {
+ /* We are in the segment, advance currentOffsetListIndex until we are out of segment.
+ */
+ segmentContigAccessCount = 1;
+
+ while ((offset_list[currentOffsetListIndex] <= segmentLastFileOffset) && (currentOffsetListIndex < contig_access_count)) {
+ dataWrittenThisRound += (int) len_list[currentOffsetListIndex];
+ currentOffsetListIndex++;
+ }
+
+ if (currentOffsetListIndex > startingOffsetListIndex) {
+                /* If we did advance and we are at the end, check whether we are still in the segment.
+ */
+ if (currentOffsetListIndex == contig_access_count) {
+ currentOffsetListIndex--;
+ }
+ else if (offset_list[currentOffsetListIndex] > segmentLastFileOffset) {
+                    /* We advanced past the last entry that is still in the segment, so step back.
+ */
+ currentOffsetListIndex--;
+ }
+ else {
+ dataWrittenThisRound += (int) len_list[currentOffsetListIndex];
+ }
+ segmentContigAccessCount += (currentOffsetListIndex-startingOffsetListIndex);
+ endingOffsetListIndex = currentOffsetListIndex;
+ }
+ }
+
+ if (segmentContigAccessCount > 0) {
+ /* Trim edges here so all data in the offset list range fits exactly in the segment.
+ */
+ if (offset_list[startingOffsetListIndex] < segmentFirstFileOffset) {
+ startingOffsetAdvancement = segmentFirstFileOffset-offset_list[startingOffsetListIndex];
+ offset_list[startingOffsetListIndex] += startingOffsetAdvancement;
+ dataWrittenThisRound -= (int) startingOffsetAdvancement;
+ startingLenTrim = startingOffsetAdvancement;
+ len_list[startingOffsetListIndex] -= startingLenTrim;
+ }
+
+ if ((offset_list[endingOffsetListIndex] + ((ADIO_Offset_CA)(len_list[endingOffsetListIndex]))-(ADIO_Offset_CA)1) > segmentLastFileOffset) {
+ endingLenTrim = offset_list[endingOffsetListIndex]+ ((ADIO_Offset_CA)(len_list[endingOffsetListIndex]))-(ADIO_Offset_CA)1 - segmentLastFileOffset;
+ len_list[endingOffsetListIndex] -= endingLenTrim;
+ dataWrittenThisRound -= (int) endingLenTrim;
+ }
+ }
+
+ int holeFoundThisRound = 0;
+
+            /* Once we have packed the collective buffers, do the actual write.
+ */
+ if ((stripeParms.segmentIter == (stripeParms.stripesPerAgg-1)) || (fileSegmentIter == (numSegments-1))) {
+ stripeParms.flushCB = 1;
+ }
+ else
+ stripeParms.flushCB = 0;
+
+ stripeParms.firstStripedIOCall = 0;
+ stripeParms.lastStripedIOCall = 0;
+ if (fileSegmentIter == 0) {
+ stripeParms.firstStripedIOCall = 1;
+ }
+ else if (fileSegmentIter == (numSegments-1))
+ stripeParms.lastStripedIOCall = 1;
+
+            /* The difference in calls to H5FD_mpio_ccio_osagg_write is based on whether the buftype is
+ * contiguous. The algorithm tracks the position in the source buffer when called
+ * multiple times -- in the case of contiguous data this is simple and can be externalized with
+ * a buffer offset, in the case of non-contiguous data this is complex and the state must be tracked
+ * internally, therefore no external buffer offset. Care was taken to minimize
+ * H5FD_mpio_ccio_osagg_write changes at the expense of some added complexity to the caller.
+ */
+
+#ifdef onesidedtrace
+ if (myrank == 0) {
+ int j;
+ printf("\n\nRank %d - Segment iteration %d stripeParms.flushCB is %d aggregator placement and assignment over %d aggs is:\n",myrank,fileSegmentIter,stripeParms.flushCB, ca_data->cb_nodes);
+ for (j=0;j<ca_data->cb_nodes;j++)
+ printf("Rank %d - agg rank %d writing to offset range %ld to %ld\n",myrank,ca_data->ranklist[j],segment_stripe_start[j],segment_stripe_end[j]);
+ printf("\n\n");
+ }
+#endif
+
+ if (memFlatBuf->count == 1) {
+ H5FD_mpio_ccio_osagg_write(ca_data,(ADIO_Offset_CA*)&(offset_list[startingOffsetListIndex]), (ADIO_Offset_CA*)&(len_list[startingOffsetListIndex]), segmentContigAccessCount, buf+totalDataWrittenLastRound, memFlatBuf, error_code, segmentFirstFileOffset, segmentLastFileOffset, currentValidDataIndex, segment_stripe_start, segment_stripe_end, 0,&stripeParms);
+ }
+ else {
+ H5FD_mpio_ccio_osagg_write(ca_data,(ADIO_Offset_CA*)&(offset_list[startingOffsetListIndex]), (ADIO_Offset_CA*)&(len_list[startingOffsetListIndex]), segmentContigAccessCount, buf, memFlatBuf, error_code, segmentFirstFileOffset, segmentLastFileOffset, currentValidDataIndex, segment_stripe_start, segment_stripe_end, 0,&stripeParms);
+ }
+
+ /* Async I/O - Switch between buffers */
+ if(ca_data->async_io_outer) {
+ ca_data->use_dup = (ca_data->use_dup + 1) % 2;
+ }
+
+ if (stripeParms.flushCB) {
+ stripeParms.segmentIter = 0;
+ if (stripesPerAgg > (numSegments-fileSegmentIter-1))
+ stripeParms.stripesPerAgg = numSegments-fileSegmentIter-1;
+ else
+ stripeParms.stripesPerAgg = stripesPerAgg;
+ }
+ else
+ stripeParms.segmentIter++;
+
+ if (holeFoundThisRound)
+ holeFound = 1;
+
+            /* A subsequent call to H5FD_mpio_ccio_osagg_write that does a pre-read
+             * already includes a barrier that keeps feeder ranks from doing RMA to
+             * the collective buffer before the write we requested via the
+             * stripeParms.flushCB flag completes.  If we know no such pre-read will
+             * happen, we need to do the barrier here instead.
+             */
+ if (!ca_data->onesided_always_rmw && stripeParms.flushCB) {
+ if (fileSegmentIter < (numSegments-1)) {
+ MPI_Barrier(ca_data->comm);
+ }
+ }
+
+ /* Restore the offset_list and len_list to values that are ready for the
+ * next iteration.
+ */
+ if (segmentContigAccessCount > 0) {
+ offset_list[endingOffsetListIndex] += len_list[endingOffsetListIndex];
+ len_list[endingOffsetListIndex] = endingLenTrim;
+ }
+ totalDataWrittenLastRound += dataWrittenThisRound;
+ } /* fileSegmentIter for-loop */
+
+ /* Check for holes in the data unless onesided_no_rmw is set.
+ * If a hole is found redo the entire aggregation and write.
+ */
+ if (!ca_data->onesided_no_rmw) {
+ int anyHolesFound = 0;
+ MPI_Allreduce(&holeFound, &anyHolesFound, 1, MPI_INT, MPI_MAX, ca_data->comm);
+
+ if (anyHolesFound) {
+ H5MM_free(offset_list);
+ H5MM_free(len_list);
+ H5FD_mpio_calc_offset_list((ADIO_Offset_CA)memFlatBuf->size, fileFlatBuf, mpi_off,
+ &offset_list, &len_list, &start_offset, &end_offset, &contig_access_count);
+
+ currentSegementOffset = (ADIO_Offset_CA)startingStripeWithData * (ADIO_Offset_CA)(fs_block_info[0]);
+ ca_data->onesided_always_rmw = 1;
+ ca_data->onesided_no_rmw = 1;
+
+                /* Holes were found in the data and the user has not set
+                 * onesided_no_rmw --- set onesided_always_rmw to 1,
+                 * redo the entire aggregation and write, and if the user has
+                 * onesided_inform_rmw set then inform them of this condition
+                 * and behavior.
+                 */
+ if (ca_data->onesided_inform_rmw && (myrank ==0)) {
+ fprintf(stderr,"Information: Holes found during one-sided "
+ "write aggregation algorithm --- re-running one-sided "
+ "write aggregation with onesided_always_rmw set to 1.\n");
+ }
+ }
+ else
+ doAggregation = 0;
+ }
+ else
+ doAggregation = 0;
+
+ } // while doAggregation
+ ca_data->onesided_no_rmw = prev_onesided_no_rmw;
+
+ H5MM_free(segment_stripe_start);
+ H5MM_free(segment_stripe_end);
+
+} /* H5FD_mpio_ccio_iterate_write */
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_ccio_iterate_read
+ *
+ * Purpose: This function calls H5FD_mpio_ccio_osagg_read
+ * iteratively to perform "rounds" of one-sided collective
+ * data aggregation.
+ *
+ * Return: Void.
+ *
+ *-------------------------------------------------------------------------
+ */
+void H5FD_mpio_ccio_iterate_read(CustomAgg_FH_Data ca_data, void *buf,
+ int *fs_block_info, ADIO_Offset_CA *offset_list, ADIO_Offset_CA *len_list,
+ MPI_Offset mpi_off, int contig_access_count, int currentValidDataIndex,
+ ADIO_Offset_CA start_offset, ADIO_Offset_CA end_offset,
+ ADIO_Offset_CA firstFileOffset, ADIO_Offset_CA lastFileOffset,
+ H5S_flatbuf_t *memFlatBuf, H5S_flatbuf_t *fileFlatBuf, int myrank, int *error_code)
+{
+
+ int i;
+ int stripesPerAgg = ca_data->cb_buffer_size / fs_block_info[0];
+ int numStripedAggs = ca_data->cb_nodes;
+ if (stripesPerAgg == 0) {
+ /* The striping unit is larger than the collective buffer size
+ * therefore we must abort since the buffer has already been
+ * allocated during the open.
+ */
+ fprintf(stderr,"Error: The collective buffer size %d is less "
+ "than the fs_block_size %d - This collective I/O implementation "
+ "cannot continue.\n",ca_data->cb_buffer_size,fs_block_info[0]);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
+ MPI_Comm_rank(ca_data->comm, &myrank);
+
+    /* Declare FS_Block_Parms here - these parameters will be locally managed
+     * for this invocation of H5FD_mpio_ccio_iterate_read. This allows for
+     * concurrent one-sided collective reads via multi-threading as well as
+     * multiple communicators.
+     */
+ FS_Block_Parms stripeParms;
+ stripeParms.stripeSize = fs_block_info[0]; /* stripe_size */
+ stripeParms.stripedLastFileOffset = lastFileOffset;
+ stripeParms.iWasUsedStripingAgg = 0;
+ stripeParms.numStripesUsed = 0;
+ stripeParms.amountOfStripedDataExpected = 0;
+ stripeParms.bufTypeExtent = 0;
+ stripeParms.lastDataTypeExtent = 0;
+ stripeParms.lastFlatBufIndice = 0;
+ stripeParms.lastIndiceOffset = 0;
+
+    /* The general algorithm here is to divide the file up into segments, a segment
+     * being defined as a contiguous region of the file which contains up to
+     * `numStripedAggs` stripes - the data for each stripe being READ by a particular
+     * aggregator. The segmentLen is the maximum size in bytes of each segment
+     * (stripeSize * numStripedAggs). Here, we iteratively call
+     * H5FD_mpio_ccio_osagg_read for each segment to READ the data.
+     */
+ stripeParms.segmentLen = ((ADIO_Offset_CA)numStripedAggs)*((ADIO_Offset_CA)(fs_block_info[0]));
+
+    /* These arrays define the file offsets of the stripes for a given segment -
+     * similar to the concept of file domains in GPFS, essentially file domains
+     * for the segment.
+     */
+ ADIO_Offset_CA *segment_stripe_start = (ADIO_Offset_CA *) H5MM_malloc(numStripedAggs*sizeof(ADIO_Offset_CA));
+ ADIO_Offset_CA *segment_stripe_end = (ADIO_Offset_CA *) H5MM_malloc(numStripedAggs*sizeof(ADIO_Offset_CA));
+ ADIO_Offset_CA *segment_stripe_start_next;
+ ADIO_Offset_CA *segment_stripe_end_next;
+ if (ca_data->async_io_outer) {
+ segment_stripe_start_next = (ADIO_Offset_CA *) H5MM_malloc(numStripedAggs*sizeof(ADIO_Offset_CA));
+ segment_stripe_end_next = (ADIO_Offset_CA *) H5MM_malloc(numStripedAggs*sizeof(ADIO_Offset_CA));
+ }
+
+ /* Find the actual range of stripes in the file that have data in the offset
+ * ranges being written -- skip holes at the front and back of the file.
+ */
+ int currentOffsetListIndex = 0;
+ int fileSegmentIter = 0;
+ int startingStripeWithData = 0;
+ int foundStartingStripeWithData = 0;
+ while (!foundStartingStripeWithData) {
+ if ( ((startingStripeWithData+1) * (ADIO_Offset_CA)(fs_block_info[0])) > firstFileOffset)
+ foundStartingStripeWithData = 1;
+ else
+ startingStripeWithData++;
+ }
+
+ /* currentSegementOffset = Offset to beginning of first stripe with data to be read */
+ ADIO_Offset_CA currentSegementOffset = (ADIO_Offset_CA)startingStripeWithData * (ADIO_Offset_CA)(fs_block_info[0]);
+
+    /* How many "rounds" of segments will we need to iterate through here */
+ int numSegments = (int) ((lastFileOffset+(ADIO_Offset_CA)1 - currentSegementOffset)/stripeParms.segmentLen);
+ if ((lastFileOffset+(ADIO_Offset_CA)1 - currentSegementOffset)%stripeParms.segmentLen > 0)
+ numSegments++;
+
+#ifdef onesidedtrace
+ printf("Rank %d - H5FD_mpio_ccio_iterate_read ca_data->cb_nodes is %d numStripedAggs is %d numSegments is %d start_offset is %ld end_offset is %ld firstFileOffset is %ld lastFileOffset is %ld\n",myrank,ca_data->cb_nodes,numStripedAggs,numSegments,start_offset,end_offset,firstFileOffset,lastFileOffset);
+ fflush(stdout);
+#endif
+
+ /* This variable tracks how many segment stripes we have packed into the agg
+ * buffers so we know when the buffers are full.
+ */
+ stripeParms.segmentIter = 0;
+
+    /* stripeParms.stripesPerAgg is the number of stripes the aggregator must
+     * read to fill its buffer.
+     */
+ stripeParms.stripesPerAgg = stripesPerAgg;
+ if (stripeParms.stripesPerAgg > numSegments)
+ stripeParms.stripesPerAgg = numSegments;
+
+ int totalDataReadLastRound = 0;
+
+ /* Use 'next' read offsets for async I/O */
+ ADIO_Offset_CA segmentFirstFileOffset_next, segmentLastFileOffset_next;
+
+ /* Async I/O - Start with use_dup==0 */
+ ca_data->use_dup = 0;
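+    /* When 'outer' async I/O is enabled, use_dup ping-pongs between 0 and 1 so
+     * that one collective buffer (with its window and request) is being filled
+     * by a nonblocking file read while the other is drained via RMA.
+     */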
+
+    /* Now, we iterate through all the segments that we want to read */
+ for (fileSegmentIter=0;fileSegmentIter < numSegments;fileSegmentIter++) {
+
+ int dataReadThisRound = 0;
+
+ ADIO_Offset_CA segmentFirstFileOffset, segmentLastFileOffset;
+
+        /* Define the segment range in terms of file offsets.
+         * Just advance the offset from the previous 'currentSegementOffset'.
+ */
+ segmentFirstFileOffset = currentSegementOffset;
+ if ((currentSegementOffset+stripeParms.segmentLen-(ADIO_Offset_CA)1) > lastFileOffset)
+ currentSegementOffset = lastFileOffset;
+ else
+ currentSegementOffset += (stripeParms.segmentLen-(ADIO_Offset_CA)1);
+ segmentLastFileOffset = currentSegementOffset;
+ currentSegementOffset++; // shifting by one byte offset
+
+ ADIO_Offset_CA segment_stripe_offset = segmentFirstFileOffset;
+ for (i=0;i<numStripedAggs;i++) {
+ if (firstFileOffset > segment_stripe_offset)
+ segment_stripe_start[i] = firstFileOffset;
+ else
+ segment_stripe_start[i] = segment_stripe_offset;
+ if ((segment_stripe_offset + (ADIO_Offset_CA)(fs_block_info[0])) > lastFileOffset)
+ segment_stripe_end[i] = lastFileOffset;
+ else
+ segment_stripe_end[i] = segment_stripe_offset + (ADIO_Offset_CA)(fs_block_info[0]) - (ADIO_Offset_CA)1;
+ segment_stripe_offset += (ADIO_Offset_CA)(fs_block_info[0]);
+ }
+
+ if ((ca_data->async_io_outer) && (fileSegmentIter<(numSegments-1)) && (numSegments>1)) {
+ ADIO_Offset_CA cso_prev = currentSegementOffset;
+ segmentFirstFileOffset_next = cso_prev;
+ if ((cso_prev+stripeParms.segmentLen-(ADIO_Offset_CA)1) > lastFileOffset)
+ cso_prev = lastFileOffset;
+ else
+ cso_prev += (stripeParms.segmentLen-(ADIO_Offset_CA)1);
+ segmentLastFileOffset_next = cso_prev;
+
+ ADIO_Offset_CA sso_next = segmentFirstFileOffset_next;
+ for (i=0;i<numStripedAggs;i++) {
+ if (firstFileOffset > sso_next)
+ segment_stripe_start_next[i] = firstFileOffset;
+ else
+ segment_stripe_start_next[i] = sso_next;
+ if ((sso_next + (ADIO_Offset_CA)(fs_block_info[0])) > lastFileOffset)
+ segment_stripe_end_next[i] = lastFileOffset;
+ else
+ segment_stripe_end_next[i] = sso_next + (ADIO_Offset_CA)(fs_block_info[0]) - (ADIO_Offset_CA)1;
+ sso_next += (ADIO_Offset_CA)(fs_block_info[0]);
+ }
+
+ }
+
+ /* In the interest of performance for non-contiguous data with large offset lists
+ * essentially modify the given offset and length list appropriately for this segment
+ * and then pass pointers to the sections of the lists being used for this segment
+ * to H5FD_mpio_ccio_osagg_read. Remember how we have modified the list for this
+ * segment, and then restore it appropriately after processing for this segment has
+ * concluded, so it is ready for the next segment.
+ */
+ int segmentContigAccessCount = 0;
+ int startingOffsetListIndex = -1;
+ int endingOffsetListIndex = -1;
+ ADIO_Offset_CA startingOffsetAdvancement = 0;
+ ADIO_Offset_CA startingLenTrim = 0;
+ ADIO_Offset_CA endingLenTrim = 0;
+
+ while ( ((offset_list[currentOffsetListIndex] + ((ADIO_Offset_CA)(len_list[currentOffsetListIndex]))-(ADIO_Offset_CA)1) < segmentFirstFileOffset) && (currentOffsetListIndex < (contig_access_count-1)))
+ {
+ currentOffsetListIndex++;
+ }
+
+ startingOffsetListIndex = currentOffsetListIndex;
+ endingOffsetListIndex = currentOffsetListIndex;
+ int offsetInSegment = 0;
+ ADIO_Offset_CA offsetStart = offset_list[currentOffsetListIndex];
+ ADIO_Offset_CA offsetEnd = (offset_list[currentOffsetListIndex] + ((ADIO_Offset_CA)(len_list[currentOffsetListIndex]))-(ADIO_Offset_CA)1);
+
+ if (len_list[currentOffsetListIndex] == 0)
+ offsetInSegment = 0;
+ else if ((offsetStart >= segmentFirstFileOffset) && (offsetStart <= segmentLastFileOffset)) {
+ offsetInSegment = 1;
+ }
+ else if ((offsetEnd >= segmentFirstFileOffset) && (offsetEnd <= segmentLastFileOffset)) {
+ offsetInSegment = 1;
+ }
+ else if ((offsetStart <= segmentFirstFileOffset) && (offsetEnd >= segmentLastFileOffset)) {
+ offsetInSegment = 1;
+ }
+
+ if (!offsetInSegment) {
+ segmentContigAccessCount = 0;
+
+ }
+ else {
+ /* We are in the segment, advance currentOffsetListIndex until we are out of segment.
+ */
+ segmentContigAccessCount = 1;
+
+ while ((offset_list[currentOffsetListIndex] <= segmentLastFileOffset) && (currentOffsetListIndex < contig_access_count)) {
+ dataReadThisRound += (int) len_list[currentOffsetListIndex];
+ currentOffsetListIndex++;
+ }
+
+ if (currentOffsetListIndex > startingOffsetListIndex) {
+                /* If we did advance and we are at the end, check whether we are still in the segment.
+ */
+ if (currentOffsetListIndex == contig_access_count) {
+ currentOffsetListIndex--;
+ }
+ else if (offset_list[currentOffsetListIndex] > segmentLastFileOffset) {
+                    /* We advanced past the last entry that is still in the segment, so step back.
+ */
+ currentOffsetListIndex--;
+ }
+ else {
+ dataReadThisRound += (int) len_list[currentOffsetListIndex];
+ }
+ segmentContigAccessCount += (currentOffsetListIndex-startingOffsetListIndex);
+ endingOffsetListIndex = currentOffsetListIndex;
+ }
+ }
+
+ if (segmentContigAccessCount > 0) {
+ /* Trim edges here so all data in the offset list range fits exactly in the segment.
+ */
+ if (offset_list[startingOffsetListIndex] < segmentFirstFileOffset) {
+ startingOffsetAdvancement = segmentFirstFileOffset-offset_list[startingOffsetListIndex];
+ offset_list[startingOffsetListIndex] += startingOffsetAdvancement;
+ dataReadThisRound -= (int) startingOffsetAdvancement;
+ startingLenTrim = startingOffsetAdvancement;
+ len_list[startingOffsetListIndex] -= startingLenTrim;
+ }
+
+ if ((offset_list[endingOffsetListIndex] + ((ADIO_Offset_CA)(len_list[endingOffsetListIndex]))-(ADIO_Offset_CA)1) > segmentLastFileOffset) {
+ endingLenTrim = offset_list[endingOffsetListIndex]+ ((ADIO_Offset_CA)(len_list[endingOffsetListIndex]))-(ADIO_Offset_CA)1 - segmentLastFileOffset;
+ len_list[endingOffsetListIndex] -= endingLenTrim;
+ dataReadThisRound -= (int) endingLenTrim;
+ }
+ }
+
+        /* Once we have packed the collective buffers, set stripeParms.flushCB = 1
+         * to signify this.  Note that stripeParms.flushCB does NOT control the
+         * actual I/O for reading: we read on every call, so 'flushCB' isn't
+         * really necessary for reads.
+         */
+ if ((stripeParms.segmentIter == (stripeParms.stripesPerAgg-1)) || (fileSegmentIter == (numSegments-1))) {
+ stripeParms.flushCB = 1;
+ }
+ else
+ stripeParms.flushCB = 0;
+
+ stripeParms.firstStripedIOCall = 0;
+ stripeParms.lastStripedIOCall = 0;
+ if (fileSegmentIter == 0) {
+ stripeParms.firstStripedIOCall = 1;
+ }
+ else if (fileSegmentIter == (numSegments-1))
+ stripeParms.lastStripedIOCall = 1;
+
+      /* The difference in the calls to H5FD_mpio_ccio_osagg_read is based on whether the buftype is
+       * contiguous. The algorithm tracks the position in the target buffer when called
+       * multiple times -- in the case of contiguous data this is simple and can be externalized with
+       * a buffer offset; in the case of non-contiguous data this is complex and the state must be tracked
+       * internally, so there is no external buffer offset. Care was taken to minimize
+       * H5FD_mpio_ccio_osagg_read changes at the expense of some added complexity to the caller.
+       */
+
+ /* Async I/O - Create a pipeline of 'reads' */
+ if ((ca_data->async_io_outer) && (fileSegmentIter==0) && (numSegments>1)) {
+
+ /* Read data from file into aggregator buffers */
+ H5FD_mpio_ccio_file_read(ca_data, error_code, segmentFirstFileOffset, segmentLastFileOffset, segment_stripe_start, segment_stripe_end);
+
+        /* Async I/O - Start prefetch of next iteration with duplicate buffer */
+ ca_data->use_dup = (ca_data->use_dup + 1) % 2;
+
+        /* Read data from file into aggregator buffers for NEXT iteration */
+ H5FD_mpio_ccio_file_read(ca_data, error_code, segmentFirstFileOffset_next, segmentLastFileOffset_next, segment_stripe_start_next, segment_stripe_end_next);
+
+ /* Async I/O - Switch back to current buffer */
+ ca_data->use_dup = (ca_data->use_dup + 1) % 2;
+
+ } else if ((ca_data->async_io_outer) && (fileSegmentIter<(numSegments-1)) && (numSegments>1)) {
+
+        /* Async I/O - Start prefetch of next iteration with duplicate buffer */
+ ca_data->use_dup = (ca_data->use_dup + 1) % 2;
+
+        /* Read data from file into aggregator buffers for NEXT iteration */
+ H5FD_mpio_ccio_file_read(ca_data, error_code, segmentFirstFileOffset_next, segmentLastFileOffset_next, segment_stripe_start_next, segment_stripe_end_next);
+
+ /* Async I/O - Switch back to current buffer */
+ ca_data->use_dup = (ca_data->use_dup + 1) % 2;
+
+ } else if ((!ca_data->async_io_outer) || (numSegments<2)) {
+
+ /* Read data from file into aggregator buffers */
+ H5FD_mpio_ccio_file_read(ca_data, error_code, segmentFirstFileOffset, segmentLastFileOffset, segment_stripe_start, segment_stripe_end);
+
+ }
+
+ /* Async I/O - Wait for necessary buffer to be ready for RMA */
+ if (ca_data->use_dup && ca_data->check_req_d) {
+ MPIO_Wait(&ca_data->io_Request_d, error_code);
+ ca_data->check_req_d = 0;
+ } else if (!ca_data->use_dup && ca_data->check_req) {
+ MPIO_Wait(&ca_data->io_Request, error_code);
+ ca_data->check_req = 0;
+ }
+
+ if (memFlatBuf->count == 1) {
+
+ /* Ranks perform one-sided read of data from collective buffers */
+ H5FD_mpio_ccio_osagg_read(ca_data,(ADIO_Offset_CA*)&(offset_list[startingOffsetListIndex]), (ADIO_Offset_CA*)&(len_list[startingOffsetListIndex]), segmentContigAccessCount, buf+totalDataReadLastRound, memFlatBuf, error_code, segmentFirstFileOffset, segmentLastFileOffset, currentValidDataIndex, segment_stripe_start, segment_stripe_end, &stripeParms, 0); // Last '0' means the file read should be skipped in this call
+
+ } else {
+
+ /* Ranks perform one-sided read of data from collective buffers */
+ H5FD_mpio_ccio_osagg_read(ca_data,(ADIO_Offset_CA*)&(offset_list[startingOffsetListIndex]), (ADIO_Offset_CA*)&(len_list[startingOffsetListIndex]), segmentContigAccessCount, buf, memFlatBuf, error_code, segmentFirstFileOffset, segmentLastFileOffset, currentValidDataIndex, segment_stripe_start, segment_stripe_end, &stripeParms, 0); // Last '0' means the file read should be skipped in this call
+
+ }
+
+ /* Async I/O - change 'current' buffer */
+ if ((ca_data->async_io_outer) && (numSegments>1)) {
+ ca_data->use_dup = (ca_data->use_dup + 1) % 2;
+ }
+
+ //if (stripeParms.flushCB) {
+ stripeParms.segmentIter = 0;
+ if (stripesPerAgg > (numSegments-fileSegmentIter-1))
+ stripeParms.stripesPerAgg = numSegments-fileSegmentIter-1;
+ else
+ stripeParms.stripesPerAgg = stripesPerAgg;
+ //}
+ //else
+ // stripeParms.segmentIter++;
+
+ /* Need barrier here.
+ */
+ if (fileSegmentIter < (numSegments-1)) {
+ MPI_Barrier(ca_data->comm);
+ }
+
+ /* Restore the offset_list and len_list to values that are ready for the
+ * next iteration.
+ */
+ if (segmentContigAccessCount > 0) {
+ offset_list[endingOffsetListIndex] += len_list[endingOffsetListIndex];
+ len_list[endingOffsetListIndex] = endingLenTrim;
+ }
+ totalDataReadLastRound += dataReadThisRound;
+
+ } // fileSegmentIter for-loop
+
+ H5MM_free(segment_stripe_start);
+ H5MM_free(segment_stripe_end);
+ if (ca_data->async_io_outer) {
+ H5MM_free(segment_stripe_start_next);
+ H5MM_free(segment_stripe_end_next);
+ }
+
+} /* End IterateOneSidedRead */
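+
+/* Note on the async double-buffering used in IterateOneSidedRead above (an
+ * illustrative summary only, no new behavior): when ca_data->async_io_outer is
+ * set and there is more than one segment, the loop toggles ca_data->use_dup so
+ * that the file read for segment i+1 is issued into the "duplicate" aggregator
+ * buffer while the one-sided gathers for segment i proceed from the other
+ * buffer, i.e.
+ *
+ *     ca_data->use_dup = (ca_data->use_dup + 1) % 2;   // switch to the prefetch buffer
+ *     H5FD_mpio_ccio_file_read(ca_data, error_code, nextFirst, nextLast,
+ *                              segment_stripe_start_next, segment_stripe_end_next);
+ *     ca_data->use_dup = (ca_data->use_dup + 1) % 2;   // switch back for this segment's RMA
+ *
+ * where 'nextFirst'/'nextLast' stand in for segmentFirstFileOffset_next and
+ * segmentLastFileOffset_next.
+ */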
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_nc_buffer_advance
+ *
+ * This function packs a contiguous buffer of data from the non-contiguous source
+ * buffer for a specified chunk of data and advances the FDSourceBufferState
+ * machinery, so subsequent calls with the same FDSourceBufferState will return the
+ * next linear chunk.
+ * Parameters:
+ * in: sourceDataBuffer - pointer to source data buffer.
+ * in: flatBuf - pointer to flattened source data buffer
+ * in: targetNumBytes - number of bytes to return and advance.
+ * in: packing - whether data is being packed from the source buffer to the
+ * packed buffer (1) or unpacked from the packed buffer to the source
+ * buffer (0)
+ * in/out: currentFDSourceBufferState - pointer to FDSourceBufferState structure, current
+ * data used as starting point, will be updated with
+ * the new state after targetNumBytes advance.
+ * out: packedDataBufer - pointer to the output packed data buffer. If the
+ * value is NULL then no data will be written.
+ *
+ *-------------------------------------------------------------------------
+ */
+inline static void H5FD_mpio_nc_buffer_advance(char *sourceDataBuffer,
+ H5S_flatbuf_t *flatBuf, int targetNumBytes, int packing,
+ FDSourceBufferState_CA *currentFDSourceBufferState, char *packedDataBufer)
+{
+ /*
+ * Make currentDataTypeExtent and bufTypeExtent ADIO_Offset_CA since they are
+ * used in offset calculations
+ */
+ ADIO_Offset_CA currentIndiceOffset = currentFDSourceBufferState->indiceOffset;
+ ADIO_Offset_CA bufTypeExtent = (ADIO_Offset_CA)currentFDSourceBufferState->bufTypeExtent;
+ ADIO_Offset_CA currentDataTypeExtent = currentFDSourceBufferState->dataTypeExtent;
+ int currentFlatBufIndice = currentFDSourceBufferState->flatBufIndice;
+ int targetSendDataIndex = 0;
+
+#ifdef onesidedtrace
+ printf("H5FD_mpio_nc_buffer_advance: currentFlatBufIndice is %d currentDataTypeExtent is %ld currentIndiceOffset is %ld\n",currentFlatBufIndice,currentDataTypeExtent,currentIndiceOffset);
+#endif
+
+ int remainingBytesToLoad = targetNumBytes;
+ while (remainingBytesToLoad > 0) {
+ if ((flatBuf->blocklens[currentFlatBufIndice] - currentIndiceOffset) >= remainingBytesToLoad) { // we can get the rest of our data from this indice
+ ADIO_Offset_CA physicalSourceBufferOffset = (currentDataTypeExtent * bufTypeExtent) + flatBuf->indices[currentFlatBufIndice] + currentIndiceOffset;
+
+#ifdef onesidedtrace
+ printf("loading remainingBytesToLoad %d from src buffer offset %ld to targetSendDataIndex %d\n",remainingBytesToLoad,physicalSourceBufferOffset,targetSendDataIndex);
+#endif
+
+ if (packedDataBufer != NULL) {
+ if (packing)
+ memcpy(&(packedDataBufer[targetSendDataIndex]),&(sourceDataBuffer[physicalSourceBufferOffset]),remainingBytesToLoad);
+ else
+ memcpy(&(sourceDataBuffer[physicalSourceBufferOffset]),&(packedDataBufer[targetSendDataIndex]),remainingBytesToLoad);
+ }
+
+ targetSendDataIndex += remainingBytesToLoad;
+ currentIndiceOffset += (ADIO_Offset_CA)remainingBytesToLoad;
+ if (currentIndiceOffset >= flatBuf->blocklens[currentFlatBufIndice]) {
+ currentIndiceOffset = (ADIO_Offset_CA)0;
+ currentFlatBufIndice++;
+ if (currentFlatBufIndice == flatBuf->count) {
+ currentFlatBufIndice = 0;
+ currentDataTypeExtent++;
+ }
+ }
+ remainingBytesToLoad = 0;
+
+ }
+ else { // we can only get part of our data from this indice
+ ADIO_Offset_CA amountDataToLoad = (flatBuf->blocklens[currentFlatBufIndice] - currentIndiceOffset);
+ ADIO_Offset_CA physicalSourceBufferOffset = (currentDataTypeExtent * bufTypeExtent) + flatBuf->indices[currentFlatBufIndice] + currentIndiceOffset;
+
+#ifdef onesidedtrace
+ printf("loading amountDataToLoad %d from src buffer offset %ld to targetSendDataIndex %d\n",amountDataToLoad,physicalSourceBufferOffset,targetSendDataIndex);
+#endif
+ if (packedDataBufer != NULL) {
+ if (packing)
+ memcpy(&(packedDataBufer[targetSendDataIndex]),&(sourceDataBuffer[physicalSourceBufferOffset]),amountDataToLoad);
+ else
+ memcpy(&(sourceDataBuffer[physicalSourceBufferOffset]),&(packedDataBufer[targetSendDataIndex]),amountDataToLoad);
+ }
+
+ targetSendDataIndex += amountDataToLoad;
+ currentIndiceOffset = (ADIO_Offset_CA)0;
+ currentFlatBufIndice++;
+ if (currentFlatBufIndice == flatBuf->count) {
+ currentFlatBufIndice = 0;
+ currentDataTypeExtent++;
+ }
+ remainingBytesToLoad -= amountDataToLoad;
+ }
+ } // while
+
+ /*
+ * Update machinery with new flatbuf position
+ */
+ currentFDSourceBufferState->indiceOffset = currentIndiceOffset;
+ currentFDSourceBufferState->dataTypeExtent = currentDataTypeExtent;
+ currentFDSourceBufferState->flatBufIndice = currentFlatBufIndice;
+#ifdef onesidedtrace
+ printf("source buf advanced to currentFlatBufIndice %d currentDataTypeExtent %ld currentIndiceOffset %ld\n",currentFlatBufIndice,currentDataTypeExtent,currentIndiceOffset);
+#endif
+} /* H5FD_mpio_nc_buffer_advance */
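+
+/* Illustrative usage sketch (hypothetical caller, not part of this patch): this
+ * is how a caller might drive H5FD_mpio_nc_buffer_advance to pack two successive
+ * 64-byte chunks of a non-contiguous source buffer described by 'flatBuf'. The
+ * state is simply zero-initialized (with bufTypeExtent set) before the first
+ * call; each call then picks up where the previous one left off:
+ *
+ *     FDSourceBufferState_CA state;
+ *     state.flatBufIndice  = 0;
+ *     state.indiceOffset   = 0;
+ *     state.dataTypeExtent = 0;
+ *     state.bufTypeExtent  = flatBuf->extent;
+ *
+ *     char packed[64];
+ *     // Pack bytes [0,63] of the flattened view of srcBuf into 'packed'
+ *     H5FD_mpio_nc_buffer_advance(srcBuf, flatBuf, 64, 1, &state, packed);
+ *     // Pack bytes [64,127]; 'state' remembers the position from the first call
+ *     H5FD_mpio_nc_buffer_advance(srcBuf, flatBuf, 64, 1, &state, packed);
+ *
+ * Passing packing = 0 would instead scatter 'packed' back into srcBuf at the
+ * same flattened positions.
+ */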
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_ccio_osagg_write
+ *
+ * Purpose:
+ *
+ * The H5FD_mpio_ccio_osagg_write algorithm is called once
+ * for each segment of data, a segment being defined as a contiguous region of the file whose
+ * size is one striping unit times the number of aggregators (for example, a 1 MiB stripe and
+ * 8 aggregators give an 8 MiB segment). For lustre the striping unit corresponds to the actual
+ * file stripe; in the case of gpfs these are file domains.
+ * Each call effectively packs one striping unit of data into the collective buffer on each agg,
+ * with additional parameters that govern when to flush the collective buffer to the file.
+ * Therefore, in practice, a collective write on a file system such as lustre for a dataset
+ * composed of multiple segments would call the algorithm several times without the flush
+ * parameter to fill the collective buffers with multiple stripes of data, before calling it
+ * again to flush the collective buffer to the file system. In this fashion synchronization is
+ * minimized, as it only needs to occur during the actual read from or write to the file system.
+ * In the case of gpfs this function is called just once. The FS_Block_Parms parameter is used
+ * to save state and re-use variables through repetitive calls, primarily to avoid costly
+ * recomputation on lustre; for consistency gpfs utilizes it as well but does not use some
+ * aspects of it. This function was originally written for gpfs only and later modified to
+ * support lustre.
+ *
+ *-------------------------------------------------------------------------
+ */
+void H5FD_mpio_ccio_osagg_write(CustomAgg_FH_Data ca_data,
+ ADIO_Offset_CA *offset_list,
+ ADIO_Offset_CA *len_list,
+ int contig_access_count,
+ const void *buf,
+ H5S_flatbuf_t *memFlatBuf,
+ int *error_code,
+ ADIO_Offset_CA firstFileOffset,
+ ADIO_Offset_CA lastFileOffset,
+ int numNonZeroDataOffsets,
+ ADIO_Offset_CA *fd_start,
+ ADIO_Offset_CA* fd_end,
+ int hole_found,
+ FS_Block_Parms *stripe_parms)
+
+{
+ int i,j; /* generic iterators */
+
+ /*
+     * Make local copies of certain FS_Block_Parms (stripe_parms) elements for
+     * faster access - pay for the pointer dereference only once.
+ */
+ int stripeSize = stripe_parms->stripeSize;
+ int segmentIter = stripe_parms->segmentIter;
+ hsize_t bufTypeExtent = stripe_parms->bufTypeExtent;
+
+ if ((stripeSize > 0) && stripe_parms->firstStripedIOCall)
+ stripe_parms->iWasUsedStripingAgg = 0;
+
+#ifdef onesidedtrace
+ if (buf == NULL) {
+ printf("H5FD_mpio_ccio_osagg_write - buf is NULL contig_access_count is %d\n",contig_access_count);
+ for (i=0;i<contig_access_count;i++)
+ printf("offset_list[%d] is %ld len_list[%d] is %ld\n",i,offset_list[i],i,len_list[i]);
+ }
+ if (contig_access_count < 0)
+ printf("H5FD_mpio_ccio_osagg_write - contig_access_count "
+ "of %d is less than 0\n",contig_access_count);
+#endif
+
+ int lenListOverZero = 0;
+ for (i=0;((i<contig_access_count) && (!lenListOverZero));i++)
+ if (len_list[i] > 0)
+ lenListOverZero = 1;
+
+ *error_code = MPI_SUCCESS; /* initialize to success */
+
+ MPI_Status status;
+ int nprocs,myrank;
+ MPI_Comm_size(ca_data->comm, &nprocs);
+ MPI_Comm_rank(ca_data->comm, &myrank);
+
+#ifdef onesidedtrace
+ printf("Rank %d - H5FD_mpio_ccio_osagg_write started\n",myrank);
+#endif
+
+ if (ca_data->io_buf_window == MPI_WIN_NULL || ca_data->io_buf_put_amounts_window == MPI_WIN_NULL)
+ {
+ HDF5_ccio_win_setup(ca_data, nprocs);
+ }
+
+ /*
+ * This flag denotes whether the source datatype is contiguous, which is referenced throughout the algorithm
+ * and defines how the source buffer offsets and data chunks are determined. If the value is 1 (true - contiguous data)
+ * things are profoundly simpler in that the source buffer offset for a given target offset simply linearly increases
+ * by the chunk sizes being written. If the value is 0 (non-contiguous) then these values are based on calculations
+ * from the flattened source datatype.
+ */
+ int bufTypeIsContig;
+ if (memFlatBuf->count == 1)
+ bufTypeIsContig = 1;
+ else
+ bufTypeIsContig = 0;
+
+ if (!bufTypeIsContig) {
+ /* For a non-contiguous source buffer set the extent. */
+ if ((stripeSize == 0) || stripe_parms->firstStripedIOCall) {
+ bufTypeExtent = memFlatBuf->extent;
+ }
+
+#ifdef onesidedtrace
+ printf("Rank %d - memFlatBuf->count is %d bufTypeExtent is %ld\n",myrank,memFlatBuf->count, bufTypeExtent);
+ for (i=0;i<memFlatBuf->count;i++)
+ printf("Rank %d - memFlatBuf->blocklens[%d] is %d memFlatBuf->indices[%d] is %ld\n",myrank,i,memFlatBuf->blocklens[i],i,memFlatBuf->indices[i]);
+#endif
+ }
+
+ int naggs = ca_data->cb_nodes;
+
+ /* Track the state of the source buffer for feeding the target data blocks.
+     * For GPFS the number of file domains per agg is always 1, so we just need 1 agg
+     * dimension to track the data; for lustre we will need 2 dimensions (agg and file
+     * domain) since aggs write to multiple file domains.
+ * This structure will be modified as the data is written to reflect the current state
+ * of the offset.
+ */
+
+#ifdef onesidedtrace
+ printf("Rank %d - sizeof(FDSourceBufferState_CA) is %d - make sure is 32 for 32-byte memalign optimal\n",myrank,sizeof(FDSourceBufferState_CA));
+#endif
+
+ FDSourceBufferState_CA *currentFDSourceBufferState = (FDSourceBufferState_CA *) H5MM_malloc(naggs * sizeof(FDSourceBufferState_CA));
+
+ for (i=0;i<naggs;i++) {
+ /* Initialize based on the bufType to indicate that it is unset.
+ */
+ if (bufTypeIsContig) {
+ currentFDSourceBufferState[i].sourceBufferOffset = -1;
+ }
+ else {
+ currentFDSourceBufferState[i].indiceOffset = -1;
+ }
+ }
+
+#ifdef onesidedtrace
+ printf("Rank %d - H5FD_mpio_ccio_osagg_write bufTypeIsContig is %d contig_access_count is %d\n",myrank,bufTypeIsContig,contig_access_count);
+#endif
+
+ /* MaxNumContigOperations keeps track of how many different chunks we will need to send
+ * for the purpose of pre-allocating the data structures to hold them.
+ */
+ int maxNumContigOperations = contig_access_count;
+    int myAggRank = -1; /* if I am an aggregator this is my index into ranklist */
+ int iAmUsedAgg = 0; /* whether or not this rank is used as an aggregator. */
+
+ /* Make coll_bufsize an ADIO_Offset_CA since it is used in calculations with offsets.
+ */
+ ADIO_Offset_CA coll_bufsize = (ADIO_Offset_CA)(ca_data->cb_buffer_size);
+
+ /* This logic defines values that are used later to determine what offsets define the portion
+ * of the file domain the agg is writing this round.
+ */
+ int greatestFileDomainAggRank = -1,smallestFileDomainAggRank = -1;
+ ADIO_Offset_CA greatestFileDomainOffset = 0;
+ ADIO_Offset_CA smallestFileDomainOffset = lastFileOffset;
+ for (j=0;j<naggs;j++) {
+ if (fd_end[j] > greatestFileDomainOffset) {
+ greatestFileDomainOffset = fd_end[j];
+ greatestFileDomainAggRank = j;
+ }
+ if (fd_start[j] < smallestFileDomainOffset) {
+ smallestFileDomainOffset = fd_start[j];
+ smallestFileDomainAggRank = j;
+ }
+ if (ca_data->ranklist[j] == myrank) {
+ myAggRank = j;
+ if (fd_end[j] > fd_start[j]) {
+ iAmUsedAgg = 1;
+ stripe_parms->iWasUsedStripingAgg = 1;
+ }
+ }
+ }
+
+#ifdef onesidedtrace
+ printf("Rank %d - contig_access_count is %d lastFileOffset is %ld firstFileOffset is %ld\n",myrank,contig_access_count,lastFileOffset,firstFileOffset);
+ for (j=0;j<contig_access_count;j++) {
+ printf("Rank %d - offset_list[%d]: %ld , len_list[%d]: %ld\n",myrank,j,offset_list[j],j,len_list[j]);
+ }
+#endif
+
+ /* Compute number of rounds.
+ */
+ int numberOfRounds = 0;
+ for (j=0;j<naggs;j++) {
+ int currentNumberOfRounds = (int)(((fd_end[j] - fd_start[j])+(ADIO_Offset_CA)1)/coll_bufsize);
+ if (((ADIO_Offset_CA)currentNumberOfRounds*coll_bufsize) < ((fd_end[j] - fd_start[j])+(ADIO_Offset_CA)1))
+ currentNumberOfRounds++;
+ if (currentNumberOfRounds > numberOfRounds)
+ numberOfRounds = currentNumberOfRounds;
+ }
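+
+    /* Worked example (illustrative, assumed numbers): with coll_bufsize = 16 MiB and a
+     * file domain spanning fd_start[j] = 0 to fd_end[j] = 40 MiB - 1, the domain size is
+     * 40 MiB, so currentNumberOfRounds = 40/16 = 2 by integer division, bumped to 3
+     * because 2 * 16 MiB < 40 MiB; numberOfRounds is the maximum over all aggregators.
+     */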
+
+    /* Data structures to track what data this compute process needs to send to whom.
+ * For lustre they will all need another dimension for the file domain.
+ */
+ int *targetAggsForMyData = (int *)H5MM_malloc(naggs * sizeof(int));
+ ADIO_Offset_CA *targetAggsForMyDataFDStart = (ADIO_Offset_CA *)H5MM_malloc(naggs * sizeof(ADIO_Offset_CA));
+ ADIO_Offset_CA *targetAggsForMyDataFDEnd = (ADIO_Offset_CA *)H5MM_malloc(naggs * sizeof(ADIO_Offset_CA));
+ int numTargetAggs = 0;
+
+ /* This data structure holds the beginning offset and len list index for the range to be written
+     * corresponding to the round and target agg. Initialize to -1 to denote being unset.
+ */
+ int **targetAggsForMyDataFirstOffLenIndex = (int **)H5MM_malloc(numberOfRounds * sizeof(int *));
+ for (i=0;i<numberOfRounds;i++) {
+ targetAggsForMyDataFirstOffLenIndex[i] = (int *)H5MM_malloc(naggs * sizeof(int));
+ for (j=0;j<naggs;j++)
+ targetAggsForMyDataFirstOffLenIndex[i][j] = -1;
+ }
+
+ /* This data structure holds the ending offset and len list index for the range to be written
+     * corresponding to the round and target agg.
+ */
+ int **targetAggsForMyDataLastOffLenIndex = (int **)H5MM_malloc(numberOfRounds * sizeof(int *));
+ for (i=0;i<numberOfRounds;i++)
+ targetAggsForMyDataLastOffLenIndex[i] = (int *)H5MM_malloc(naggs * sizeof(int));
+
+#ifdef onesidedtrace
+ printf("Rank %d - NumberOfRounds is %d\n",myrank,numberOfRounds);
+ for (i=0;i<naggs;i++)
+ printf("Rank %d - ca_data->ranklist[%d] is %d fd_start is %ld fd_end is %ld\n",myrank,i,ca_data->ranklist[i],fd_start[i],fd_end[i]);
+ for (j=0;j<contig_access_count;j++)
+ printf("Rank %d - offset_list[%d] is %ld len_list is %ld\n",myrank,j,offset_list[j],len_list[j]);
+#endif
+
+ int currentAggRankListIndex = 0;
+ int maxNumNonContigSourceChunks = 0;
+
+ ADIO_Offset_CA currentSourceBufferOffset = 0;
+ ADIO_Offset_CA currentDataTypeExtent = 0;
+ int currentFlatBufIndice=0;
+ ADIO_Offset_CA currentIndiceOffset = 0;
+
+ /* Remember where we left off in the source buffer when packing stripes. */
+ if ((stripeSize > 0) && !stripe_parms->firstStripedIOCall) {
+ currentDataTypeExtent = stripe_parms->lastDataTypeExtent;
+ currentFlatBufIndice = stripe_parms->lastFlatBufIndice;
+ currentIndiceOffset = stripe_parms->lastIndiceOffset;
+#ifdef onesidedtrace
+ printf("Rank %d - using stripe_parms->lastDataTypeExtent %ld stripe_parms->lastFlatBufIndice %d stripe_parms->lastIndiceOffset %ld\n",
+ myrank,stripe_parms->lastDataTypeExtent,stripe_parms->lastFlatBufIndice,stripe_parms->lastIndiceOffset);
+#endif
+ }
+
+ /* This denotes the coll_bufsize boundaries within the source buffer for writing for the same round.
+ */
+ ADIO_Offset_CA intraRoundCollBufsizeOffset = 0;
+
+ /* This data structure tracks what target aggs need to be written to on what rounds.
+ */
+ int *targetAggsForMyDataCurrentRoundIter = (int *)H5MM_malloc(naggs * sizeof(int));
+ for (i=0;i<naggs;i++)
+ targetAggsForMyDataCurrentRoundIter[i] = 0;
+
+ /* This is the first of the two main loops in this algorithm. The purpose of this loop is essentially to populate
+     * the data structures defined above for what source data blocks need to go where (target agg and file domain) and when
+ * (round iter). For lustre essentially an additional layer of nesting will be required for the multiple file domains
+ * within the target agg.
+ */
+ if ((contig_access_count > 0) && (buf != NULL) && lenListOverZero) {
+ int blockIter;
+ for (blockIter=0;blockIter<contig_access_count;blockIter++) {
+
+ /* Determine the starting source buffer offset for this block - for iter 0 skip it since that value is 0.
+ */
+ if (blockIter>0) {
+ if (bufTypeIsContig) {
+ currentSourceBufferOffset += len_list[blockIter-1];
+ }
+ else {
+
+ /* Non-contiguous source datatype, count up the extents and indices to this point
+ * in the blocks for use in computing the source starting buffer offset for target aggs
+ * and file domains.
+ */
+ ADIO_Offset_CA sourceBlockTotal = 0;
+ int lastIndiceUsed = currentFlatBufIndice;
+ int numNonContigSourceChunks = 0;
+
+ while (sourceBlockTotal < len_list[blockIter-1]) {
+ numNonContigSourceChunks++;
+ sourceBlockTotal += (memFlatBuf->blocklens[currentFlatBufIndice] - currentIndiceOffset);
+ lastIndiceUsed = currentFlatBufIndice;
+ currentFlatBufIndice++;
+ if (currentFlatBufIndice == memFlatBuf->count) {
+ currentFlatBufIndice = 0;
+ currentDataTypeExtent++;
+ }
+ currentIndiceOffset = (ADIO_Offset_CA)0;
+ }
+ if (sourceBlockTotal > len_list[blockIter-1]) {
+ currentFlatBufIndice--;
+ if (currentFlatBufIndice < 0 ) {
+ currentDataTypeExtent--;
+ currentFlatBufIndice = memFlatBuf->count-1;
+ }
+ currentIndiceOffset = len_list[blockIter-1] - (sourceBlockTotal - memFlatBuf->blocklens[lastIndiceUsed]);
+ }
+ else
+ currentIndiceOffset = (ADIO_Offset_CA)0;
+ maxNumContigOperations += (numNonContigSourceChunks+2);
+ if (numNonContigSourceChunks > maxNumNonContigSourceChunks)
+ maxNumNonContigSourceChunks = numNonContigSourceChunks;
+
+#ifdef onesidedtrace
+ printf("blockiter %d currentFlatBufIndice is now %d currentDataTypeExtent is now %ld currentIndiceOffset is now %ld maxNumContigOperations is now %d\n",blockIter,currentFlatBufIndice,currentDataTypeExtent,currentIndiceOffset,maxNumContigOperations);
+#endif
+
+ } // !bufTypeIsContig
+ } // blockIter > 0
+
+ /* For the last iteration we need to include these maxNumContigOperations and maxNumNonContigSourceChunks
+             * for the non-contig case even though we did not need to compute the next starting offset.
+ */
+ if ((blockIter == (contig_access_count-1)) && (!bufTypeIsContig)) {
+ ADIO_Offset_CA sourceBlockTotal = 0;
+ int tmpCurrentFlatBufIndice = currentFlatBufIndice;
+ int lastNumNonContigSourceChunks = 0;
+ while (sourceBlockTotal < len_list[blockIter]) {
+ lastNumNonContigSourceChunks++;
+ sourceBlockTotal += memFlatBuf->blocklens[tmpCurrentFlatBufIndice];
+ tmpCurrentFlatBufIndice++;
+ if (tmpCurrentFlatBufIndice == memFlatBuf->count) {
+ tmpCurrentFlatBufIndice = 0;
+ }
+ }
+ maxNumContigOperations += (lastNumNonContigSourceChunks+2);
+ if (lastNumNonContigSourceChunks > maxNumNonContigSourceChunks)
+ maxNumNonContigSourceChunks = lastNumNonContigSourceChunks;
+
+ }
+
+ ADIO_Offset_CA blockStart = offset_list[blockIter], blockEnd = offset_list[blockIter]+len_list[blockIter]-(ADIO_Offset_CA)1;
+
+ /* Find the starting target agg for this block - normally it will be the current agg so guard the expensive
+ * while loop with a cheap if-check which for large numbers of small blocks will usually be false.
+ */
+ if (!((blockStart >= fd_start[currentAggRankListIndex]) && (blockStart <= fd_end[currentAggRankListIndex]))) {
+ while (!((blockStart >= fd_start[currentAggRankListIndex]) && (blockStart <= fd_end[currentAggRankListIndex])))
+ currentAggRankListIndex++;
+ };
+
+#ifdef onesidedtrace
+ printf("Rank %d - currentAggRankListIndex is %d blockStart %ld blockEnd %ld fd_start[currentAggRankListIndex] %ld fd_end[currentAggRankListIndex] %ld\n",myrank,currentAggRankListIndex,blockStart,blockEnd,fd_start[currentAggRankListIndex],fd_end[currentAggRankListIndex]);
+#endif
+
+ /* Determine if this is a new target agg.
+ */
+ if (blockIter>0) {
+ if ((offset_list[blockIter-1]+len_list[blockIter-1]-(ADIO_Offset_CA)1) < fd_start[currentAggRankListIndex]) {
+ numTargetAggs++;
+ }
+ }
+
+ /* Determine which round to start writing - data is written coll_bufsize per round from the aggregator
+             * so if our starting offset into the file domain is a multiple of coll_bufsize, that determines the starting round.
+ */
+ if ((blockStart - fd_start[currentAggRankListIndex]) >= coll_bufsize) {
+ ADIO_Offset_CA currentRoundBlockStart = fd_start[currentAggRankListIndex];
+ int startingRound = 0;
+ while (blockStart > (currentRoundBlockStart + coll_bufsize - (ADIO_Offset_CA)1)) {
+ currentRoundBlockStart+=coll_bufsize;
+ startingRound++;
+ }
+ targetAggsForMyDataCurrentRoundIter[numTargetAggs] = startingRound;
+ }
+
+ /* Initialize the data structures if this is the first offset in the round/target agg.
+ */
+ if (targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] == -1) {
+ targetAggsForMyData[numTargetAggs] = ca_data->ranklist[currentAggRankListIndex];
+ targetAggsForMyDataFDStart[numTargetAggs] = fd_start[currentAggRankListIndex];
+ /* Round up file domain to the first actual offset used if this is the first file domain.
+ */
+ if (currentAggRankListIndex == smallestFileDomainAggRank) {
+ if (targetAggsForMyDataFDStart[numTargetAggs] < firstFileOffset)
+ targetAggsForMyDataFDStart[numTargetAggs] = firstFileOffset;
+ }
+ targetAggsForMyDataFDEnd[numTargetAggs] = fd_end[currentAggRankListIndex];
+ /* Round down file domain to the last actual offset used if this is the last file domain.
+ */
+ if (currentAggRankListIndex == greatestFileDomainAggRank) {
+ if (targetAggsForMyDataFDEnd[numTargetAggs] > lastFileOffset)
+ targetAggsForMyDataFDEnd[numTargetAggs] = lastFileOffset;
+ }
+ targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+ /* Set the source buffer state starting point for data access for this agg and file domain. */
+
+ if (bufTypeIsContig) {
+ if (currentFDSourceBufferState[numTargetAggs].sourceBufferOffset == -1) {
+ currentFDSourceBufferState[numTargetAggs].sourceBufferOffset = currentSourceBufferOffset;
+#ifdef onesidedtrace
+ printf("Rank %d - For agg %d sourceBufferOffset initialized to %ld\n",myrank,currentAggRankListIndex,currentSourceBufferOffset);
+#endif
+ }
+ }
+ else {
+ if (currentFDSourceBufferState[numTargetAggs].indiceOffset == -1) {
+ currentFDSourceBufferState[numTargetAggs].indiceOffset = currentIndiceOffset;
+ currentFDSourceBufferState[numTargetAggs].bufTypeExtent = bufTypeExtent;
+ currentFDSourceBufferState[numTargetAggs].dataTypeExtent = currentDataTypeExtent;
+ currentFDSourceBufferState[numTargetAggs].flatBufIndice = currentFlatBufIndice;
+#ifdef onesidedtrace
+ printf("Rank %d - For agg %d dataTypeExtent initialized to %ld flatBufIndice to %d indiceOffset to %ld\n",myrank,numTargetAggs,currentDataTypeExtent,currentFlatBufIndice,currentIndiceOffset);
+#endif
+ }
+ }
+
+ intraRoundCollBufsizeOffset = fd_start[currentAggRankListIndex] + ((ADIO_Offset_CA)(targetAggsForMyDataCurrentRoundIter[numTargetAggs]+1) * coll_bufsize);
+
+#ifdef onesidedtrace
+ printf("Rank %d - Initial settings numTargetAggs %d offset_list[%d] with value %ld past fd border %ld with len %ld currentSourceBufferOffset set to %ld intraRoundCollBufsizeOffset set to %ld\n",myrank,numTargetAggs,blockIter,offset_list[blockIter],fd_start[currentAggRankListIndex],len_list[blockIter],currentSourceBufferOffset,intraRoundCollBufsizeOffset);
+#endif
+ }
+
+ /* Replace the last offset block iter with this one.
+ */
+ targetAggsForMyDataLastOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+
+            /* If this block extends into the next file domain, advance to the next target aggs and source buffer states.
+ */
+ if (blockEnd > fd_end[currentAggRankListIndex]) {
+
+ ADIO_Offset_CA amountToAdvanceSBOffsetForFD = 0;
+ int additionalFDCounter = 0;
+
+ while (blockEnd > fd_end[currentAggRankListIndex]) {
+#ifdef onesidedtrace
+ printf("Rank %d - block extends past current fd, blockEnd %ld >= fd_end[currentAggRankListIndex] %ld total block size is %ld blockStart was %ld\n",myrank,blockEnd,fd_end[currentAggRankListIndex], len_list[blockIter],blockStart);
+#endif
+ ADIO_Offset_CA thisAggBlockEnd = fd_end[currentAggRankListIndex];
+ if (thisAggBlockEnd >= intraRoundCollBufsizeOffset) {
+ while (thisAggBlockEnd >= intraRoundCollBufsizeOffset) {
+ targetAggsForMyDataCurrentRoundIter[numTargetAggs]++;
+ intraRoundCollBufsizeOffset += coll_bufsize;
+ targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+ targetAggsForMyDataLastOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+#ifdef onesidedtrace
+ printf("Rank %d - targetAggsForMyDataCurrentRoundI%d] is now %d intraRoundCollBufsizeOffset is now %ld\n",myrank,numTargetAggs,targetAggsForMyDataCurrentRoundIter[numTargetAggs],intraRoundCollBufsizeOffset);
+#endif
+ } // while (thisAggBlockEnd >= intraRoundCollBufsizeOffset)
+ } // if (thisAggBlockEnd >= intraRoundCollBufsizeOffset)
+
+ int prevAggRankListIndex = currentAggRankListIndex;
+ currentAggRankListIndex++;
+
+ /* Skip over unused aggs.
+ */
+ if (fd_start[currentAggRankListIndex] > fd_end[currentAggRankListIndex]) {
+ while (fd_start[currentAggRankListIndex] > fd_end[currentAggRankListIndex])
+ currentAggRankListIndex++;
+ } // (fd_start[currentAggRankListIndex] > fd_end[currentAggRankListIndex])
+
+ /* Start new target agg.
+ */
+ if (blockEnd >= fd_start[currentAggRankListIndex]) {
+ numTargetAggs++;
+ targetAggsForMyData[numTargetAggs] = ca_data->ranklist[currentAggRankListIndex];
+ targetAggsForMyDataFDStart[numTargetAggs] = fd_start[currentAggRankListIndex];
+ /* Round up file domain to the first actual offset used if this is the first file domain.
+ */
+ if (currentAggRankListIndex == smallestFileDomainAggRank) {
+ if (targetAggsForMyDataFDStart[numTargetAggs] < firstFileOffset)
+ targetAggsForMyDataFDStart[numTargetAggs] = firstFileOffset;
+ }
+ targetAggsForMyDataFDEnd[numTargetAggs] = fd_end[currentAggRankListIndex];
+ /* Round down file domain to the last actual offset used if this is the last file domain.
+ */
+ if (currentAggRankListIndex == greatestFileDomainAggRank) {
+ if (targetAggsForMyDataFDEnd[numTargetAggs] > lastFileOffset)
+ targetAggsForMyDataFDEnd[numTargetAggs] = lastFileOffset;
+ }
+ targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+                        /* For the first additional file domain the source buffer offset
+                         * will be incremented relative to the state of this first main
+                         * loop, but for subsequent full file domains the offset will be
+                         * incremented by the size of the file domain.
+ */
+ if (additionalFDCounter == 0)
+ amountToAdvanceSBOffsetForFD = (fd_end[prevAggRankListIndex] - blockStart) + (ADIO_Offset_CA)1;
+ else
+ amountToAdvanceSBOffsetForFD = (fd_end[prevAggRankListIndex] - fd_start[prevAggRankListIndex]) +(ADIO_Offset_CA)1;
+
+ if (bufTypeIsContig) {
+ HDassert(numTargetAggs > 0);
+ if (currentFDSourceBufferState[numTargetAggs].sourceBufferOffset == -1) {
+ if (additionalFDCounter == 0) { // first file domain, still use the current data counter
+ currentFDSourceBufferState[numTargetAggs].sourceBufferOffset =
+ currentSourceBufferOffset+amountToAdvanceSBOffsetForFD;
+ }
+ else { // 2nd file domain, advance full file domain from last source buffer state
+ currentFDSourceBufferState[numTargetAggs].sourceBufferOffset =
+ currentFDSourceBufferState[numTargetAggs-1].sourceBufferOffset+amountToAdvanceSBOffsetForFD;
+ }
+#ifdef onesidedtrace
+ printf("Rank %d - Crossed into new FD - for agg %d sourceBufferOffset initialized to %ld amountToAdvanceSBOffsetForFD is %ld\n",myrank,numTargetAggs,currentFDSourceBufferState[numTargetAggs].sourceBufferOffset,amountToAdvanceSBOffsetForFD);
+#endif
+ }
+ }
+ else if (currentFDSourceBufferState[numTargetAggs].indiceOffset == -1) {
+                    /* non-contiguous source buffer */
+ HDassert(numTargetAggs > 0);
+
+                    /* Initialize the source buffer state appropriately and then
+                     * advance it with the H5FD_mpio_nc_buffer_advance function.
+ */
+ if (additionalFDCounter == 0) {
+ // first file domain, still use the current data counter
+ currentFDSourceBufferState[numTargetAggs].indiceOffset =
+ currentIndiceOffset;
+ currentFDSourceBufferState[numTargetAggs].bufTypeExtent = bufTypeExtent;
+ currentFDSourceBufferState[numTargetAggs].dataTypeExtent =
+ currentDataTypeExtent;
+ currentFDSourceBufferState[numTargetAggs].flatBufIndice =
+ currentFlatBufIndice;
+ }
+ else {
+ // 2nd file domain, advance full file domain from last source buffer state
+ currentFDSourceBufferState[numTargetAggs].indiceOffset =
+ currentFDSourceBufferState[numTargetAggs-1].indiceOffset;
+ currentFDSourceBufferState[numTargetAggs].bufTypeExtent =
+ currentFDSourceBufferState[numTargetAggs-1].bufTypeExtent;
+ currentFDSourceBufferState[numTargetAggs].dataTypeExtent =
+ currentFDSourceBufferState[numTargetAggs-1].dataTypeExtent;
+ currentFDSourceBufferState[numTargetAggs].flatBufIndice =
+ currentFDSourceBufferState[numTargetAggs-1].flatBufIndice;
+ }
+ H5FD_mpio_nc_buffer_advance(((char*)buf), memFlatBuf,
+ (int)amountToAdvanceSBOffsetForFD, 1,
+ &currentFDSourceBufferState[numTargetAggs], NULL);
+#ifdef onesidedtrace
+ printf("Rank %d - Crossed into new FD - for agg %d dataTypeExtent initialized to %ld flatBufIndice to %d indiceOffset to %ld amountToAdvanceSBOffsetForFD is %d\n",myrank,numTargetAggs,currentFDSourceBufferState[numTargetAggs].dataTypeExtent,currentFDSourceBufferState[numTargetAggs].flatBufIndice,currentFDSourceBufferState[numTargetAggs].indiceOffset,amountToAdvanceSBOffsetForFD);
+#endif
+ }
+ additionalFDCounter++;
+
+#ifdef onesidedtrace
+ printf("Rank %d - block extended beyond fd init settings numTargetAggs %d offset_list[%d] with value %ld past fd border %ld with len %ld\n",myrank,numTargetAggs,blockIter,offset_list[blockIter],fd_start[currentAggRankListIndex],len_list[blockIter]);
+#endif
+ intraRoundCollBufsizeOffset = fd_start[currentAggRankListIndex] + coll_bufsize;
+ targetAggsForMyDataLastOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+
+ } // if (blockEnd >= fd_start[currentAggRankListIndex])
+ } // while (blockEnd > fd_end[currentAggRankListIndex])
+ } // if (blockEnd > fd_end[currentAggRankListIndex])
+
+ /* If we are still in the same file domain / target agg but have gone
+ * past the coll_bufsize and need to advance to the next round -
+ * initialize tracking data appropriately.
+ */
+ if (blockEnd >= intraRoundCollBufsizeOffset) {
+ ADIO_Offset_CA currentBlockEnd = blockEnd;
+ while (currentBlockEnd >= intraRoundCollBufsizeOffset) {
+ targetAggsForMyDataCurrentRoundIter[numTargetAggs]++;
+ intraRoundCollBufsizeOffset += coll_bufsize;
+ targetAggsForMyDataFirstOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+ targetAggsForMyDataLastOffLenIndex[targetAggsForMyDataCurrentRoundIter[numTargetAggs]][numTargetAggs] = blockIter;
+#ifdef onesidedtrace
+ printf("Rank %d - smaller than fd currentBlockEnd is now %ld intraRoundCollBufsizeOffset is now %ld targetAggsForMyDataCurrentRoundIter[%d] is now %d\n",myrank,currentBlockEnd, intraRoundCollBufsizeOffset, numTargetAggs,targetAggsForMyDataCurrentRoundIter[numTargetAggs]);
+#endif
+ } // while (currentBlockEnd >= intraRoundCollBufsizeOffset)
+ } // if (blockEnd >= intraRoundCollBufsizeOffset)
+
+        /* Advance numTargetAggs if this is the last target offset, so that
+         * this one is included.
+ */
+ if (blockIter == (contig_access_count-1))
+ numTargetAggs++;
+ }
+
+#ifdef onesidedtrace
+ printf("Rank %d - numTargetAggs is %d\n",myrank,numTargetAggs);
+ for (i=0;i<numTargetAggs;i++) {
+ printf("Rank %d - targetAggsForMyDataCurrentRoundIter[%d] = %d\n",myrank,i,targetAggsForMyDataCurrentRoundIter[i]);
+ for (j=0;j<=targetAggsForMyDataCurrentRoundIter[i];j++) {
+ printf("Rank %d - targetAggsForMyDataFirstOffLenIndex[round %d][target agg %d] is %ld\n",myrank,j,i,targetAggsForMyDataFirstOffLenIndex[j][i]);
+ fflush(stdout);
+ printf("Rank %d - targetAggsForMyDataLastOffLenIndex[round %d][target agg %d] is %ld\n",myrank,j,i,targetAggsForMyDataLastOffLenIndex[j][i]);
+ fflush(stdout);
+ //printf("Rank %d - targetAggsForMyData[%d] is %d targetAggsForMyDataFDStart[%d] is %ld targetAggsForMyDataFDEnd is %ld targetAggsForMyDataFirstOffLenIndex is %d with value %ld targetAggsForMyDataLastOffLenIndex is %d with value %ld\n",myrank,i,targetAggsForMyData[i],i,targetAggsForMyDataFDStart[i],targetAggsForMyDataFDEnd[i],targetAggsForMyDataFirstOffLenIndex[j][i],offset_list[targetAggsForMyDataFirstOffLenIndex[j][i]],targetAggsForMyDataLastOffLenIndex[j][i],offset_list[targetAggsForMyDataLastOffLenIndex[j][i]]);
+ }
+ }
+ printf("Rank %d - About to leave (contig_access_count > 0) loop...\n",myrank);
+ fflush(stdout);
+#endif
+
+ } // if ((contig_access_count > 0) && (buf != NULL) && lenListOverZero)
+
+#ifdef onesidedtrace
+ printf("Rank %d - Done with (contig_access_count > 0) loop.\n",myrank);
+ fflush(stdout);
+#endif
+
+ H5MM_free(targetAggsForMyDataCurrentRoundIter);
+
+
+#ifdef onesidedtrace
+ printf("Rank %d - targetAggsForMyDataCurrentRoundIter freed.\n",myrank);
+ fflush(stdout);
+#endif
+
+ /* use the write buffer allocated in the file_open */
+ char *write_buf;
+ MPI_Win write_buf_window;
+ if(!ca_data->onesided_no_rmw) {
+ hole_found = 0;
+ }
+
+ /* Async I/O - Adjust if this is the "duplicate" buffer */
+ if (ca_data->use_dup) {
+ write_buf = ca_data->io_buf_d;
+ write_buf_window = ca_data->io_buf_window_d;
+ } else {
+ write_buf = ca_data->io_buf;
+ write_buf_window = ca_data->io_buf_window;
+ }
+
+#ifdef onesidedtrace
+ printf("Rank %d - write_buf and write_buf_window set.\n",myrank);
+ fflush(stdout);
+#endif
+
+ /* Counters to track the offset range being written by the used aggs.
+ */
+ ADIO_Offset_CA currentRoundFDStart = 0;
+ ADIO_Offset_CA currentRoundFDEnd = 0;
+
+ if (iAmUsedAgg) {
+ currentRoundFDStart = fd_start[myAggRank];
+ currentRoundFDEnd = fd_end[myAggRank];
+ if (myAggRank == smallestFileDomainAggRank) {
+ if (currentRoundFDStart < firstFileOffset)
+ currentRoundFDStart = firstFileOffset;
+ }
+ else if (myAggRank == greatestFileDomainAggRank) {
+ if (currentRoundFDEnd > lastFileOffset)
+ currentRoundFDEnd = lastFileOffset;
+ }
+#ifdef onesidedtrace
+ printf("Rank %d - iAmUsedAgg - currentRoundFDStart initialized to %ld currentRoundFDEnd to %ld\n",myrank,currentRoundFDStart,currentRoundFDEnd);
+#endif
+
+ if ((stripeSize > 0) && (segmentIter == 0)) {
+ stripe_parms->numStripesUsed = 0;
+ stripe_parms->stripeIOoffsets = (MPI_Offset *) H5MM_malloc(stripe_parms->stripesPerAgg*sizeof(MPI_Offset));
+ stripe_parms->stripeIOLens = (int *) H5MM_malloc(stripe_parms->stripesPerAgg*sizeof(int));
+ stripe_parms->amountOfStripedDataExpected = 0;
+ int stripeIter = 0;
+ for (stripeIter=0;stripeIter<stripe_parms->stripesPerAgg;stripeIter++) {
+ if (stripeIter == 0) {
+ stripe_parms->stripeIOoffsets[stripeIter] = currentRoundFDStart;
+ stripe_parms->stripeIOLens[stripeIter] = (int)(currentRoundFDEnd - currentRoundFDStart)+1;
+ stripe_parms->amountOfStripedDataExpected += (int)(currentRoundFDEnd - currentRoundFDStart)+1;
+ stripe_parms->numStripesUsed++;
+ }
+ else {
+ if (((currentRoundFDEnd + (ADIO_Offset_CA)1 + ((ADIO_Offset_CA)stripeIter * stripe_parms->segmentLen))) > stripe_parms->stripedLastFileOffset) {
+ if (((currentRoundFDEnd + (ADIO_Offset_CA)1 - (ADIO_Offset_CA)(stripe_parms->stripeSize) + ((ADIO_Offset_CA)stripeIter * stripe_parms->segmentLen))) <= stripe_parms->stripedLastFileOffset) {
+ stripe_parms->stripeIOoffsets[stripeIter] = (currentRoundFDEnd + (ADIO_Offset_CA)1) - (ADIO_Offset_CA)(stripe_parms->stripeSize) + ((ADIO_Offset_CA)stripeIter * stripe_parms->segmentLen);
+ stripe_parms->stripeIOLens[stripeIter] = (int)(stripe_parms->stripedLastFileOffset - (currentRoundFDEnd + (ADIO_Offset_CA)1 - (ADIO_Offset_CA)(stripe_parms->stripeSize) + ((ADIO_Offset_CA)stripeIter * stripe_parms->segmentLen)) + (ADIO_Offset_CA)1);
+ stripe_parms->amountOfStripedDataExpected += (int)(stripe_parms->stripedLastFileOffset - (currentRoundFDEnd + (ADIO_Offset_CA)1 - (ADIO_Offset_CA)(stripe_parms->stripeSize) + ((ADIO_Offset_CA)stripeIter * stripe_parms->segmentLen)) + (ADIO_Offset_CA)1);
+ stripe_parms->numStripesUsed++;
+ }
+ }
+ else {
+ stripe_parms->stripeIOoffsets[stripeIter] = (currentRoundFDEnd + (ADIO_Offset_CA)1) - (ADIO_Offset_CA)(stripe_parms->stripeSize) + ((ADIO_Offset_CA)stripeIter * stripe_parms->segmentLen);
+ stripe_parms->stripeIOLens[stripeIter] = stripe_parms->stripeSize;
+ stripe_parms->amountOfStripedDataExpected += stripe_parms->stripeSize;
+ stripe_parms->numStripesUsed++;
+ }
+ }
+ } // for-loop
+#ifdef onesidedtrace
+ printf("Rank %d - stripe_parms->amountOfStripedDataExpected is %d stripe_parms->numStripesUsed is %d offsets and lengths are ",myrank,stripe_parms->amountOfStripedDataExpected,stripe_parms->numStripesUsed);
+ for (i=0;i<stripe_parms->numStripesUsed;i++) {
+ printf("%ld %ld --",stripe_parms->stripeIOoffsets[i],stripe_parms->stripeIOLens[i]);
+ }
+ printf("\n");
+#endif
+ } // if ((stripe_parms->stripeSize>0) && (stripe_parms->segmentIter==0))
+
+ if (ca_data->onesided_always_rmw && ((stripeSize==0) || (stripe_parms->segmentIter==0))) { // read in the first buffer
+ ADIO_Offset_CA tmpCurrentRoundFDEnd = 0;
+ if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
+ if (myAggRank == greatestFileDomainAggRank) {
+ if (fd_end[myAggRank] > lastFileOffset)
+ tmpCurrentRoundFDEnd = lastFileOffset;
+ else
+ tmpCurrentRoundFDEnd = fd_end[myAggRank];
+ }
+ else
+ tmpCurrentRoundFDEnd = fd_end[myAggRank];
+ }
+ else
+ tmpCurrentRoundFDEnd = currentRoundFDStart + coll_bufsize - (ADIO_Offset_CA)1;
+#ifdef onesidedtrace
+ printf("Rank %d - ca_data->onesided_always_rmw - first buffer pre-read for file offsets %ld to %ld total is %d\n",myrank,currentRoundFDStart,tmpCurrentRoundFDEnd,(int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1);
+#endif
+ if (stripeSize==0) {
+ MPI_File_read_at(ca_data->fh, currentRoundFDStart, write_buf, (int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1,
+ MPI_BYTE, error_code);
+ }
+ else {
+ /* pre-read the entire batch of stripes we will do before writing */
+ int stripeIter = 0;
+ for (stripeIter=0;stripeIter<stripe_parms->numStripesUsed;stripeIter++)
+ MPI_File_read_at(ca_data->fh, stripe_parms->stripeIOoffsets[stripeIter], (char*)write_buf + ((ADIO_Offset_CA)stripeIter * (ADIO_Offset_CA)stripeSize), stripe_parms->stripeIOLens[stripeIter], MPI_BYTE, error_code);
+ }
+ }
+
+ } // if iAmUsedAgg
+
+ if (ca_data->onesided_always_rmw && ((stripeSize == 0) || (segmentIter == 0))) // wait until the first buffer is read
+ MPI_Barrier(ca_data->comm);
+
+#ifdef onesidedtrace
+ MPI_Barrier(ca_data->comm);
+ if(myrank==0) { printf("\n\n"); fflush(stdout); }
+ MPI_Barrier(ca_data->comm);
+ printf("Rank %d is waiting at barrier between main loops.\n", myrank);
+ printf("Rank %d -- numberOfRounds = %d, contig_access_count = %d, numTargetAggs = %d\n", myrank, numberOfRounds, contig_access_count, numTargetAggs);
+ fflush(stdout);
+ MPI_Barrier(ca_data->comm);
+ if(myrank==0) { printf("\n\n"); fflush(stdout); }
+ MPI_Barrier(ca_data->comm);
+#endif
+
+    /* This is the second main loop of the algorithm, actually a nested loop of target aggs within rounds. There are 2 flavors of this.
+     * For onesided_write_aggmethod of 1, each nested iteration for the target
+     * agg does an mpi_put on a contiguous chunk using a primitive datatype
+     * determined using the data structures from the first main loop. For
+     * onesided_write_aggmethod of 2, each nested iteration for the target agg
+     * builds up data used to create a derived datatype for 1 mpi_put that is done for the target agg for each round.
+     * To support lustre there will need to be an additional layer of nesting
+     * for the multiple file domains within target aggs.
+ */
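+    /* Minimal sketch of the two RMA patterns described above (illustrative only;
+     * 'agg', 'disp', and 'n' are placeholders, and the real code below also handles
+     * packing of non-contiguous source data and window locking per target agg):
+     *
+     *   // aggmethod 1: one MPI_Put per contiguous chunk, MPI_BYTE on both sides
+     *   MPI_Put(srcChunk, n, MPI_BYTE, agg, disp, n, MPI_BYTE, write_buf_window);
+     *
+     *   // aggmethod 2: describe all chunks for this round/agg with struct types,
+     *   // then issue a single MPI_Put per target agg per round
+     *   MPI_Type_create_struct(count, blocklens, srcDisps, types, &srcType);
+     *   MPI_Type_create_struct(count, blocklens, tgtDisps, types, &tgtType);
+     *   MPI_Type_commit(&srcType);
+     *   MPI_Type_commit(&tgtType);
+     *   MPI_Put(srcBase, 1, srcType, agg, 0, 1, tgtType, write_buf_window);
+     */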
+ int roundIter;
+
+ for (roundIter=0;roundIter<numberOfRounds;roundIter++) {
+ if ((contig_access_count > 0) && (buf != NULL) && lenListOverZero) {
+
+ int aggIter;
+ for (aggIter=0;aggIter<numTargetAggs;aggIter++) {
+
+ int numBytesPutThisAggRound = 0;
+ /* If we have data for the round/agg process it.
+          /* If we have data for this round/agg, process it.
+ if (targetAggsForMyDataFirstOffLenIndex[roundIter][aggIter] != -1) {
+ ADIO_Offset_CA currentRoundFDStartForMyTargetAgg = (ADIO_Offset_CA)((ADIO_Offset_CA)targetAggsForMyDataFDStart[aggIter] + (ADIO_Offset_CA)((ADIO_Offset_CA)roundIter*coll_bufsize));
+ ADIO_Offset_CA currentRoundFDEndForMyTargetAgg = (ADIO_Offset_CA)((ADIO_Offset_CA)targetAggsForMyDataFDStart[aggIter] + (ADIO_Offset_CA)((ADIO_Offset_CA)(roundIter+1)*coll_bufsize) - (ADIO_Offset_CA)1);
+
+ int targetAggContigAccessCount = 0;
+
+ /* These data structures are used for the derived datatype mpi_put
+ * in the onesided_write_aggmethod of 2 case.
+ */
+ int *targetAggBlockLengths=NULL;
+ MPI_Aint *targetAggDisplacements=NULL, *sourceBufferDisplacements=NULL;
+ MPI_Datatype *targetAggDataTypes=NULL;
+
+ char *derivedTypePackedSourceBuffer=NULL;
+ int derivedTypePackedSourceBufferOffset = 0;
+ int allocatedDerivedTypeArrays = 0;
+ ADIO_Offset_CA amountOfDataWrittenThisRoundAgg = 0;
+
+#ifdef onesidedtrace
+ printf("Rank %d - roundIter %d processing targetAggsForMyData %d \n",myrank,roundIter,targetAggsForMyData[aggIter]);
+#endif
+
+ /* Process the range of offsets for this target agg.
+ */
+ int offsetIter;
+ int startingOffLenIndex = targetAggsForMyDataFirstOffLenIndex[roundIter][aggIter], endingOffLenIndex = targetAggsForMyDataLastOffLenIndex[roundIter][aggIter];
+ for (offsetIter=startingOffLenIndex;offsetIter<=endingOffLenIndex;offsetIter++) {
+ if (currentRoundFDEndForMyTargetAgg > targetAggsForMyDataFDEnd[aggIter])
+ currentRoundFDEndForMyTargetAgg = targetAggsForMyDataFDEnd[aggIter];
+
+ ADIO_Offset_CA offsetStart = offset_list[offsetIter], offsetEnd = (offset_list[offsetIter]+len_list[offsetIter]-(ADIO_Offset_CA)1);
+
+#ifdef onesidedtrace
+ printf("Rank %d - roundIter %d target iter %d targetAggsForMyData is %d offset_list[%d] is %ld len_list[%d] is %ld targetAggsForMyDataFDStart is %ld targetAggsForMyDataFDEnd is %ld currentRoundFDStartForMyTargetAgg is %ld currentRoundFDEndForMyTargetAgg is %ld targetAggsForMyDataFirstOffLenIndex is %ld\n",
+ myrank,roundIter,aggIter,targetAggsForMyData[aggIter],offsetIter,offset_list[offsetIter],offsetIter,len_list[offsetIter],
+ targetAggsForMyDataFDStart[aggIter],targetAggsForMyDataFDEnd[aggIter],
+ currentRoundFDStartForMyTargetAgg,currentRoundFDEndForMyTargetAgg, targetAggsForMyDataFirstOffLenIndex[roundIter][aggIter]);
+#endif
+
+ /* Determine the amount of data and exact source buffer offsets to use.
+ */
+ int bufferAmountToSend = 0;
+
+ if ((offsetStart >= currentRoundFDStartForMyTargetAgg) && (offsetStart <= currentRoundFDEndForMyTargetAgg)) {
+ if (offsetEnd > currentRoundFDEndForMyTargetAgg)
+ bufferAmountToSend = (currentRoundFDEndForMyTargetAgg - offsetStart) +1;
+ else
+ bufferAmountToSend = (offsetEnd - offsetStart) +1;
+ }
+ else if ((offsetEnd >= currentRoundFDStartForMyTargetAgg) && (offsetEnd <= currentRoundFDEndForMyTargetAgg)) {
+ if (offsetEnd > currentRoundFDEndForMyTargetAgg)
+ bufferAmountToSend = (currentRoundFDEndForMyTargetAgg - currentRoundFDStartForMyTargetAgg) +1;
+ else
+ bufferAmountToSend = (offsetEnd - currentRoundFDStartForMyTargetAgg) +1;
+ if (offsetStart < currentRoundFDStartForMyTargetAgg) {
+ offsetStart = currentRoundFDStartForMyTargetAgg;
+ }
+ }
+ else if ((offsetStart <= currentRoundFDStartForMyTargetAgg) && (offsetEnd >= currentRoundFDEndForMyTargetAgg)) {
+ bufferAmountToSend = (currentRoundFDEndForMyTargetAgg - currentRoundFDStartForMyTargetAgg) +1;
+ offsetStart = currentRoundFDStartForMyTargetAgg;
+ }
+
+ numBytesPutThisAggRound += bufferAmountToSend;
+#ifdef onesidedtrace
+ printf("Rank %d - bufferAmountToSend is %d\n",myrank,bufferAmountToSend);
+#endif
+ if (bufferAmountToSend > 0) { /* we have data to send this round */
+
+ if (ca_data->onesided_write_aggmethod == 2) {
+ /* Only allocate these arrays if we are using method 2 and only do it once for this round/target agg.
+ */
+ if (!allocatedDerivedTypeArrays) {
+ targetAggBlockLengths = (int *)H5MM_malloc(maxNumContigOperations * sizeof(int));
+ targetAggDisplacements = (MPI_Aint *)H5MM_malloc(maxNumContigOperations * sizeof(MPI_Aint));
+ sourceBufferDisplacements = (MPI_Aint *)H5MM_malloc(maxNumContigOperations * sizeof(MPI_Aint));
+ targetAggDataTypes = (MPI_Datatype *)H5MM_malloc(maxNumContigOperations * sizeof(MPI_Datatype));
+ if (!bufTypeIsContig) {
+ int k;
+ for (k=targetAggsForMyDataFirstOffLenIndex[roundIter][aggIter];k<=targetAggsForMyDataLastOffLenIndex[roundIter][aggIter];k++)
+ amountOfDataWrittenThisRoundAgg += len_list[k];
+
+#ifdef onesidedtrace
+ printf("Rank %d - derivedTypePackedSourceBuffer mallocing %ld\n",myrank,amountOfDataWrittenThisRoundAgg);
+#endif
+
+ if (amountOfDataWrittenThisRoundAgg > 0)
+ derivedTypePackedSourceBuffer = (char *)H5MM_malloc(amountOfDataWrittenThisRoundAgg * sizeof(char));
+ else
+ derivedTypePackedSourceBuffer = NULL;
+ }
+ allocatedDerivedTypeArrays = 1;
+ }
+ }
+
+ /* Determine the offset into the target window.
+ */
+ MPI_Aint targetDisplacementToUseThisRound = (MPI_Aint) (offsetStart - currentRoundFDStartForMyTargetAgg) + ((MPI_Aint)(segmentIter)*(MPI_Aint)(stripeSize));
+
+
+              /* For onesided_write_aggmethod of 1, do the mpi_put using the primitive MPI_BYTE type for each contiguous
+               * chunk in the target; if the source data is non-contiguous then pack the data first.
+ */
+ if (ca_data->onesided_write_aggmethod == 1) {
+
+ MPI_Win_lock(MPI_LOCK_SHARED, targetAggsForMyData[aggIter], MPI_MODE_NOCHECK, write_buf_window);
+
+ char *putSourceData;
+ if (bufTypeIsContig) {
+#ifdef onesidedtrace
+ printf("Rank %d - ca_data->onesided_write_aggmethod == 1 currentFDSourceBufferState[%d].sourceBufferOffset is %ld bufferAmountToSend is %d targetAggsForMyData[aggIter] is %d targetDisplacementToUseThisRound is %d write_buf_window is %016lx\n",myrank,aggIter,currentFDSourceBufferState[aggIter].sourceBufferOffset,bufferAmountToSend,targetAggsForMyData[aggIter],targetDisplacementToUseThisRound,write_buf_window);
+ fflush(stdout);
+#endif
+ MPI_Put(((char*)buf) + currentFDSourceBufferState[aggIter].sourceBufferOffset,bufferAmountToSend, MPI_BYTE,targetAggsForMyData[aggIter],targetDisplacementToUseThisRound, bufferAmountToSend,MPI_BYTE,write_buf_window);
+ currentFDSourceBufferState[aggIter].sourceBufferOffset += (ADIO_Offset_CA)bufferAmountToSend;
+ }
+ else {
+ putSourceData = (char *) H5MM_malloc(bufferAmountToSend*sizeof(char));
+ H5FD_mpio_nc_buffer_advance(((char*)buf), memFlatBuf, bufferAmountToSend, 1, &currentFDSourceBufferState[aggIter], putSourceData);
+
+ MPI_Put(putSourceData,bufferAmountToSend, MPI_BYTE,targetAggsForMyData[aggIter],targetDisplacementToUseThisRound, bufferAmountToSend,MPI_BYTE,write_buf_window);
+ }
+
+ MPI_Win_unlock(targetAggsForMyData[aggIter], write_buf_window);
+
+ if (!bufTypeIsContig)
+ H5MM_free(putSourceData);
+ }
+
+
+ /* For aggmethod 2, populate the data structures for this round/agg for this offset iter
+ * to be used subsequently when building the derived type for 1 mpi_put for all the data for this
+ * round/agg.
+ */
+ else if (ca_data->onesided_write_aggmethod == 2) {
+
+ if (bufTypeIsContig) {
+ targetAggBlockLengths[targetAggContigAccessCount]= bufferAmountToSend;
+ targetAggDataTypes[targetAggContigAccessCount] = MPI_BYTE;
+ targetAggDisplacements[targetAggContigAccessCount] = targetDisplacementToUseThisRound;
+ sourceBufferDisplacements[targetAggContigAccessCount] = (MPI_Aint)currentFDSourceBufferState[aggIter].sourceBufferOffset;
+ currentFDSourceBufferState[aggIter].sourceBufferOffset += (ADIO_Offset_CA)bufferAmountToSend;
+ targetAggContigAccessCount++;
+ }
+ else {
+ H5FD_mpio_nc_buffer_advance(((char*)buf), memFlatBuf, bufferAmountToSend, 1, &currentFDSourceBufferState[aggIter], &derivedTypePackedSourceBuffer[derivedTypePackedSourceBufferOffset]);
+ targetAggBlockLengths[targetAggContigAccessCount]= bufferAmountToSend;
+ targetAggDataTypes[targetAggContigAccessCount] = MPI_BYTE;
+ targetAggDisplacements[targetAggContigAccessCount] = targetDisplacementToUseThisRound;
+ sourceBufferDisplacements[targetAggContigAccessCount] = (MPI_Aint)derivedTypePackedSourceBufferOffset;
+ targetAggContigAccessCount++;
+ derivedTypePackedSourceBufferOffset += (ADIO_Offset_CA)bufferAmountToSend;
+ }
+ }
+
+#ifdef onesidedtrace
+ printf("Rank %d - roundIter %d bufferAmountToSend is %d offsetStart is %ld currentRoundFDStartForMyTargetAgg is %ld currentRoundFDEndForMyTargetAgg is %ld targetDisplacementToUseThisRound is %ld targetAggsForMyDataFDStart[aggIter] is %ld\n",myrank,roundIter, bufferAmountToSend, offsetStart,currentRoundFDStartForMyTargetAgg,currentRoundFDEndForMyTargetAgg,targetDisplacementToUseThisRound,targetAggsForMyDataFDStart[aggIter]);
+#endif
+
+ } // bufferAmountToSend > 0
+ } // contig list
+
+          /* For aggmethod 2, now build the derived type using the data from this round/agg and do a single mpi_put.
+ */
+ if (ca_data->onesided_write_aggmethod == 2) {
+
+ MPI_Datatype sourceBufferDerivedDataType, targetBufferDerivedDataType;
+ MPI_Type_create_struct(targetAggContigAccessCount, targetAggBlockLengths, sourceBufferDisplacements, targetAggDataTypes, &sourceBufferDerivedDataType);
+ MPI_Type_commit(&sourceBufferDerivedDataType);
+ MPI_Type_create_struct(targetAggContigAccessCount, targetAggBlockLengths, targetAggDisplacements, targetAggDataTypes, &targetBufferDerivedDataType);
+ MPI_Type_commit(&targetBufferDerivedDataType);
+
+#ifdef onesidedtrace
+ printf("Rank %d - mpi_put of derived type to agg %d targetAggContigAccessCount is %d\n",myrank,targetAggsForMyData[aggIter],targetAggContigAccessCount);
+#endif
+
+ if (targetAggContigAccessCount > 0) {
+
+#ifdef onesidedtrace
+ printf("Rank %d - Calling 1st MPI_Win_lock\n",myrank);
+ fflush(stdout);
+#endif
+
+ MPI_Win_lock(MPI_LOCK_SHARED, targetAggsForMyData[aggIter], MPI_MODE_NOCHECK, write_buf_window);
+
+ if (bufTypeIsContig) {
+#ifdef onesidedtrace
+ printf("Rank %d - Calling MPI_Put with bufTypeIsContig==TRUE, aggIter %ld, targetAggsForMyData[aggIter] is %ld\n",myrank,aggIter,targetAggsForMyData[aggIter]);
+ fflush(stdout);
+#endif
+ MPI_Put(((char*)buf),1, sourceBufferDerivedDataType,targetAggsForMyData[aggIter],0, 1,targetBufferDerivedDataType,write_buf_window);
+ }
+ else {
+
+#ifdef onesidedtrace
+ printf("Rank %d - Calling MPI_Put with bufTypeIsContig==FALSE, aggIter %ld, targetAggsForMyData[aggIter] is %ld\n",myrank,aggIter,targetAggsForMyData[aggIter]);
+ fflush(stdout);
+#endif
+ MPI_Put(derivedTypePackedSourceBuffer,1, sourceBufferDerivedDataType,targetAggsForMyData[aggIter],0, 1,targetBufferDerivedDataType,write_buf_window);
+ }
+#ifdef onesidedtrace
+ printf("Rank %d - Calling 1st MPI_Win_UNlock\n",myrank);
+ fflush(stdout);
+#endif
+ MPI_Win_unlock(targetAggsForMyData[aggIter], write_buf_window);
+ //MPI_Win_fence(0, write_buf_window);
+ }
+
+ if (allocatedDerivedTypeArrays) {
+ H5MM_free(targetAggBlockLengths);
+ H5MM_free(targetAggDisplacements);
+ H5MM_free(targetAggDataTypes);
+ H5MM_free(sourceBufferDisplacements);
+ if (!bufTypeIsContig)
+ if (derivedTypePackedSourceBuffer != NULL)
+ H5MM_free(derivedTypePackedSourceBuffer);
+ }
+ if (targetAggContigAccessCount > 0) {
+ MPI_Type_free(&sourceBufferDerivedDataType);
+ MPI_Type_free(&targetBufferDerivedDataType);
+ }
+
+ }
+
+ if (!ca_data->onesided_no_rmw) {
+
+ MPI_Win io_buf_put_amounts_window_use = ca_data->io_buf_put_amounts_window;
+ if (ca_data->use_dup) {
+ io_buf_put_amounts_window_use = ca_data->io_buf_put_amounts_window_d;
+ }
+#ifdef onesidedtrace
+ printf("Rank %d - Calling 2nd MPI_Win_lock\n",myrank);
+ fflush(stdout);
+#endif
+ MPI_Win_lock(MPI_LOCK_SHARED, targetAggsForMyData[aggIter], MPI_MODE_NOCHECK, io_buf_put_amounts_window_use);
+#ifdef onesidedtrace
+ printf("Rank %d - Calling MPI_Accumulate\n",myrank);
+ fflush(stdout);
+#endif
+ MPI_Accumulate(&numBytesPutThisAggRound,1, MPI_INT,targetAggsForMyData[aggIter],0, 1, MPI_INT, MPI_SUM, io_buf_put_amounts_window_use);
+#ifdef onesidedtrace
+ printf("Rank %d - Calling 2nd MPI_Win_UNlock\n",myrank);
+ fflush(stdout);
+#endif
+ MPI_Win_unlock(targetAggsForMyData[aggIter], io_buf_put_amounts_window_use);
+ }
+
+        } // targetAggsForMyDataFirstOffLenIndex != -1
+ } // target aggs
+
+ if (stripeSize > 0) {
+ stripe_parms->lastDataTypeExtent = currentFDSourceBufferState[numTargetAggs-1].dataTypeExtent;
+ stripe_parms->lastFlatBufIndice = currentFDSourceBufferState[numTargetAggs-1].flatBufIndice;
+ stripe_parms->lastIndiceOffset = currentFDSourceBufferState[numTargetAggs-1].indiceOffset;
+#ifdef onesidedtrace
+ printf("Rank %d - setting stripe_parms->lastDataTypeExtent %ld stripe_parms->lastFlatBufIndice %d stripe_parms->lastIndiceOffset %ld\n",myrank,stripe_parms->lastDataTypeExtent,stripe_parms->lastFlatBufIndice,stripe_parms->lastIndiceOffset);
+#endif
+ }
+
+    } // contig_access_count > 0
+
+ /* Synchronize all procs before the file write */
+ if ((stripeSize == 0) || (stripe_parms->flushCB)) {
+#ifdef onesidedtrace
+ printf("Rank %d - first barrier roundIter %d\n",myrank,roundIter);
+#endif
+ MPI_Barrier(ca_data->comm);
+ }
+
+ if ((iAmUsedAgg || stripe_parms->iWasUsedStripingAgg) && ((stripeSize == 0) || (stripe_parms->flushCB))) {
+ stripe_parms->iWasUsedStripingAgg = 0;
+ /* Determine what offsets define the portion of the file domain the agg is writing this round.
+ */
+ if (iAmUsedAgg) {
+ if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
+ if (myAggRank == greatestFileDomainAggRank) {
+ if (fd_end[myAggRank] > lastFileOffset)
+ currentRoundFDEnd = lastFileOffset;
+ else
+ currentRoundFDEnd = fd_end[myAggRank];
+ }
+ else
+ currentRoundFDEnd = fd_end[myAggRank];
+ }
+ else
+ currentRoundFDEnd = currentRoundFDStart + coll_bufsize - (ADIO_Offset_CA)1;
+#ifdef onesidedtrace
+ printf("current used agg about to writecontig - currentRoundFDStart is %ld currentRoundFDEnd is %ld within file domain %ld to %ld\n",currentRoundFDStart,currentRoundFDEnd,fd_start[myAggRank],fd_end[myAggRank]);
+#endif
+ }
+#ifdef onesidedtrace
+ else {
+ printf("former used agg about to writecontig\n");
+ }
+#endif
+ int doWriteContig = 1;
+ int tmp_put_amt = ca_data->io_buf_put_amounts;
+ if (ca_data->use_dup) tmp_put_amt = ca_data->io_buf_put_amounts_d;
+
+ if (!ca_data->onesided_no_rmw) {
+ if (stripeSize == 0) {
+ if (tmp_put_amt != ((int)(currentRoundFDEnd - currentRoundFDStart)+1)) {
+ doWriteContig = 0;
+ hole_found = 1;
+#ifdef onesidedtrace
+ printf("hole found --- ca_data->io_buf_put_amounts is %d currentRoundFDEnd is %ld currentRoundFDStart is %ld on roundIter %d\n",tmp_put_amt,currentRoundFDEnd,currentRoundFDStart,roundIter);
+#endif
+ }
+ }
+ else { // file striping
+ if (tmp_put_amt != stripe_parms->amountOfStripedDataExpected) {
+ doWriteContig = 0;
+ hole_found = 1;
+#ifdef onesidedtrace
+ printf("striping hole found --- ca_data->io_buf_put_amounts is %d stripe_parms->amountOfStripedDataExpected is %d on roundIter %d\n",tmp_put_amt,stripe_parms->amountOfStripedDataExpected,roundIter);
+#endif
+ }
+ }
+ if (ca_data->use_dup)
+ ca_data->io_buf_put_amounts_d = 0;
+ else
+ ca_data->io_buf_put_amounts = 0;
+ }
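+        /* Hole-detection sketch: the accounting above compares the byte count accumulated into
+         * io_buf_put_amounts by the ranks that targeted this agg against the size this agg
+         * expects for the round (the FD slice, or amountOfStripedDataExpected when striping).
+         * Any shortfall means a hole, so the agg skips the contiguous write for this round
+         * (doWriteContig = 0) and flags hole_found for the caller.
+         */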
+
+ if (doWriteContig) {
+ if (stripeSize > 0) {
+#ifdef onesidedtrace
+ printf("about to write out %d stripes\n",stripe_parms->numStripesUsed);
+#endif
+
+ int stripeIter = 0;
+ for (stripeIter=0;stripeIter<stripe_parms->numStripesUsed;stripeIter++) {
+
+#ifdef onesidedtrace
+ printf("writing write_buf offset %ld len %ld file offset %ld\n",((ADIO_Offset_CA)stripeIter * (ADIO_Offset_CA)(stripeSize)),stripe_parms->stripeIOLens[stripeIter],stripe_parms->stripeIOoffsets[stripeIter]);
+#endif
+ if (ca_data->check_req) {
+ MPIO_Wait(&ca_data->io_Request, error_code);
+ ca_data->check_req = 0;
+ }
+
+ MPI_File_iwrite_at(ca_data->fh, stripe_parms->stripeIOoffsets[stripeIter], (char*)(write_buf + ((ADIO_Offset_CA)stripeIter * (ADIO_Offset_CA)(stripeSize))), stripe_parms->stripeIOLens[stripeIter], MPI_BYTE, &ca_data->io_Request);
+
+ if (ca_data->async_io_outer && 0) {
+ ca_data->check_req = 1;
+ } else {
+ MPIO_Wait(&ca_data->io_Request, error_code);
+ ca_data->check_req = 0;
+ }
+
+ }
+ H5MM_free(stripe_parms->stripeIOLens);
+ H5MM_free(stripe_parms->stripeIOoffsets);
+ }
+ else {
+
+ if (ca_data->check_req) {
+ MPIO_Wait(&ca_data->io_Request, error_code);
+ ca_data->check_req = 0;
+ }
+
+ MPI_File_iwrite_at(ca_data->fh, currentRoundFDStart, write_buf, (int)(currentRoundFDEnd - currentRoundFDStart)+1, MPI_BYTE, &ca_data->io_Request);
+
+ if (ca_data->async_io_outer) {
+ ca_data->check_req = 1;
+ } else {
+ MPIO_Wait(&ca_data->io_Request, error_code);
+ ca_data->check_req = 0;
+ }
+
+ }
+ }
+ } // iAmUsedAgg
+
+ if (iAmUsedAgg && stripeSize == 0) {
+ currentRoundFDStart += coll_bufsize;
+
+ if (ca_data->onesided_always_rmw && (roundIter<(numberOfRounds-1))) { // read in the buffer for the next round unless this is the last round
+ ADIO_Offset_CA tmpCurrentRoundFDEnd = 0;
+ if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
+ if (myAggRank == greatestFileDomainAggRank) {
+ if (fd_end[myAggRank] > lastFileOffset)
+ tmpCurrentRoundFDEnd = lastFileOffset;
+ else
+ tmpCurrentRoundFDEnd = fd_end[myAggRank];
+ }
+ else
+ tmpCurrentRoundFDEnd = fd_end[myAggRank];
+ }
+ else
+ tmpCurrentRoundFDEnd = currentRoundFDStart + coll_bufsize - (ADIO_Offset_CA)1;
+#ifdef onesidedtrace
+ printf("Rank %d - ca_data->onesided_always_rmw - round %d buffer pre-read for file offsets %ld to %ld total is %d\n",myrank,roundIter, currentRoundFDStart,tmpCurrentRoundFDEnd,(int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1);
+#endif
+ MPI_File_read_at(ca_data->fh, currentRoundFDStart, write_buf, (int)(tmpCurrentRoundFDEnd - currentRoundFDStart)+1,
+ MPI_BYTE, error_code);
+ }
+ }
+
+ if (roundIter<(numberOfRounds-1)) {
+#ifdef onesidedtrace
+ printf("second barrier roundIter %d --- waiting in loop this time\n",roundIter);
+#endif
+ MPI_Barrier(ca_data->comm);
+ }
+
+ } /* for-loop roundIter */
+
+#ifdef onesidedtrace
+ printf("freeing datastructures\n");
+#endif
+ H5MM_free(targetAggsForMyData);
+ H5MM_free(targetAggsForMyDataFDStart);
+ H5MM_free(targetAggsForMyDataFDEnd);
+
+ for (i=0;i<numberOfRounds;i++) {
+ H5MM_free(targetAggsForMyDataFirstOffLenIndex[i]);
+ H5MM_free(targetAggsForMyDataLastOffLenIndex[i]);
+ }
+ H5MM_free(targetAggsForMyDataFirstOffLenIndex);
+ H5MM_free(targetAggsForMyDataLastOffLenIndex);
+
+ H5MM_free(currentFDSourceBufferState);
+
+ return;
+} /* H5FD_mpio_ccio_osagg_write */
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_ccio_osagg_read
+ *
+ * Purpose: One-sided collective READ implementation.
+ *
+ *-------------------------------------------------------------------------
+ */
+void H5FD_mpio_ccio_osagg_read(CustomAgg_FH_Data ca_data,
+ ADIO_Offset_CA *offset_list,
+ ADIO_Offset_CA *len_list,
+ int contig_access_count,
+ const void *buf,
+ H5S_flatbuf_t *flatBuf,
+ int *error_code,
+ ADIO_Offset_CA firstFileOffset,
+ ADIO_Offset_CA lastFileOffset,
+ int numNonZeroDataOffsets,
+ ADIO_Offset_CA *fd_start,
+ ADIO_Offset_CA* fd_end,
+ FS_Block_Parms *stripe_parms,
+ int do_file_read)
+{
+ int i,j; /* generic iterators */
+
+ /*
+ * Make local copy of certain ADIOI_OneSidedStripeParms elements for
+ * faster access - pay for pointer dereference only once.
+ */
+ int stripeSize = stripe_parms->stripeSize;
+ int segmentIter = stripe_parms->segmentIter;
+ hsize_t bufTypeExtent = stripe_parms->bufTypeExtent;
+
+ if ((stripeSize > 0) && stripe_parms->firstStripedIOCall)
+ stripe_parms->iWasUsedStripingAgg = 0;
+
+#ifdef onesidedtrace
+ if (buf == NULL) {
+ printf("H5FD_mpio_ccio_osagg_read - buf is NULL contig_access_count is %d\n",contig_access_count);
+ for (i=0;i<contig_access_count;i++)
+ printf("offset_list[%d] is %ld len_list[%d] is %ld\n", i,offset_list[i],i,len_list[i]);
+ }
+ if (contig_access_count < 0) {
+ printf("H5FD_mpio_ccio_osagg_read - contig_access_count of %d is less than 0\n",contig_access_count);
+ }
+#endif
+
+ int lenListOverZero = 0;
+ for (i=0;((i<contig_access_count) && (!lenListOverZero));i++) {
+ if (len_list[i] > 0) lenListOverZero = 1;
+ }
+
+ *error_code = MPI_SUCCESS; /* initialize to success */
+
+ MPI_Status status;
+
+ pthread_t io_thread;
+ void *thread_ret;
+ ThreadFuncData io_thread_args;
+
+ int nprocs,myrank;
+ MPI_Comm_size(ca_data->comm, &nprocs);
+ MPI_Comm_rank(ca_data->comm, &myrank);
+
+#ifdef onesidedtrace
+ printf("Rank %d - H5FD_mpio_ccio_osagg_read started\n",myrank);
+#endif
+
+ if (ca_data->io_buf_window == MPI_WIN_NULL || ca_data->io_buf_put_amounts_window == MPI_WIN_NULL)
+ {
+ HDF5_ccio_win_setup(ca_data, nprocs);
+ }
+
+    /* This flag denotes whether the source datatype is contiguous, which is referenced throughout the algorithm
+     * and defines how the source buffer offsets and data chunks are determined. If the value is 1 (true - contiguous data)
+     * the logic is much simpler: the source buffer offset for a given target offset simply increases linearly
+     * by the chunk sizes being read. If the value is 0 (non-contiguous) then these values are based on calculations
+     * from the flattened source datatype.
+     */
+ int bufTypeIsContig;
+ if (flatBuf->count == 1)
+ bufTypeIsContig = 1;
+ else
+ bufTypeIsContig = 0;
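+    /* For illustration (hypothetical values, not taken from the surrounding code): a flatBuf with
+     * count == 1, blocklens[0] == 4096 and indices[0] == 0 describes one contiguous 4 KiB buffer,
+     * so bufTypeIsContig == 1 and buffer offsets advance linearly. A flatBuf with count == 3 and,
+     * say, blocklens == {1024, 1024, 1024}, indices == {0, 4096, 8192} describes three 1 KiB pieces
+     * spread over a 12 KiB extent, so bufTypeIsContig == 0 and the indice/extent bookkeeping below
+     * is needed.
+     */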
+
+ if (!bufTypeIsContig) {
+ /* For a non-contiguous source buffer set the extent. */
+ if ((stripeSize == 0) || stripe_parms->firstStripedIOCall) {
+ bufTypeExtent = flatBuf->extent;
+ }
+
+#ifdef onesidedtrace
+ printf("Rank %d - memFlatBuf->count is %d bufTypeExtent is %ld\n",myrank,flatBuf->count, bufTypeExtent);
+ for (i=0;i<flatBuf->count;i++)
+ printf("Rank %d - flatBuf->blocklens[%d] is %d flatBuf->indices[%d] is %ld\n",myrank,i,flatBuf->blocklens[i],i,flatBuf->indices[i]);
+#endif
+ }
+
+ int naggs = ca_data->cb_nodes;
+
+    /* Track the state of the source buffer used to feed the target data blocks.
+     * For GPFS the number of file domains per agg is always 1, so we only need one agg
+     * dimension to track the data; for Lustre we will need two dimensions (agg and file
+     * domain) since aggs serve multiple file domains. This structure is modified as the
+     * data is read to reflect the current state of the offset.
+     */
+
+#ifdef onesidedtrace
+    printf("Rank %d - sizeof(FDSourceBufferState_CA) is %d - make sure it is 32 bytes for optimal 32-byte memalign\n",myrank,sizeof(FDSourceBufferState_CA));
+#endif
+
+ FDSourceBufferState_CA *currentFDSourceBufferState = (FDSourceBufferState_CA *) H5MM_malloc(naggs * sizeof(FDSourceBufferState_CA));
+ for (i=0;i<naggs;i++) {
+ /* initialize based on the bufType to indicate that it is unset.
+ */
+ if (bufTypeIsContig) {
+ currentFDSourceBufferState[i].sourceBufferOffset = -1;
+ }
+ else {
+ currentFDSourceBufferState[i].indiceOffset = -1;
+ }
+ }
+
+#ifdef onesidedtrace
+ printf("Rank %d - H5FD_mpio_ccio_osagg_read bufTypeIsContig is %d contig_access_count is %d\n",myrank,bufTypeIsContig,contig_access_count);
+#endif
+
+ /* maxNumContigOperations keeps track of how many different chunks we will
+ * need to recv for the purpose of pre-allocating the data structures to
+ * hold them.
+ */
+ int maxNumContigOperations = contig_access_count;
+    int myAggRank = -1; /* if I am an aggregator this is my index into ranklist */
+ int iAmUsedAgg = 0; /* whether or not this rank is used as an aggregator. */
+
+ /* Make coll_bufsize an ADIO_Offset_CA since it is used in calculations with offsets.
+ */
+ ADIO_Offset_CA coll_bufsize = (ADIO_Offset_CA)(ca_data->cb_buffer_size);
+
+ /* Check if the I/O is (inner) asynchronous */
+ if (ca_data->async_io_inner == 1) {
+ /* split buffer in half for asynchronous I/O */
+ coll_bufsize = (ADIO_Offset_CA)(ca_data->cb_buffer_size/2);
+ }
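+    /* Rough sizing sketch (example numbers only): with cb_buffer_size = 16 MiB and async_io_inner
+     * enabled, each half of the collective buffer is 8 MiB, so one half can be drained by RMA gets
+     * while the I/O thread refills the other half.
+     */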
+
+ /* This logic defines values that are used later to determine what offsets define the portion
+ * of the file domain the agg is reading this round.
+ */
+ int greatestFileDomainAggRank = -1,smallestFileDomainAggRank = -1;
+ ADIO_Offset_CA greatestFileDomainOffset = 0;
+ ADIO_Offset_CA smallestFileDomainOffset = lastFileOffset;
+ for (j=0;j<naggs;j++) {
+ if (fd_end[j] > greatestFileDomainOffset) {
+ greatestFileDomainOffset = fd_end[j];
+ greatestFileDomainAggRank = j;
+ }
+ if (fd_start[j] < smallestFileDomainOffset) {
+ smallestFileDomainOffset = fd_start[j];
+ smallestFileDomainAggRank = j;
+ }
+ if (ca_data->ranklist[j] == myrank) {
+ myAggRank = j;
+ if (fd_end[j] > fd_start[j]) {
+ iAmUsedAgg = 1;
+ stripe_parms->iWasUsedStripingAgg = 1;
+ }
+ }
+ }
+
+#ifdef onesidedtrace
+ printf("Rank %d - contig_access_count is %d lastFileOffset is %ld firstFileOffset is %ld\n",myrank,contig_access_count,lastFileOffset,firstFileOffset);
+ for (j=0;j<contig_access_count;j++) {
+ printf("Rank %d - offset_list[%d]: %ld , len_list[%d]: %ld\n",myrank,j,offset_list[j],j,len_list[j]);
+ }
+#endif
+
+ /* Compute number of rounds.
+ */
+ int numberOfRounds = 0;
+ for (j=0;j<naggs;j++) {
+ int currentNumberOfRounds = (int)(((fd_end[j] - fd_start[j])+(ADIO_Offset_CA)1)/coll_bufsize);
+ if ( ( (ADIO_Offset_CA)currentNumberOfRounds*coll_bufsize ) < ((fd_end[j] - fd_start[j])+(ADIO_Offset_CA)1))
+ currentNumberOfRounds++;
+ if (currentNumberOfRounds > numberOfRounds)
+ numberOfRounds = currentNumberOfRounds;
+ }
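+    /* The loop above is a ceiling division: numberOfRounds = max over all aggs of
+     * ceil(fd_size / coll_bufsize). For example (illustrative numbers only), a 10 MiB file
+     * domain with a 4 MiB coll_bufsize needs 3 rounds: two full 4 MiB rounds plus a 2 MiB
+     * remainder.
+     */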
+
+    /* Data structures to track what data this compute process needs to receive from whom.
+ * For lustre they will all need another dimension for the file domain.
+ */
+ int *sourceAggsForMyData = (int *) H5MM_malloc(naggs * sizeof(int));
+ ADIO_Offset_CA *sourceAggsForMyDataFDStart = (ADIO_Offset_CA *)H5MM_malloc(naggs * sizeof(ADIO_Offset_CA));
+ ADIO_Offset_CA *sourceAggsForMyDataFDEnd = (ADIO_Offset_CA *)H5MM_malloc(naggs * sizeof(ADIO_Offset_CA));
+ int numSourceAggs = 0;
+
+    /* This data structure holds the beginning offset and len list index for the range to be read
+     * corresponding to the round and source agg. Initialize to -1 to denote being unset.
+     */
+ int **sourceAggsForMyDataFirstOffLenIndex = (int **) H5MM_malloc(numberOfRounds * sizeof(int *));
+ for (i = 0; i < numberOfRounds; i++) {
+ sourceAggsForMyDataFirstOffLenIndex[i] = (int *) H5MM_malloc(naggs * sizeof(int));
+ for (j = 0; j < naggs; j++)
+ sourceAggsForMyDataFirstOffLenIndex[i][j] = -1;
+ }
+
+    /* This data structure holds the ending offset and len list index for the range to be read
+     * corresponding to the round and source agg.
+     */
+ int **sourceAggsForMyDataLastOffLenIndex = (int **) H5MM_malloc(numberOfRounds * sizeof(int *));
+ for (i = 0; i < numberOfRounds; i++)
+ sourceAggsForMyDataLastOffLenIndex[i] = (int *) H5MM_malloc(naggs * sizeof(int));
+
+#ifdef onesidedtrace
+ printf("Rank %d - NumberOfRounds is %d\n",myrank,numberOfRounds);
+ for (i=0;i<naggs;i++)
+ printf("Rank %d - ca_data->ranklist[%d] is %d fd_start is %ld fd_end is %ld\n",myrank,i,ca_data->ranklist[i],fd_start[i],fd_end[i]);
+ for (j=0;j<contig_access_count;j++)
+ printf("Rank %d - offset_list[%d] is %ld len_list is %ld\n",myrank,j,offset_list[j],len_list[j]);
+#endif
+
+ int currentAggRankListIndex = 0;
+ int maxNumNonContigSourceChunks = 0;
+
+ ADIO_Offset_CA currentRecvBufferOffset = 0;
+ ADIO_Offset_CA currentDataTypeExtent = 0;
+ int currentFlatBufIndice=0;
+ ADIO_Offset_CA currentIndiceOffset = 0;
+
+ /* Remember where we left off in the buffer when reading stripes. */
+ if ((stripeSize > 0) && !stripe_parms->firstStripedIOCall) {
+ currentDataTypeExtent = stripe_parms->lastDataTypeExtent;
+ currentFlatBufIndice = stripe_parms->lastFlatBufIndice;
+ currentIndiceOffset = stripe_parms->lastIndiceOffset;
+ }
+
+ /* This denotes the coll_bufsize boundaries within the source buffer for reading for 1 round.
+ */
+ ADIO_Offset_CA intraRoundCollBufsizeOffset = 0;
+
+    /* This data structure tracks which source aggs need to be read from on which rounds.
+     */
+ int *sourceAggsForMyDataCurrentRoundIter = (int *) H5MM_malloc(naggs * sizeof(int));
+ for (i = 0; i < naggs; i++)
+ sourceAggsForMyDataCurrentRoundIter[i] = 0;
+
+
+    /* This is the first of the two main loops in this algorithm.
+     * The purpose of this loop is essentially to populate
+     * the data structures defined above for which read data blocks
+     * need to come from where (source agg and file domain) and when
+     * (round iter). For lustre an additional layer of
+     * nesting will be required for the multiple file domains
+     * within the source agg.
+     */
+ if ((contig_access_count > 0) && (buf != NULL) && lenListOverZero) {
+ int blockIter;
+ for (blockIter = 0; blockIter < contig_access_count; blockIter++) {
+
+ /* Determine the starting source buffer offset for this block - for iter 0 skip it since that value is 0.
+ */
+ if (blockIter > 0) {
+ if (bufTypeIsContig) {
+ currentRecvBufferOffset += len_list[blockIter - 1];
+ } else {
+ /* Non-contiguous source datatype, count up the extents and indices to this point
+ * in the blocks.
+ */
+ ADIO_Offset_CA sourceBlockTotal = 0;
+ int lastIndiceUsed = currentFlatBufIndice;
+ int numNonContigSourceChunks = 0;
+
+ while (sourceBlockTotal < len_list[blockIter - 1]) {
+ numNonContigSourceChunks++;
+ sourceBlockTotal += (flatBuf->blocklens[currentFlatBufIndice] - currentIndiceOffset);
+ lastIndiceUsed = currentFlatBufIndice;
+ currentFlatBufIndice++;
+ if (currentFlatBufIndice == flatBuf->count) {
+ currentFlatBufIndice = 0;
+ currentDataTypeExtent++;
+ }
+ currentIndiceOffset = (ADIO_Offset_CA) 0;
+ }
+ if (sourceBlockTotal > len_list[blockIter - 1]) {
+ currentFlatBufIndice--;
+ if (currentFlatBufIndice < 0) {
+ currentDataTypeExtent--;
+ currentFlatBufIndice = flatBuf->count - 1;
+ }
+ currentIndiceOffset = len_list[blockIter - 1] - (sourceBlockTotal - flatBuf->blocklens[lastIndiceUsed]);
+ } else
+ currentIndiceOffset = (ADIO_Offset_CA) 0;
+ maxNumContigOperations += (numNonContigSourceChunks + 2);
+ if (numNonContigSourceChunks > maxNumNonContigSourceChunks)
+ maxNumNonContigSourceChunks = numNonContigSourceChunks;
+
+#ifdef onesidedtrace
+ printf("blockiter %d currentFlatBufIndice is now %d currentDataTypeExtent is now %ld currentIndiceOffset is now %ld maxNumContigOperations is now %d\n",blockIter,currentFlatBufIndice,currentDataTypeExtent,currentIndiceOffset,maxNumContigOperations);
+#endif
+ } // !bufTypeIsContig
+ } // blockIter > 0
+
+            /* For the last iteration we need to include these maxNumContigOperations and maxNumNonContigSourceChunks
+             * for the non-contig case even though we did not need to compute the next starting offset.
+             */
+ if ((blockIter == (contig_access_count - 1)) && (!bufTypeIsContig)) {
+ ADIO_Offset_CA sourceBlockTotal = 0;
+ int tmpCurrentFlatBufIndice = currentFlatBufIndice;
+ int lastNumNonContigSourceChunks = 0;
+ while (sourceBlockTotal < len_list[blockIter]) {
+ lastNumNonContigSourceChunks++;
+ sourceBlockTotal += flatBuf->blocklens[tmpCurrentFlatBufIndice];
+ tmpCurrentFlatBufIndice++;
+ if (tmpCurrentFlatBufIndice == flatBuf->count) {
+ tmpCurrentFlatBufIndice = 0;
+ }
+ }
+ maxNumContigOperations += (lastNumNonContigSourceChunks + 2);
+ if (lastNumNonContigSourceChunks > maxNumNonContigSourceChunks)
+ maxNumNonContigSourceChunks = lastNumNonContigSourceChunks;
+ }
+
+ ADIO_Offset_CA blockStart = offset_list[blockIter];
+ ADIO_Offset_CA blockEnd = offset_list[blockIter] + len_list[blockIter] - (ADIO_Offset_CA)1;
+
+ /* Find the starting source agg for this block - normally it will be the current agg so guard the expensive
+ * while loop with a cheap if-check which for large numbers of small blocks will usually be false.
+ */
+ if (!((blockStart >= fd_start[currentAggRankListIndex]) && (blockStart <= fd_end[currentAggRankListIndex]))) {
+ while (!((blockStart >= fd_start[currentAggRankListIndex]) && (blockStart <= fd_end[currentAggRankListIndex])))
+ currentAggRankListIndex++;
+            }
+
+#ifdef onesidedtrace
+ printf("Rank %d - currentAggRankListIndex is %d blockStart %ld blockEnd %ld fd_start[currentAggRankListIndex] %ld fd_end[currentAggRankListIndex] %ld\n",myrank,currentAggRankListIndex,blockStart,blockEnd,fd_start[currentAggRankListIndex],fd_end[currentAggRankListIndex]);
+#endif
+
+ /* Determine if this is a new source agg.
+ */
+ if (blockIter > 0) {
+ if ((offset_list[blockIter - 1] + len_list[blockIter - 1] - (ADIO_Offset_CA) 1) < fd_start[currentAggRankListIndex]) {
+ numSourceAggs++;
+ }
+ }
+
+ /* Determine which round to start reading.
+ */
+ if ((blockStart - fd_start[currentAggRankListIndex]) >= coll_bufsize) {
+ ADIO_Offset_CA currentRoundBlockStart = fd_start[currentAggRankListIndex];
+ int startingRound = 0;
+ while (blockStart > (currentRoundBlockStart + coll_bufsize - (ADIO_Offset_CA) 1)) {
+ currentRoundBlockStart += coll_bufsize;
+ startingRound++;
+ }
+ sourceAggsForMyDataCurrentRoundIter[numSourceAggs] = startingRound;
+ }
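+            /* Example (hypothetical offsets): with coll_bufsize = 4 MiB and a block that starts
+             * 9 MiB past fd_start of its agg, the while loop above advances currentRoundBlockStart
+             * twice, so startingRound == 2, i.e. this block is first read in the third round.
+             */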
+
+ /* Initialize the data structures if this is the first offset in the round/source agg.
+ */
+ if (sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] == -1) {
+ sourceAggsForMyData[numSourceAggs] = ca_data->ranklist[currentAggRankListIndex];
+ sourceAggsForMyDataFDStart[numSourceAggs] = fd_start[currentAggRankListIndex];
+ /* Round up file domain to the first actual offset used if this is the first file domain.
+ */
+ if (currentAggRankListIndex == smallestFileDomainAggRank) {
+ if (sourceAggsForMyDataFDStart[numSourceAggs] < firstFileOffset)
+ sourceAggsForMyDataFDStart[numSourceAggs] = firstFileOffset;
+ }
+ sourceAggsForMyDataFDEnd[numSourceAggs] = fd_end[currentAggRankListIndex];
+ /* Round down file domain to the last actual offset used if this is the last file domain.
+ */
+ if (currentAggRankListIndex == greatestFileDomainAggRank) {
+ if (sourceAggsForMyDataFDEnd[numSourceAggs] > lastFileOffset)
+ sourceAggsForMyDataFDEnd[numSourceAggs] = lastFileOffset;
+ }
+ sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+
+ /* Set the source buffer state starting point for data access for this agg and file domain.
+ */
+ if (bufTypeIsContig) {
+ if (currentFDSourceBufferState[numSourceAggs].sourceBufferOffset == -1) {
+ currentFDSourceBufferState[numSourceAggs].sourceBufferOffset = currentRecvBufferOffset;
+#ifdef onesidedtrace
+ printf("Rank %d - For agg %d sourceBufferOffset initialized to %ld\n",myrank,currentAggRankListIndex,currentRecvBufferOffset);
+#endif
+ }
+ } else {
+ if (currentFDSourceBufferState[numSourceAggs].indiceOffset == -1) {
+ currentFDSourceBufferState[numSourceAggs].indiceOffset = currentIndiceOffset;
+ currentFDSourceBufferState[numSourceAggs].bufTypeExtent = bufTypeExtent;
+ currentFDSourceBufferState[numSourceAggs].dataTypeExtent = currentDataTypeExtent;
+ currentFDSourceBufferState[numSourceAggs].flatBufIndice = currentFlatBufIndice;
+#ifdef onesidedtrace
+ printf("Rank %d - For agg %d dataTypeExtent initialized to %d flatBufIndice to %d indiceOffset to %ld\n", myrank, numSourceAggs, currentDataTypeExtent, currentFlatBufIndice, currentIndiceOffset);
+#endif
+ }
+ }
+
+ intraRoundCollBufsizeOffset = fd_start[currentAggRankListIndex] + ((ADIO_Offset_CA) (sourceAggsForMyDataCurrentRoundIter[numSourceAggs] + 1) * coll_bufsize);
+
+#ifdef onesidedtrace
+ printf("Rank %d - init settings numSourceAggs %d offset_list[%d] with value %ld past fd border %ld with len %ld currentRecvBufferOffset set to %ld intraRoundCollBufsizeOffset set to %ld\n", myrank, numSourceAggs, blockIter, offset_list[blockIter], fd_start[currentAggRankListIndex], len_list[blockIter], currentRecvBufferOffset, intraRoundCollBufsizeOffset);
+#endif
+
+ }
+
+ /* Replace the last offset block iter with this one.
+ */
+ sourceAggsForMyDataLastOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+
+            /* If this block extends into the next file domain advance to the next source aggs and source buffer states.
+ */
+ if (blockEnd > fd_end[currentAggRankListIndex]) {
+
+ ADIO_Offset_CA amountToAdvanceSBOffsetForFD = 0;
+ int additionalFDCounter = 0;
+
+ while (blockEnd > fd_end[currentAggRankListIndex]) {
+#ifdef onesidedtrace
+ printf("Rank %d - block extends past current fd, blockEnd %ld >= fd_end[currentAggRankListIndex] %ld total block size is %ld blockStart was %ld\n", myrank, blockEnd, fd_end[currentAggRankListIndex], len_list[blockIter], blockStart);
+ printf("Rank %d - currentAggRankListIndex is now %d blockEnd %ld > fd_end[%d] %ld\n", myrank, currentAggRankListIndex, blockEnd, currentAggRankListIndex, fd_end[currentAggRankListIndex]);
+#endif
+ ADIO_Offset_CA thisAggBlockEnd = fd_end[currentAggRankListIndex];
+ if (thisAggBlockEnd >= intraRoundCollBufsizeOffset) {
+ while (thisAggBlockEnd >= intraRoundCollBufsizeOffset) {
+ sourceAggsForMyDataCurrentRoundIter[numSourceAggs]++;
+ intraRoundCollBufsizeOffset += coll_bufsize;
+ sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+ sourceAggsForMyDataLastOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+#ifdef onesidedtrace
+                            printf("Rank %d - sourceAggsForMyDataCurrentRoundIter[%d] is now %d intraRoundCollBufsizeOffset is now %ld\n", myrank, numSourceAggs, sourceAggsForMyDataCurrentRoundIter[numSourceAggs], intraRoundCollBufsizeOffset);
+#endif
+ } // while (thisAggBlockEnd >= intraRoundCollBufsizeOffset)
+ } // if (thisAggBlockEnd >= intraRoundCollBufsizeOffset)
+
+ int prevAggRankListIndex = currentAggRankListIndex;
+ currentAggRankListIndex++;
+
+ /* Skip over unused aggs.
+ */
+ if (fd_start[currentAggRankListIndex] > fd_end[currentAggRankListIndex]) {
+ while (fd_start[currentAggRankListIndex] > fd_end[currentAggRankListIndex])
+ currentAggRankListIndex++;
+ } // (fd_start[currentAggRankListIndex] > fd_end[currentAggRankListIndex])
+
+ /* Start new source agg.
+ */
+ if (blockEnd >= fd_start[currentAggRankListIndex]) {
+ numSourceAggs++;
+ sourceAggsForMyData[numSourceAggs] = ca_data->ranklist[currentAggRankListIndex];
+ sourceAggsForMyDataFDStart[numSourceAggs] = fd_start[currentAggRankListIndex];
+ /* Round up file domain to the first actual offset used if this is the first file domain.
+ */
+ if (currentAggRankListIndex == smallestFileDomainAggRank) {
+ if (sourceAggsForMyDataFDStart[numSourceAggs] < firstFileOffset)
+ sourceAggsForMyDataFDStart[numSourceAggs] = firstFileOffset;
+ }
+ sourceAggsForMyDataFDEnd[numSourceAggs] = fd_end[currentAggRankListIndex];
+ /* Round down file domain to the last actual offset used if this is the last file domain.
+ */
+ if (currentAggRankListIndex == greatestFileDomainAggRank) {
+ if (sourceAggsForMyDataFDEnd[numSourceAggs] > lastFileOffset)
+ sourceAggsForMyDataFDEnd[numSourceAggs] = lastFileOffset;
+ }
+ sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+
+                    /* For the first additional file domain the source buffer offset
+                     * will be incremented relative to the state of this first main
+                     * loop, but for subsequent full file domains the offset will be
+                     * incremented by the size of the file domain.
+                     */
+ if (additionalFDCounter == 0)
+ amountToAdvanceSBOffsetForFD = (fd_end[prevAggRankListIndex] - blockStart) + (ADIO_Offset_CA) 1;
+ else
+ amountToAdvanceSBOffsetForFD = (fd_end[prevAggRankListIndex] - fd_start[prevAggRankListIndex]) + (ADIO_Offset_CA) 1;
+
+ if (bufTypeIsContig) {
+ HDassert(numSourceAggs > 0);
+ if (currentFDSourceBufferState[numSourceAggs].sourceBufferOffset == -1) {
+ if (additionalFDCounter == 0) { // first file domain, still use the current data counter
+ currentFDSourceBufferState[numSourceAggs].sourceBufferOffset = currentRecvBufferOffset + amountToAdvanceSBOffsetForFD;
+ } else { // 2nd file domain, advance full file domain from last source buffer state
+ currentFDSourceBufferState[numSourceAggs].sourceBufferOffset = currentFDSourceBufferState[numSourceAggs - 1].sourceBufferOffset + amountToAdvanceSBOffsetForFD;
+ }
+#ifdef onesidedtrace
+ printf("Rank %d - Crossed into new FD - for agg %d sourceBufferOffset initialized to %ld amountToAdvanceSBOffsetForFD is %ld\n", myrank, numSourceAggs, currentFDSourceBufferState[numSourceAggs].sourceBufferOffset, amountToAdvanceSBOffsetForFD);
+#endif
+ }
+ } else if (currentFDSourceBufferState[numSourceAggs].indiceOffset == -1) {
+
+                        /* non-contiguous source buffer */
+ HDassert(numSourceAggs > 0);
+
+ /* Initialize the source buffer state appropriately and then
+ * advance it with the nonContigSourceDataBufferAdvance function.
+ */
+ if (additionalFDCounter == 0) {
+ // first file domain, still use the current data counter
+ currentFDSourceBufferState[numSourceAggs].indiceOffset = currentIndiceOffset;
+ currentFDSourceBufferState[numSourceAggs].bufTypeExtent = bufTypeExtent;
+ currentFDSourceBufferState[numSourceAggs].dataTypeExtent = currentDataTypeExtent;
+ currentFDSourceBufferState[numSourceAggs].flatBufIndice = currentFlatBufIndice;
+ } else {
+ // 2nd file domain, advance full file domain from last source buffer state
+ currentFDSourceBufferState[numSourceAggs].indiceOffset =
+ currentFDSourceBufferState[numSourceAggs - 1].indiceOffset;
+ currentFDSourceBufferState[numSourceAggs].bufTypeExtent =
+ currentFDSourceBufferState[numSourceAggs - 1].bufTypeExtent;
+ currentFDSourceBufferState[numSourceAggs].dataTypeExtent = currentFDSourceBufferState[numSourceAggs - 1].dataTypeExtent;
+ currentFDSourceBufferState[numSourceAggs].flatBufIndice =
+ currentFDSourceBufferState[numSourceAggs - 1].flatBufIndice;
+ }
+ H5FD_mpio_nc_buffer_advance(((char *) buf), flatBuf, (int) amountToAdvanceSBOffsetForFD, 0, &currentFDSourceBufferState[numSourceAggs], NULL);
+
+#ifdef onesidedtrace
+ printf("Rank %d - Crossed into new FD - for agg %d dataTypeExtent initialized to %d flatBufIndice to %d indiceOffset to %ld amountToAdvanceSBOffsetForFD is %d\n", myrank, numSourceAggs, currentFDSourceBufferState[numSourceAggs].dataTypeExtent, currentFDSourceBufferState[numSourceAggs].flatBufIndice, currentFDSourceBufferState[numSourceAggs].indiceOffset, amountToAdvanceSBOffsetForFD);
+#endif
+ }
+ additionalFDCounter++;
+
+#ifdef onesidedtrace
+ printf("Rank %d - block extended beyond fd init settings numSourceAggs %d offset_list[%d] with value %ld past fd border %ld with len %ld\n", myrank, numSourceAggs, blockIter, offset_list[blockIter], fd_start[currentAggRankListIndex], len_list[blockIter]);
+#endif
+ intraRoundCollBufsizeOffset = fd_start[currentAggRankListIndex] + coll_bufsize;
+ sourceAggsForMyDataLastOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+
+ } // if (blockEnd >= fd_start[currentAggRankListIndex])
+ } // while (blockEnd > fd_end[currentAggRankListIndex])
+ } // if (blockEnd > fd_end[currentAggRankListIndex])
+
+            /* Handle the case where we are still in the same file domain / source agg but have
+             * gone past the coll_bufsize and need to advance to the next round.
+             */
+ if (blockEnd >= intraRoundCollBufsizeOffset) {
+ ADIO_Offset_CA currentBlockEnd = blockEnd;
+ while (currentBlockEnd >= intraRoundCollBufsizeOffset) {
+ sourceAggsForMyDataCurrentRoundIter[numSourceAggs]++;
+ intraRoundCollBufsizeOffset += coll_bufsize;
+ sourceAggsForMyDataFirstOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+ sourceAggsForMyDataLastOffLenIndex[sourceAggsForMyDataCurrentRoundIter[numSourceAggs]][numSourceAggs] = blockIter;
+#ifdef onesidedtrace
+ printf("Rank %d - block less than fd currentBlockEnd is now %ld intraRoundCollBufsizeOffset is now %ld sourceAggsForMyDataCurrentRoundIter[%d] is now %d\n", myrank, currentBlockEnd, intraRoundCollBufsizeOffset, numSourceAggs, sourceAggsForMyDataCurrentRoundIter[numSourceAggs]);
+#endif
+ } // while (currentBlockEnd >= intraRoundCollBufsizeOffset)
+ } // if (blockEnd >= intraRoundCollBufsizeOffset)
+
+            /* Advance numSourceAggs after the last source offset so that this
+             * final agg is included in the count.
+             */
+ if (blockIter == (contig_access_count - 1))
+ numSourceAggs++;
+ }
+
+#ifdef onesidedtrace
+ printf("Rank %d - numSourceAggs is %d\n", myrank, numSourceAggs);
+ /*for (i = 0; i < numSourceAggs; i++) {
+ for (j = 0; j <= sourceAggsForMyDataCurrentRoundIter[i]; j++)
+ printf("Rank %d - sourceAggsForMyData[%d] is %d sourceAggsForMyDataFDStart[%d] is %ld sourceAggsForMyDataFDEnd is %ld sourceAggsForMyDataFirstOffLenIndex is %d with value %ld sourceAggsForMyDataLastOffLenIndex is %d with value %ld\n", myrank, i, sourceAggsForMyData[i], i, sourceAggsForMyDataFDStart[i], sourceAggsForMyDataFDEnd[i], sourceAggsForMyDataFirstOffLenIndex[j][i], offset_list[sourceAggsForMyDataFirstOffLenIndex[j][i]], sourceAggsForMyDataLastOffLenIndex[j][i], offset_list[sourceAggsForMyDataLastOffLenIndex[j][i]]);
+ }*/
+#endif
+
+ } // if ((contig_access_count > 0) && (buf != NULL) && lenListOverZero)
+
+ H5MM_free(sourceAggsForMyDataCurrentRoundIter);
+
+ int currentReadBuf = 0;
+ int useIOBuffer = 0;
+
+ /* Check if the I/O is asynchronous */
+ if ((ca_data->async_io_inner == 1) && (numberOfRounds > 1)) {
+ if (ca_data->pthread_io == 1) {
+ useIOBuffer = 1;
+ io_thread = pthread_self();
+ } else {
+ ca_data->async_io_inner = 0;
+ }
+ }
+
+    /* use the two-phase buffer allocated in the file_open - no app should ever
+     * be both reading and writing at the same time */
+ char *read_buf0 = ca_data->io_buf;
+ char *read_buf1 = ca_data->io_buf + coll_bufsize;
+
+ /* Async I/O - Adjust if this is the "duplicate" buffer */
+ if (ca_data->use_dup) {
+ read_buf0 = ca_data->io_buf_d;
+ read_buf1 = ca_data->io_buf_d + coll_bufsize;
+ }
+
+ char *read_buf = read_buf0;
+ MPI_Win read_buf_window = ca_data->io_buf_window;
+
+ /* Async I/O - Adjust if this is the "duplicate" buffer */
+ if (ca_data->use_dup) {
+ read_buf_window = ca_data->io_buf_window_d;
+ }
+
+ ADIO_Offset_CA currentRoundFDStart = 0, nextRoundFDStart = 0;
+ ADIO_Offset_CA currentRoundFDEnd = 0, nextRoundFDEnd = 0;
+
+ if (iAmUsedAgg) {
+ currentRoundFDStart = fd_start[myAggRank];
+ nextRoundFDStart = fd_start[myAggRank];
+ if (myAggRank == smallestFileDomainAggRank) {
+ if (currentRoundFDStart < firstFileOffset)
+ currentRoundFDStart = firstFileOffset;
+ if (nextRoundFDStart < firstFileOffset)
+ nextRoundFDStart = firstFileOffset;
+ } else if (myAggRank == greatestFileDomainAggRank) {
+ if (currentRoundFDEnd > lastFileOffset)
+ currentRoundFDEnd = lastFileOffset;
+ if (nextRoundFDEnd > lastFileOffset)
+ nextRoundFDEnd = lastFileOffset;
+ }
+#ifdef onesidedtrace
+ printf("Rank %d - iAmUsedAgg - currentRoundFDStart initialized to %ld currentRoundFDEnd to %ld\n", myrank, currentRoundFDStart, currentRoundFDEnd);
+#endif
+ } // if iAmUsedAgg
+
+#ifdef onesidedtrace
+ MPI_Barrier(ca_data->comm);
+ if(myrank==0) { printf("\n\n"); fflush(stdout); }
+ MPI_Barrier(ca_data->comm);
+ printf("Rank %d is waiting at barrier between main loops.\n", myrank);
+ printf("Rank %d -- numberOfRounds = %d, contig_access_count = %d, numSourceAggs = %d\n", myrank, numberOfRounds, contig_access_count, numSourceAggs);
+ fflush(stdout);
+ MPI_Barrier(ca_data->comm);
+ if(myrank==0) { printf("\n\n"); fflush(stdout); }
+ MPI_Barrier(ca_data->comm);
+#endif
+
+    /* This is the second main loop of the algorithm, actually a nested loop of
+     * aggs within rounds. There are 2 flavors of this.
+     * For onesided_read_aggmethod of 1, each nested iteration for the source agg
+     * does an mpi_get on a contiguous chunk using a primitive datatype
+     * determined using the data structures from the first main loop.
+     * For onesided_read_aggmethod of 2, each nested iteration for the source agg
+     * builds up data used to create a derived datatype for one mpi_get that
+     * is done for the source agg for each round.
+     * To support lustre there will need to be an additional layer of nesting
+     * for the multiple file domains within the source aggs.
+     */
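+    /* Minimal sketch of the aggmethod-2 access pattern implemented below (names as used in this
+     * file; loop bodies elided, so this is illustrative only, not the definitive control flow):
+     *
+     *   for each round:
+     *     for each source agg with data this round:
+     *       build blocklens / displacements for every overlapping chunk;
+     *       MPI_Type_create_struct(...); MPI_Type_commit(...);
+     *       MPI_Win_lock(MPI_LOCK_SHARED, agg, 0, read_buf_window);
+     *       MPI_Get(recv side, 1, recvType, agg, 0, 1, srcType, read_buf_window);
+     *       MPI_Win_unlock(agg, read_buf_window);
+     *       MPI_Type_free(...);
+     */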
+ int roundIter;
+ for (roundIter = 0; roundIter < numberOfRounds; roundIter++) {
+
+ if (iAmUsedAgg || stripe_parms->iWasUsedStripingAgg) {
+ stripe_parms->iWasUsedStripingAgg = 0;
+
+#ifdef onesidedtrace
+ printf("Rank %d - roundIter %ld of %ld - currentRoundFDEnd = %ld \n", myrank, roundIter, numberOfRounds, currentRoundFDEnd);
+#endif
+
+ /* determine what offsets define the portion of the file domain the agg is reading this round */
+ if (iAmUsedAgg) {
+
+ currentRoundFDStart = nextRoundFDStart;
+
+ if (!useIOBuffer || (roundIter == 0)) {
+
+ ADIO_Offset_CA amountDataToReadThisRound;
+ if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
+ currentRoundFDEnd = fd_end[myAggRank];
+ amountDataToReadThisRound = ((currentRoundFDEnd - currentRoundFDStart) + 1);
+ } else {
+ currentRoundFDEnd = currentRoundFDStart + coll_bufsize - (ADIO_Offset_CA) 1;
+ amountDataToReadThisRound = coll_bufsize;
+ }
+
+#ifdef onesidedtrace
+ printf("Rank %d - amountDataToReadThisRound=%ld - myAggRank=%ld - fd_end[myAggRank]=%ld - currentRoundFDStart=%ld - currentRoundFDEnd=%ld - coll_bufsize=%ld\n", myrank, amountDataToReadThisRound, myAggRank, fd_end[myAggRank], currentRoundFDStart, currentRoundFDEnd, coll_bufsize);
+#endif
+
+ /*
+ * Don't actually do the read if it was already done
+ * (asynchronously) outside this function call...
+ */
+ if (do_file_read && amountDataToReadThisRound>0) {
+#ifdef onesidedtrace
+ printf("Rank %d - calling MPI_File_read_at\n", myrank);
+#endif
+
+ if (ca_data->check_req) {
+ MPIO_Wait(&ca_data->io_Request, error_code);
+ ca_data->check_req = 0;
+ }
+
+                    /* read amountDataToReadThisRound bytes starting at currentRoundFDStart */
+ MPI_File_read_at(ca_data->fh, currentRoundFDStart, read_buf, amountDataToReadThisRound, MPI_BYTE, &status);
+
+#ifdef onesidedtrace
+ printf("Rank %d - Finishing MPI_File_read_at (offset=%d,size=%d)\n", myrank, currentRoundFDStart, amountDataToReadThisRound);
+ fflush(stdout);
+#endif
+ } /* if (do_file_read) */
+
+ currentReadBuf = 1;
+
+ } /* (!useIOBuffer || (roundIter == 0)) */
+ if (useIOBuffer) {
+
+ /* use the thread reader for the next round */
+                /* switch back and forth between the read buffers so that the data aggregation code is disseminating 1 buffer while the thread is reading into the other */
+ if (roundIter > 0) currentRoundFDEnd = nextRoundFDEnd; // Does this do anything?
+
+ if (roundIter < (numberOfRounds - 1)) {
+
+#ifdef onesidedtrace
+ printf("Rank %d - Calc amountDataToReadNextRound...\n", myrank);
+ fflush(stdout);
+#endif
+ nextRoundFDStart += coll_bufsize;
+ ADIO_Offset_CA amountDataToReadNextRound;
+ if ((fd_end[myAggRank] - nextRoundFDStart) < coll_bufsize) {
+ nextRoundFDEnd = fd_end[myAggRank];
+ amountDataToReadNextRound = ((nextRoundFDEnd - nextRoundFDStart) + 1);
+ } else {
+ nextRoundFDEnd = nextRoundFDStart + coll_bufsize - (ADIO_Offset_CA) 1;
+ amountDataToReadNextRound = coll_bufsize;
+ }
+#ifdef onesidedtrace
+ printf("Rank %d - nextRoundFDEnd = %ld, amountDataToReadNextRound = %ld.\n", myrank, nextRoundFDEnd, amountDataToReadNextRound);
+ fflush(stdout);
+ printf("Rank %d - myAggRank=%ld - fd_end[myAggRank]=%ld - nextRoundFDStart=%ld - nextRoundFDEnd=%ld - coll_bufsize=%ld\n", myrank, myAggRank, fd_end[myAggRank], nextRoundFDStart, nextRoundFDEnd, coll_bufsize);
+#endif
+ if ( !pthread_equal(io_thread, pthread_self()) ) {
+
+#ifdef onesidedtrace
+ printf("Rank %d - Need pthread join.\n", myrank);
+ fflush(stdout);
+#endif
+ pthread_join(io_thread, &thread_ret);
+
+ int error_code_thread = *(int *) thread_ret;
+ if (error_code_thread != MPI_SUCCESS) {
+ printf("Rank %d - pthread_join FAILED!, error_code_thread = %d\n", myrank, error_code_thread);
+ fflush(stdout);
+ return;
+ }
+ io_thread = pthread_self();
+ }
+
+ /* do a little pointer shuffling: background I/O works from one
+ * buffer while two-phase machinery fills up another */
+ if (currentReadBuf == 0) {
+ read_buf = read_buf1;
+ currentReadBuf = 1;
+ io_thread_args.buf = read_buf0;
+ } else {
+ read_buf = read_buf0;
+ currentReadBuf = 0;
+ io_thread_args.buf = read_buf1;
+ }
+ io_thread_args.fh = ca_data->fh;
+ io_thread_args.myrank = myrank;
+ io_thread_args.io_kind = READ_CA;
+ io_thread_args.size = amountDataToReadNextRound;
+ io_thread_args.offset = nextRoundFDStart;
+ io_thread_args.error_code = *error_code;
+
+ if (amountDataToReadNextRound > 0) {
+#ifdef onesidedtrace
+ printf("Rank %d - calling pthread_create (size=%ld,offset=%ld)\n", myrank, io_thread_args.size, io_thread_args.offset);
+ printf("Rank %d - (size=%ld,amountDataToReadNextRound=%ld)\n", myrank, io_thread_args.size, amountDataToReadNextRound);
+ fflush(stdout);
+#endif
+ if ((pthread_create(&io_thread, NULL, IO_Thread_Func, &(io_thread_args))) != 0)
+ io_thread = pthread_self();
+#ifdef onesidedtrace
+ printf("Rank %d - pthread_create DONE.\n", myrank);
+#endif
+ }
+ } else { /* last round */
+
+ if (!pthread_equal(io_thread, pthread_self())) {
+
+ pthread_join(io_thread, &thread_ret);
+ int error_code_thread = *(int *) thread_ret;
+ if (error_code_thread != MPI_SUCCESS) {
+ printf("Rank %d - Last pthread_join FAILED!, error_code_thread = %d\n", myrank, error_code_thread);
+ fflush(stdout);
+ return;
+ }
+ io_thread = pthread_self();
+
+ }
+ if (currentReadBuf == 0) {
+ read_buf = read_buf1;
+ } else {
+ read_buf = read_buf0;
+ }
+
+ }
+ } /* useIOBuffer */
+
+ } /* IAmUsedAgg */
+ else if (useIOBuffer) {
+ if (roundIter < (numberOfRounds - 1)) {
+ if (currentReadBuf == 0) {
+ currentReadBuf = 1;
+ read_buf = read_buf1;
+ } else {
+ currentReadBuf = 0;
+ read_buf = read_buf0;
+ }
+ } else {
+ if (currentReadBuf == 0) {
+ read_buf = read_buf1;
+ } else {
+ read_buf = read_buf0;
+ }
+ }
+ }
+
+ } // (iAmUsedAgg || stripe_parms->iWasUsedStripingAgg)
+
+#ifdef onesidedtrace
+ printf("Rank %d - Hitting MPI_Barrier.\n", myrank);
+#endif
+
+        /* wait until the read buffers are full before we start pulling from the source aggs */
+ MPI_Barrier(ca_data->comm);
+
+ if ((contig_access_count > 0) && (buf != NULL) && lenListOverZero) {
+
+ int aggIter;
+ for (aggIter = 0; aggIter < numSourceAggs; aggIter++) {
+
+                /* If we have data for this round/agg, process it.
+                 */
+ if (sourceAggsForMyDataFirstOffLenIndex[roundIter][aggIter] != -1) {
+
+ ADIO_Offset_CA currentRoundFDStartForMySourceAgg = (ADIO_Offset_CA) ((ADIO_Offset_CA) sourceAggsForMyDataFDStart[aggIter] + (ADIO_Offset_CA) ((ADIO_Offset_CA) roundIter * coll_bufsize));
+ ADIO_Offset_CA currentRoundFDEndForMySourceAgg = (ADIO_Offset_CA) ((ADIO_Offset_CA) sourceAggsForMyDataFDStart[aggIter] + (ADIO_Offset_CA) ((ADIO_Offset_CA) (roundIter + 1) * coll_bufsize) - (ADIO_Offset_CA) 1);
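+                    /* Worked example (illustrative numbers only): if this agg's FD starts at
+                     * offset 0 and coll_bufsize is 16 MiB, then on roundIter 2 this rank only
+                     * targets the window [32 MiB, 48 MiB - 1] of that agg's file domain.
+                     */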
+
+ int sourceAggContigAccessCount = 0;
+
+ /* These data structures are used for the derived datatype mpi_get
+ * in the onesided_read_aggmethod of 2 case.
+ */
+ int *sourceAggBlockLengths = NULL;
+ MPI_Aint *sourceAggDisplacements = NULL, *recvBufferDisplacements = NULL;
+ MPI_Datatype *sourceAggDataTypes = NULL;
+ char *derivedTypePackedSourceBuffer = NULL;
+ int derivedTypePackedSourceBufferOffset = 0;
+ int allocatedDerivedTypeArrays = 0;
+ ADIO_Offset_CA amountOfDataReadThisRoundAgg = 0;
+
+ /* Process the range of offsets for this source agg.
+ */
+ int offsetIter;
+ int startingOffLenIndex = sourceAggsForMyDataFirstOffLenIndex[roundIter][aggIter];
+ int endingOffLenIndex = sourceAggsForMyDataLastOffLenIndex[roundIter][aggIter];
+ for (offsetIter = startingOffLenIndex; offsetIter <= endingOffLenIndex; offsetIter++) {
+
+ if (currentRoundFDEndForMySourceAgg > sourceAggsForMyDataFDEnd[aggIter])
+ currentRoundFDEndForMySourceAgg = sourceAggsForMyDataFDEnd[aggIter];
+
+ ADIO_Offset_CA offsetStart = offset_list[offsetIter], offsetEnd = (offset_list[offsetIter] + len_list[offsetIter] - (ADIO_Offset_CA) 1);
+
+ /* Determine the amount of data and exact source buffer offsets to use.
+ */
+ int bufferAmountToRecv = 0;
+
+ if ((offsetStart >= currentRoundFDStartForMySourceAgg) && (offsetStart <= currentRoundFDEndForMySourceAgg)) {
+ if (offsetEnd > currentRoundFDEndForMySourceAgg)
+ bufferAmountToRecv = (currentRoundFDEndForMySourceAgg - offsetStart) + 1;
+ else
+ bufferAmountToRecv = (offsetEnd - offsetStart) + 1;
+ } else if ((offsetEnd >= currentRoundFDStartForMySourceAgg) && (offsetEnd <= currentRoundFDEndForMySourceAgg)) {
+ if (offsetEnd > currentRoundFDEndForMySourceAgg)
+ bufferAmountToRecv = (currentRoundFDEndForMySourceAgg - currentRoundFDStartForMySourceAgg) + 1;
+ else
+ bufferAmountToRecv = (offsetEnd - currentRoundFDStartForMySourceAgg) + 1;
+ if (offsetStart < currentRoundFDStartForMySourceAgg) {
+ offsetStart = currentRoundFDStartForMySourceAgg;
+ }
+ } else if ((offsetStart <= currentRoundFDStartForMySourceAgg) && (offsetEnd >= currentRoundFDEndForMySourceAgg)) {
+ bufferAmountToRecv = (currentRoundFDEndForMySourceAgg - currentRoundFDStartForMySourceAgg) + 1;
+ offsetStart = currentRoundFDStartForMySourceAgg;
+ }
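+                        /* In effect, bufferAmountToRecv is the size of the intersection of
+                         * [offsetStart, offsetEnd] with this round's agg window. Example
+                         * (hypothetical values): a request covering bytes 100..299 against a
+                         * round window of 0..199 yields bufferAmountToRecv == 100 and
+                         * offsetStart stays at 100.
+                         */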
+
+ if (bufferAmountToRecv > 0) { /* we have data to recv this round */
+ if (ca_data->onesided_read_aggmethod == 2) {
+ /* Only allocate these arrays if we are using method 2 and only do it once for this round/source agg.
+ */
+ if (!allocatedDerivedTypeArrays) {
+ sourceAggBlockLengths = (int *) H5MM_malloc(maxNumContigOperations * sizeof(int));
+ sourceAggDisplacements = (MPI_Aint *) H5MM_malloc(maxNumContigOperations * sizeof(MPI_Aint));
+ recvBufferDisplacements = (MPI_Aint *) H5MM_malloc(maxNumContigOperations * sizeof(MPI_Aint));
+ sourceAggDataTypes = (MPI_Datatype *) H5MM_malloc(maxNumContigOperations * sizeof(MPI_Datatype));
+ if (!bufTypeIsContig) {
+ int k;
+ for (k = sourceAggsForMyDataFirstOffLenIndex[roundIter][aggIter]; k <= sourceAggsForMyDataLastOffLenIndex[roundIter][aggIter]; k++)
+ amountOfDataReadThisRoundAgg += len_list[k];
+
+#ifdef onesidedtrace
+ printf("Rank %d - derivedTypePackedSourceBuffer mallocing %ld\n", myrank,amountOfDataReadThisRoundAgg);
+#endif
+ if (amountOfDataReadThisRoundAgg > 0)
+ derivedTypePackedSourceBuffer = (char *) H5MM_malloc(amountOfDataReadThisRoundAgg * sizeof(char));
+ else
+ derivedTypePackedSourceBuffer = NULL;
+ }
+ allocatedDerivedTypeArrays = 1;
+ }
+ }
+
+ /* Determine the offset into the source window.
+ */
+ ADIO_Offset_CA sourceDisplacementToUseThisRound = (ADIO_Offset_CA) (offsetStart - currentRoundFDStartForMySourceAgg);
+
+ /* If using the thread reader select the appropriate side of the split window.
+ */
+ if (useIOBuffer && (read_buf == read_buf1)) {
+ sourceDisplacementToUseThisRound += (ADIO_Offset_CA) coll_bufsize;
+ }
+
+
+                            /* For onesided_read_aggmethod of 1 do the mpi_get using the primitive MPI_BYTE type for each
+                             * contiguous chunk from the source agg. If the receive buffer is non-contiguous then unpack the
+                             * data after the MPI_Win_unlock is done to make sure the data has arrived first.
+                             */
+ if (ca_data->onesided_read_aggmethod == 1) {
+
+ MPI_Win_lock(MPI_LOCK_SHARED, sourceAggsForMyData[aggIter], 0, read_buf_window);
+
+ char *getSourceData = NULL;
+ if (bufTypeIsContig) {
+
+ MPI_Get(((char *) buf) + currentFDSourceBufferState[aggIter].sourceBufferOffset, bufferAmountToRecv, MPI_BYTE, sourceAggsForMyData[aggIter], sourceDisplacementToUseThisRound, bufferAmountToRecv, MPI_BYTE, read_buf_window);
+ currentFDSourceBufferState[aggIter].sourceBufferOffset += (ADIO_Offset_CA) bufferAmountToRecv;
+
+ }
+ else {
+
+ getSourceData = (char *) H5MM_malloc(bufferAmountToRecv * sizeof(char));
+ MPI_Get(getSourceData, bufferAmountToRecv, MPI_BYTE, sourceAggsForMyData[aggIter], sourceDisplacementToUseThisRound, bufferAmountToRecv, MPI_BYTE, read_buf_window);
+
+ }
+
+ MPI_Win_unlock(sourceAggsForMyData[aggIter], read_buf_window);
+
+ if (!bufTypeIsContig) {
+ H5FD_mpio_nc_buffer_advance(((char *) buf), flatBuf, bufferAmountToRecv, 0, &currentFDSourceBufferState[aggIter], getSourceData);
+ H5MM_free(getSourceData);
+ }
+ }
+
+ /* For onesided_read_aggmethod of 2 populate the data structures for this round/agg for this offset iter
+ * to be used subsequently when building the derived type for 1 mpi_get for all the data for this
+ * round/agg.
+ */
+ else if (ca_data->onesided_read_aggmethod == 2) {
+
+ if (bufTypeIsContig) {
+ sourceAggBlockLengths[sourceAggContigAccessCount] = bufferAmountToRecv;
+ sourceAggDataTypes[sourceAggContigAccessCount] = MPI_BYTE;
+ sourceAggDisplacements[sourceAggContigAccessCount] = sourceDisplacementToUseThisRound;
+ recvBufferDisplacements[sourceAggContigAccessCount] = (MPI_Aint) currentFDSourceBufferState[aggIter].sourceBufferOffset;
+ currentFDSourceBufferState[aggIter].sourceBufferOffset += (ADIO_Offset_CA) bufferAmountToRecv;
+ sourceAggContigAccessCount++;
+ }
+ else {
+ sourceAggBlockLengths[sourceAggContigAccessCount] = bufferAmountToRecv;
+ sourceAggDataTypes[sourceAggContigAccessCount] = MPI_BYTE;
+ sourceAggDisplacements[sourceAggContigAccessCount] = sourceDisplacementToUseThisRound;
+ recvBufferDisplacements[sourceAggContigAccessCount] = (MPI_Aint) derivedTypePackedSourceBufferOffset;
+ derivedTypePackedSourceBufferOffset += (ADIO_Offset_CA) bufferAmountToRecv;
+ sourceAggContigAccessCount++;
+ }
+ }
+ } // bufferAmountToRecv > 0
+ } // contig list
+
+ /* For onesided_read_aggmethod of 2 now build the derived type using
+ * the data from this round/agg and do 1 single mpi_get.
+ */
+ if (ca_data->onesided_read_aggmethod == 2) {
+ MPI_Datatype recvBufferDerivedDataType, sourceBufferDerivedDataType;
+
+ MPI_Type_create_struct(sourceAggContigAccessCount, sourceAggBlockLengths, recvBufferDisplacements, sourceAggDataTypes, &recvBufferDerivedDataType);
+ MPI_Type_commit(&recvBufferDerivedDataType);
+ MPI_Type_create_struct(sourceAggContigAccessCount, sourceAggBlockLengths, sourceAggDisplacements, sourceAggDataTypes, &sourceBufferDerivedDataType);
+ MPI_Type_commit(&sourceBufferDerivedDataType);
+
+ if (sourceAggContigAccessCount > 0) {
+
+ MPI_Win_lock(MPI_LOCK_SHARED, sourceAggsForMyData[aggIter], 0, read_buf_window);
+
+ if (bufTypeIsContig) {
+
+ MPI_Get(((char *) buf), 1, recvBufferDerivedDataType, sourceAggsForMyData[aggIter], 0, 1, sourceBufferDerivedDataType, read_buf_window);
+
+ } else {
+
+ MPI_Get(derivedTypePackedSourceBuffer, 1, recvBufferDerivedDataType, sourceAggsForMyData[aggIter], 0, 1, sourceBufferDerivedDataType, read_buf_window);
+
+ }
+
+ MPI_Win_unlock(sourceAggsForMyData[aggIter], read_buf_window);
+
+ if (!bufTypeIsContig) {
+
+ H5FD_mpio_nc_buffer_advance(((char *) buf), flatBuf, derivedTypePackedSourceBufferOffset, 0, &currentFDSourceBufferState[aggIter], derivedTypePackedSourceBuffer);
+
+ }
+ }
+
+ if (allocatedDerivedTypeArrays) {
+ H5MM_free(sourceAggBlockLengths);
+ H5MM_free(sourceAggDisplacements);
+ H5MM_free(sourceAggDataTypes);
+ H5MM_free(recvBufferDisplacements);
+ if (!bufTypeIsContig) {
+ if (derivedTypePackedSourceBuffer != NULL)
+ H5MM_free(derivedTypePackedSourceBuffer);
+ }
+ }
+ if (sourceAggContigAccessCount > 0) {
+ MPI_Type_free(&recvBufferDerivedDataType);
+ MPI_Type_free(&sourceBufferDerivedDataType);
+ }
+ }
+                } // sourceAggsForMyDataFirstOffLenIndex != -1
+ } // source aggs
+
+ if (stripeSize > 0) {
+ stripe_parms->lastDataTypeExtent = currentFDSourceBufferState[numSourceAggs-1].dataTypeExtent;
+ stripe_parms->lastFlatBufIndice = currentFDSourceBufferState[numSourceAggs-1].flatBufIndice;
+ stripe_parms->lastIndiceOffset = currentFDSourceBufferState[numSourceAggs-1].indiceOffset;
+ }
+
+ } /* contig_access_count > 0 */
+
+        /* At this point all procs have pulled their requested data from the source aggs */
+
+ /* Synchronize all procs */
+ MPI_Barrier(ca_data->comm);
+
+ nextRoundFDStart = currentRoundFDStart + coll_bufsize;
+
+ } /* for-loop roundIter */
+
+    if (useIOBuffer) { /* thread reader cleanup */
+ if (!pthread_equal(io_thread, pthread_self())) {
+ pthread_join(io_thread, &thread_ret);
+ *error_code = *(int *) thread_ret;
+ }
+ }
+
+ H5MM_free(sourceAggsForMyData);
+ H5MM_free(sourceAggsForMyDataFDStart);
+ H5MM_free(sourceAggsForMyDataFDEnd);
+
+ for (i = 0; i < numberOfRounds; i++) {
+ H5MM_free(sourceAggsForMyDataFirstOffLenIndex[i]);
+ H5MM_free(sourceAggsForMyDataLastOffLenIndex[i]);
+ }
+ H5MM_free(sourceAggsForMyDataFirstOffLenIndex);
+ H5MM_free(sourceAggsForMyDataLastOffLenIndex);
+ H5MM_free(currentFDSourceBufferState);
+
+ return;
+ } /* H5FD_mpio_ccio_osagg_read */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_ccio_file_read
+ *
+ * Purpose: Read data from file into aggregators
+ *
+ *-------------------------------------------------------------------------
+ */
+void H5FD_mpio_ccio_file_read(CustomAgg_FH_Data ca_data, int *error_code,
+ ADIO_Offset_CA firstFileOffset, ADIO_Offset_CA lastFileOffset,
+ ADIO_Offset_CA *fd_start, ADIO_Offset_CA* fd_end)
+ {
+ int i,j; /* generic iterators */
+ int naggs, iAmUsedAgg, myAggRank;
+ MPI_Status status;
+ int nprocs, myrank;
+ int greatestFileDomainAggRank, smallestFileDomainAggRank;
+ ADIO_Offset_CA greatestFileDomainOffset, smallestFileDomainOffset;
+ ADIO_Offset_CA coll_bufsize;
+ ADIO_Offset_CA readFDStart;
+ ADIO_Offset_CA readFDEnd;
+
+ *error_code = MPI_SUCCESS; /* initialize to success */
+ MPI_Comm_size(ca_data->comm, &nprocs);
+ MPI_Comm_rank(ca_data->comm, &myrank);
+
+ naggs = ca_data->cb_nodes;
+ iAmUsedAgg = 0; /* whether or not this rank is used as an aggregator. */
+    myAggRank = -1; /* if I am an aggregator this is my index into ranklist */
+ coll_bufsize = (ADIO_Offset_CA)(ca_data->cb_buffer_size);
+
+ /*
+ * Confirm that we are only dealing with ONE round here...
+ */
+ int numberOfRounds = 0;
+ for (j=0;j<naggs;j++) {
+ int currentNumberOfRounds = (int)(((fd_end[j] - fd_start[j])+(ADIO_Offset_CA)1)/coll_bufsize);
+ if ( ( (ADIO_Offset_CA)currentNumberOfRounds*coll_bufsize ) < ((fd_end[j] - fd_start[j])+(ADIO_Offset_CA)1))
+ currentNumberOfRounds++;
+ if (currentNumberOfRounds > numberOfRounds)
+ numberOfRounds = currentNumberOfRounds;
+ }
+ if (numberOfRounds > 1) {
+        printf("ERROR -- Use of H5FD_mpio_ccio_file_read assumes there is only ONE round for the current aggregation segment!\n");
+ }
+
+#ifdef onesidedtrace
+ printf("Rank %d (use_dup == %d) called H5FD_mpio_ccio_file_read with segmentFirstFileOffset %d, segmentLastFileOffset %d, segment_stripe_start %d, segment_stripe_end %d. \n",myrank, ca_data->use_dup, (int)firstFileOffset, (int)lastFileOffset, (int)fd_start[0], (int)fd_end[0]);
+#endif
+
+ /* This logic defines values that are used later to determine what offsets define the portion
+ * of the file domain the agg is reading this round.
+ */
+ greatestFileDomainAggRank = -1;
+ smallestFileDomainAggRank = -1;
+ greatestFileDomainOffset = 0;
+ smallestFileDomainOffset = lastFileOffset;
+ for (j=0;j<naggs;j++) {
+ if (fd_end[j] > greatestFileDomainOffset) {
+ greatestFileDomainOffset = fd_end[j];
+ greatestFileDomainAggRank = j;
+ }
+ if (fd_start[j] < smallestFileDomainOffset) {
+ smallestFileDomainOffset = fd_start[j];
+ smallestFileDomainAggRank = j;
+ }
+ if (ca_data->ranklist[j] == myrank) {
+ myAggRank = j;
+ if (fd_end[j] > fd_start[j]) {
+ iAmUsedAgg = 1;
+ }
+ }
+ }
+
+ readFDStart = 0;
+ readFDEnd = 0;
+ if (iAmUsedAgg) {
+
+ /* What offset to read from */
+ readFDStart = fd_start[myAggRank];
+ if (myAggRank == smallestFileDomainAggRank) {
+ if (readFDStart < firstFileOffset)
+ readFDStart = firstFileOffset;
+ } else if (myAggRank == greatestFileDomainAggRank) {
+ if (readFDEnd > lastFileOffset)
+ readFDEnd = lastFileOffset;
+ }
+
+ /* How much data to read */
+ int read_size;
+ if ((fd_end[myAggRank] - readFDStart) < coll_bufsize) {
+ readFDEnd = fd_end[myAggRank];
+ read_size = ((readFDEnd - readFDStart) + 1);
+ } else {
+ readFDEnd = readFDStart + coll_bufsize - (ADIO_Offset_CA) 1;
+ read_size = coll_bufsize;
+ }
+
+ /* Read 'read_size' bytes */
+ if (ca_data->use_dup) {
+ MPI_File_iread_at(ca_data->fh, readFDStart, ca_data->io_buf_d, read_size, MPI_BYTE, &ca_data->io_Request_d);
+ ca_data->check_req_d = 1;
+ }else {
+ MPI_File_iread_at(ca_data->fh, readFDStart, ca_data->io_buf, read_size, MPI_BYTE, &ca_data->io_Request);
+ ca_data->check_req = 1;
+ }
+
+ } // (iAmUsedAgg)
+
+ /* Synchronize all procs */
+ MPI_Barrier(ca_data->comm);
+
+ return;
+ } /* H5FD_mpio_ccio_file_read */
+
+/*-------------------------------------------------------------------------
+ * Function: calc_file_domains
+ *
+ * Purpose:     Compute a dynamic, access-range-based file domain partition
+ *              among I/O aggregators, aligned to the GPFS block size.
+ *              The I/O workload is divided among the aggregation processes by
+ *              (logically) dividing the file into file domains (FDs); each
+ *              process may directly access only its own file domain.
+ *              Additional effort is made to ensure that each I/O aggregator gets
+ *              a file domain that aligns to the GPFS block size, so there is
+ *              no false sharing of GPFS file blocks among multiple I/O nodes.
+ *
+ * Return: Void.
+ *
+ *-------------------------------------------------------------------------
+ */
+void calc_file_domains(ADIO_Offset_CA *st_offsets, ADIO_Offset_CA *end_offsets,
+ int nprocs, int nprocs_for_coll, ADIO_Offset_CA *min_st_offset_ptr,
+ ADIO_Offset_CA **fd_start_ptr, ADIO_Offset_CA **fd_end_ptr,
+ ADIO_Offset_CA *fd_size_ptr, ADIO_Offset_CA blksize)
+{
+ ADIO_Offset_CA min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
+ int i, aggr;
+
+#ifdef onesidedtrace
+ printf("calc_file_domains: Blocksize=%ld, nprocs=%ld, nprocs_for_coll=%ld\n",blksize,nprocs,nprocs_for_coll);
+#endif
+ /* find min of start offsets and max of end offsets of all processes */
+ min_st_offset = st_offsets [0];
+ max_end_offset = end_offsets[0];
+ for (i=1; i<nprocs; i++) {
+ min_st_offset = MIN(min_st_offset, st_offsets[i]);
+ max_end_offset = MAX(max_end_offset, end_offsets[i]);
+ }
+
+#ifdef onesidedtrace
+ printf("calc_file_domains, min_st_offset, max_end_offset = %qd, %qd\n", min_st_offset, max_end_offset );
+#endif
+
+ /* determine the "file domain (FD)" of each process, i.e., the portion of
+ the file that will be "owned" by each process */
+
+ ADIO_Offset_CA gpfs_ub = (max_end_offset +blksize-1) / blksize * blksize - 1;
+ ADIO_Offset_CA gpfs_lb = min_st_offset / blksize * blksize;
+ ADIO_Offset_CA gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
+ ADIO_Offset_CA gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
+ ADIO_Offset_CA fd_gpfs_range = gpfs_ub - gpfs_lb + 1;
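+
+ /*
+ * Example: with min_st_offset = 100, max_end_offset = 1000 and blksize = 256,
+ * this yields gpfs_lb = 0, gpfs_ub = 1023, gpfs_lb_rdoff = 100,
+ * gpfs_ub_rdoff = 23 and fd_gpfs_range = 1024, i.e. the accessed range
+ * rounded out to whole GPFS blocks.
+ */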
+
+ int naggs = nprocs_for_coll;
+
+ /* Tweak the file domains so that no fd is smaller than a threshold. We
+ * have to strike a balance between efficiency and parallelism: somewhere
+ * between 10k processes sending 32-byte requests and one process sending a
+ * 320k request is a (system-dependent) sweet spot.
+ *
+ * This is from the common code - the new min_fd_size parameter that we
+ * didn't implement. (And the common code uses a different declaration of
+ * fd_size, so beware.)
+ *
+ * if (fd_size < min_fd_size)
+ * fd_size = min_fd_size;
+ */
+ fd_size = (ADIO_Offset_CA *) H5MM_malloc(nprocs_for_coll * sizeof(ADIO_Offset_CA));
+ *fd_start_ptr = (ADIO_Offset_CA *) H5MM_malloc(nprocs_for_coll * sizeof(ADIO_Offset_CA));
+ *fd_end_ptr = (ADIO_Offset_CA *) H5MM_malloc(nprocs_for_coll * sizeof(ADIO_Offset_CA));
+ fd_start = *fd_start_ptr;
+ fd_end = *fd_end_ptr;
+
+ /* each process will have a file domain of some number of gpfs blocks, but
+ * the division of blocks is not likely to be even. Some file domains will
+ * be "large" and others "small"
+ *
+ * Example: consider 17 blocks distributed over 3 aggregators.
+ * nb_cn_small = 17/3 = 5
+ * naggs_large = 17 - 3*(17/3) = 17 - 15 = 2
+ * naggs_small = 3 - 2 = 1
+ *
+ * and you end up with file domains of {5-blocks, 6-blocks, 6-blocks}
+ *
+ * what about (relatively) small files? say, a file of 1000 blocks
+ * distributed over 2064 aggregators:
+ * nb_cn_small = 1000/2064 = 0
+ * naggs_large = 1000 - 2064*(1000/2064) = 1000
+ * naggs_small = 2064 - 1000 = 1064
+ * and you end up with domains of {0, 0, 0, ... 1, 1, 1 ...}
+ *
+ * it might be a good idea instead of having all the zeros up front, to
+ * "mix" those zeros into the fd_size array. that way, no pset/bridge-set
+ * is left with zero work. In fact, even if the small file domains aren't
+ * zero, it's probably still a good idea to mix the "small" file domains
+ * across the fd_size array to keep the io nodes in balance
+ */
+
+ ADIO_Offset_CA n_gpfs_blk = fd_gpfs_range / blksize;
+ ADIO_Offset_CA nb_cn_small = n_gpfs_blk/naggs;
+ ADIO_Offset_CA naggs_large = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
+ ADIO_Offset_CA naggs_small = naggs - naggs_large;
+
+ /* simple allocation of file domains to each aggregator */
+ for (i=0; i<naggs; i++) {
+ if (i < naggs_large) {
+ fd_size[i] = (nb_cn_small+1) * blksize;
+ } else {
+ fd_size[i] = nb_cn_small * blksize;
+ }
+ }
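+
+ /*
+ * Continuing the 17-block / 3-aggregator example above: naggs_large = 2 and
+ * nb_cn_small = 5, so fd_size = { 6*blksize, 6*blksize, 5*blksize }
+ * (the larger domains are assigned first).
+ */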
+
+#ifdef onesidedtrace
+ printf("gpfs_ub %llu, gpfs_lb %llu, gpfs_ub_rdoff %llu, gpfs_lb_rdoff %llu, fd_gpfs_range %llu, n_gpfs_blk %llu, nb_cn_small %llu, naggs_large %llu, naggs_small %llu\n",
+ gpfs_ub ,
+ gpfs_lb ,
+ gpfs_ub_rdoff,
+ gpfs_lb_rdoff,
+ fd_gpfs_range,
+ n_gpfs_blk ,
+ nb_cn_small ,
+ naggs_large ,
+ naggs_small
+ );
+ printf("File domains:\n");
+ fflush(stdout);
+#endif
+
+ fd_size[0] -= gpfs_lb_rdoff;
+ fd_size[naggs-1] -= gpfs_ub_rdoff;
+
+ /* compute the file domain for each aggr */
+ ADIO_Offset_CA offset = min_st_offset;
+ for (aggr=0; aggr<naggs; aggr++) {
+ fd_start[aggr] = offset;
+ fd_end [aggr] = offset + fd_size[aggr] - 1;
+ offset += fd_size[aggr];
+#ifdef onesidedtrace
+ printf("fd[%d]: start %ld end %ld\n",aggr,fd_start[aggr],fd_end[aggr]);
+ fflush(stdout);
+#endif
+ }
+
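+ /*
+ * Note: only fd_size[0] is passed back through fd_size_ptr; the full
+ * per-aggregator extents remain available through fd_start/fd_end.
+ */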
+ *fd_size_ptr = fd_size[0];
+ *min_st_offset_ptr = min_st_offset;
+
+ H5MM_free (fd_size);
+}
+
+/*-------------------------------------------------------------------------
+ * Function: IO_Thread_Func
+ *
+ * Purpose: Function run in a separate thread to perform the file read
+ * or write while the main thread performs data aggregation -
+ * useful only when multiple rounds are needed, due to the file
+ * size relative to the I/O buffer size and number of aggregators.
+ *
+ * Return: Void.
+ *
+ *-------------------------------------------------------------------------
+ */
+void *IO_Thread_Func(void *vptr_args) {
+ ThreadFuncData *args = (ThreadFuncData*)vptr_args;
+#ifdef onesidedtrace
+ printf("Rank %d - In IO_Thread_Func.\n", args->myrank);
+ fflush(stdout);
+#endif
+ if (args->size > 0) {
+ if (args->io_kind == READ_CA) {
+ args->error_code = MPI_File_read_at(args->fh, args->offset, args->buf, args->size, MPI_BYTE, MPI_STATUS_IGNORE);
+ } else {
+ args->error_code = MPI_File_write_at(args->fh, args->offset, args->buf, args->size, MPI_BYTE, MPI_STATUS_IGNORE);
+ }
+#ifdef onesidedtrace
+ int eclass, len;
+ char estring[MPI_MAX_ERROR_STRING];
+ MPI_Error_class(args->error_code, &eclass);
+ MPI_Error_string(args->error_code, estring, &len);
+ printf("Rank %d - Leaving IO_Thread_Func with CODE %d: %s (int: %d) (offset=%d,size=%d)\n", args->myrank, eclass, estring, args->error_code, args->offset, args->size);
+ fflush(stdout);
+#endif
+ } else {
+ args->error_code = 0;
+#ifdef onesidedtrace
+ printf("Rank %d - WARNING: Leaving IO_Thread_Func WITHOUT doing IO OP (size = %d)\n", args->myrank, args->size);
+ fflush(stdout);
+#endif
+ }
+ pthread_exit(&(args->error_code));
+ return NULL;
+}
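+
+/*
+ * Illustrative usage sketch (not part of the library code): the aggregation
+ * path is expected to run one round of file I/O in a worker thread while the
+ * main thread packs/exchanges data for the next round, e.g.
+ *
+ * ThreadFuncData td;
+ * pthread_t tid;
+ * td.fh = fh; td.offset = off; td.buf = buf; td.size = nbytes;
+ * td.io_kind = READ_CA; td.myrank = myrank;
+ * pthread_create(&tid, NULL, IO_Thread_Func, &td);
+ * ... aggregate / exchange data here ...
+ * pthread_join(tid, NULL); -- td.error_code then holds the MPI return code
+ */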
+
+#endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5FDmpio_topology.h b/src/H5FDmpio_topology.h
new file mode 100644
index 0000000..5011d9b
--- /dev/null
+++ b/src/H5FDmpio_topology.h
@@ -0,0 +1,922 @@
+
+/*
+ * Programmers: Richard Zamora <rzamora@anl.gov>
+ * August 2018, (last modified: December 11th, 2018)
+ *
+ * Francois Tessier <ftessier@cscs.ch>
+ * August 2018
+ *
+ * Purpose: This is the topology API, which can be used to select optimal
+ * aggregator ranks for collective IO operations.
+ *
+ */
+
+/*********************/
+/* Define Statements */
+/*********************/
+
+/*
+ * HOWTO Information:
+ *
+ * To add a new system, add a new #ifdef block below. At a minimum, you must
+ * define the NETWORK_BANDWIDTH and NETWORK_LATENCY. You should also add
+ * '#include' statements for any libraries you need to implement
+ * machine-specific code in `distance_between_ranks()` and in
+ * `distance_to_io_node()`. You will probably also need to use/modify the
+ * `rank_to_coordinates()` function to actually calculate rank-rank hop
+ * distances.
+ *
+ * For systems with PMI support (e.g. #include <pmi.h> is available), follow
+ * the THETA example for calculating rank-rank distances.
+ *
+ */
+
+/* Machine-specific Defs/Includes for Theta Cray XC40 (@ALCF) */
+#ifdef THETA
+#include <pmi.h>
+#include <lustre/lustreapi.h> /* Not used in this version of the API */
+#include <lustre/lustre_user.h> /* Not used in this version of the API */
+#define LNETS_PER_OST 7
+#define MAX_IONODES 392 /* 56 OSTs * 7 LNET */
+#define NETWORK_BANDWIDTH 1800000
+#define NETWORK_LATENCY 30
+
+/* Machine-specific Defs/Includes for IBM BG/Q Mira/Vesta (@ALCF) */
+#elif defined (BGQ)
+#include <spi/include/kernel/location.h>
+#include <spi/include/kernel/process.h>
+#include <spi/include/kernel/memory.h>
+#include <firmware/include/personality.h>
+#include <hwi/include/bqc/nd_500_dcr.h>
+#include <mpix.h>
+#define NETWORK_BANDWIDTH 1800000
+#define NETWORK_LATENCY 30
+
+/* Default Machine Defs */
+#else
+#define NETWORK_BANDWIDTH 1800000
+#define NETWORK_LATENCY 30
+
+#endif
+
+#define TMIN(a,b) (((a)<(b))?(a):(b))
+#define TMAX(a,b) (((a)>(b))?(a):(b))
+#define LARGE_PENALTY 10000000.0
+#define SMALL_PENALTY 10000.0 /* Note: This penalty is currently arbitrary */
+#define MAX_STR 1024
+#define DBGRANKS 1 // Only shows ranklist on rank==0 if DBGRANKS==0
+//#define topo_debug
+
+/*
+ * MPI_CHECK_H5 will display a custom error message as well as an error string
+ * from the MPI_STATUS and then exit the program. This macro is borrowed
+ * directly from the HPC-IOR code
+ */
+#define MPI_CHECK_H5(MPI_STATUS, MSG) do { \
+ char resultString[MPI_MAX_ERROR_STRING]; \
+ int resultLength; \
+ \
+ if (MPI_STATUS != MPI_SUCCESS) { \
+ MPI_Error_string(MPI_STATUS, resultString, &resultLength); \
+ fprintf(stdout, "ior ERROR: %s, MPI %s, (%s:%d)\n", \
+ MSG, resultString, __FILE__, __LINE__); \
+ fflush(stdout); \
+ MPI_Abort(MPI_COMM_WORLD, -1); \
+ } \
+} while(0)
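+
+/*
+ * Example usage (illustrative): wrap any MPI call whose failure should abort
+ * the run, e.g.
+ * MPI_CHECK_H5(MPI_Barrier(comm), "barrier before aggregator selection failed");
+ */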
+
+
+/*********************/
+/* Special Type Defs */
+/*********************/
+
+
+typedef struct cost cost;
+
+struct cost {
+ double cost;
+ int rank;
+};
+
+/*
+ * AGGSelect is used to select the desired aggregation selection routine
+ * DATA -> Try to maximize data-movement bandwidth
+ * SPREAD -> Spread out aggregators using topology information
+ * STRIDED -> Spread out aggregators according to a given stride (using the rank IDs)
+ * RANDOM -> Use random rank selection for aggregator placement
+ */
+enum AGGSelect{DEFAULT, DATA, SPREAD, STRIDED, RANDOM};
+
+
+/************************/
+/* Function Definitions */
+/************************/
+
+
+/*-------------------------------------------------------------------------
+ * Function: CountProcsPerNode
+ *
+ * Purpose: Count the number of mpi procs that share a host.
+ *
+ * Return: The count (int)
+ *
+ * NOTE: This also assumes that the task count on all nodes is equal
+ * to the task count on the host running MPI task 0.
+ *
+ *-------------------------------------------------------------------------
+ */
+static int CountProcsPerNode(int numTasks, int rank, MPI_Comm comm)
+{
+ char localhost[MAX_STR];
+ char hostname0[MAX_STR];
+ unsigned count;
+ unsigned flag;
+ int rc;
+
+ rc = gethostname(localhost, MAX_STR);
+ if (rc == -1) {
+ /* This node won't match task 0's hostname...except in the
+ * case where ALL gethostname() calls fail, in which
+ * case ALL nodes will appear to be on the same node.
+ * We'll handle that later. */
+ localhost[0] = '\0';
+ if (rank == 0) perror("gethostname() failed");
+ }
+
+ /* send task 0's hostname to all tasks */
+ if (rank == 0) strcpy(hostname0, localhost);
+ MPI_CHECK_H5(MPI_Bcast(hostname0, MAX_STR, MPI_CHAR, 0, comm), "broadcast of task 0's hostname failed");
+ if (strcmp(hostname0, localhost) == 0) flag = 1;
+ else flag = 0;
+
+ /* count the tasks that share the same host as task 0 */
+ MPI_Allreduce(&flag, &count, 1, MPI_UNSIGNED, MPI_SUM, comm);
+
+ if (hostname0[0] == '\0') count = 1;
+ return (int)count;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: network_bandwidth
+ *
+ * Purpose: system-dependent (hard-coded) network bandwidth (bytes/ms)
+ *
+ * Return: Bandwidth (int64_t) in bytes/ms
+ *
+ *-------------------------------------------------------------------------
+ */
+int64_t network_bandwidth () {
+ return NETWORK_BANDWIDTH;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: network_latency
+ *
+ * Purpose: system-dependent (hard-coded) network latency (ms)
+ *
+ * Return: Latency (int64_t) in milliseconds
+ *
+ *-------------------------------------------------------------------------
+ */
+int64_t network_latency () {
+ return NETWORK_LATENCY;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: rank_to_coordinates
+ *
+ * Purpose: Given the rank, return the topology coordinates
+ *
+ * Return: int* coord array pointer
+ *
+ *-------------------------------------------------------------------------
+ */
+void rank_to_coordinates ( int rank, int* coord ) {
+#ifdef THETA
+ pmi_mesh_coord_t xyz;
+ int nid;
+
+ /* Hypothesis : PMI_rank == MPI_rank */
+ PMI_Get_nid(rank, &nid);
+ PMI_Get_meshcoord((pmi_nid_t) nid, &xyz);
+
+ coord[0] = xyz.mesh_x;
+ coord[1] = xyz.mesh_y;
+ coord[2] = xyz.mesh_z;
+ coord[3] = nid;
+ coord[4] = sched_getcpu();
+#elif defined (BGQ)
+ MPIX_Rank2torus( rank, coord );
+#endif
+}
+
+/*-------------------------------------------------------------------------
+ * Function: distance_between_ranks
+ *
+ * Purpose: Given two ranks, return the number of hops a message needs
+ * to take to travel between them.
+ *
+ * Return: int distance (number of hops between ranks)
+ *
+ *-------------------------------------------------------------------------
+ */
+int distance_between_ranks ( int src_rank, int dest_rank, int ppn, int pps ) {
+ int distance = 0;
+
+#ifdef THETA
+ int dim = 4, d;
+ int src_coord[dim], dest_coord[dim];
+
+ rank_to_coordinates ( src_rank, src_coord );
+ rank_to_coordinates ( dest_rank, dest_coord );
+
+ for ( d = 0; d < dim; d++ ) {
+ if ( src_coord[d] != dest_coord[d] )
+ distance++;
+ }
+#elif defined (BGQ)
+ int dim=6, d, hops;
+ int src_coord[6], dest_coord[6];
+ MPIX_Hardware_t hw;
+
+ rank_to_coordinates ( src_rank, src_coord );
+ rank_to_coordinates ( dest_rank, dest_coord );
+
+ MPIX_Hardware( &hw );
+ //dim = hw.torus_dimension; // Should return "6"
+
+ /* Note: don't count the last dimension; it refers to cores on the same node */
+ for ( d = 0; d < dim-1; d++ ) {
+ hops = abs ( dest_coord[d] - src_coord[d] );
+ if ( hw.isTorus[d] == 1 )
+ hops = TMIN ( hops, (int)hw.Size[d] - hops );
+ distance += hops;
+ }
+#else
+ /*
+ * If we don't have topology information, but do know ppn & pps (per socket),
+ * just assume simple rank ordering.
+ * Assume 2 hops between nodes & 1 between sockets.
+ */
+ int same_node = 1;
+ int same_soc = 0;
+ if (ppn > 0) {
+ int ind_i = (src_rank / ppn) * ppn;
+ int ind_f = (src_rank / ppn) * ppn + ppn;
+ if ( (dest_rank < ind_i) || (dest_rank >= ind_f) ) {
+ // NOT Inside same 'node'
+ same_node = 0;
+ }
+ }
+ if (same_node && (pps > 0)) {
+ int ind_i = (src_rank / pps) * pps;
+ int ind_m = (src_rank / pps) * pps + pps;
+ if ( (dest_rank >= ind_i) && (dest_rank < ind_m) ) same_soc = 1;
+ }
+ if (!same_soc) distance++;
+ if (!same_node) distance++;
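+
+ /*
+ * Example for this fallback heuristic: with ppn = 4 and pps = 2,
+ * ranks 0 and 1 share a socket -> distance 0;
+ * ranks 0 and 2 share a node but not a socket -> distance 1;
+ * ranks 0 and 5 are on different nodes -> distance 2.
+ */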
+#endif
+
+ return distance;
+}
+
+/***********************/
+/* THETA-Specific Defs */
+/***********************/
+
+#ifdef THETA
+
+ /*-------------------------------------------------------------------------
+ * Function: fgr_to_lnets
+ *
+ * Purpose: Given a formatted OST label, return the possible lnet nodes.
+ *
+ * Return: int *lnet is populated.
+ *
+ *-------------------------------------------------------------------------
+ */
+ void fgr_to_lnets ( char *fgr_id, int *lnet ) {
+ int count = 0;
+ FILE *fp;
+ char fline[100];
+ char *lnet_list, *item;
+
+ fp = fopen("/etc/lnet/routes.conf", "r");
+
+ if ( fp == NULL ) {
+ fprintf ( stdout, "[ERROR] Error while opening routes.conf file!\n" );
+ return;
+ }
+
+ while ( fgets ( fline, 100, fp ) != NULL ) {
+
+ const char *c = strstr ( fline, fgr_id );
+
+ if ( c != NULL ) {
+ const char *b1 = strstr ( fline, "[" ) + 1;
+ const char *b2 = strstr ( fline, "]" );
+ lnet_list = ( char * ) malloc ( sizeof ( char ) * ( b2 - b1 + 1 ) );
+ strncpy ( lnet_list, b1, b2 - b1 );
+ lnet_list [ b2 - b1 ] = '\0'; /* strncpy does not NUL-terminate here */
+ item = strtok ( lnet_list, "," );
+
+ while ( item ) {
+ lnet [ count ] = atoi ( item );
+ item = strtok ( 0, "," );
+ count++;
+ }
+
+ free ( lnet_list );
+ }
+ count = 0;
+ }
+
+ fclose ( fp );
+ return;
+ }
+
+ /*-------------------------------------------------------------------------
+ * Function: io_nodes_per_file
+ *
+ * Purpose: Given a file name, determine the LNET nodes.
+ *
+ * Return: Number of LNET nodes responsible for file.
+ * Populate (int *) nodesList with the LNET nodes.
+ *
+ * Note: Assume 7 LNET nodes per OST
+ *
+ *-------------------------------------------------------------------------
+ */
+ int io_nodes_per_file ( char* filename, int *nodesList ) {
+ int err, stripeCount, nLnets, i, idx, oid, l;
+ char fgrId [20];
+ int *ssuId, *ostId, *lnets;
+ struct find_param param = { 0 };
+ int ssu2fgr [] = { 0, 0, 0, 0,
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
+ 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+ 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9};
+
+ err = llapi_getstripe ( filename, &param );
+ if ( err )
+ fprintf ( stdout, "[ERROR] llapi_getstripe\n");
+
+ stripeCount = (&param)->fp_lmd->lmd_lmm.lmm_stripe_count;
+ nLnets = stripeCount * LNETS_PER_OST;
+
+ ssuId = (int *) malloc ( stripeCount * sizeof ( int ) );
+ ostId = (int *) malloc ( stripeCount * sizeof ( int ) );
+
+ /* Hypothesis : OSS id == SNX - 4 */
+ for ( i = 0; i < stripeCount; i++ ) {
+ idx = (&param)->fp_lmd->lmd_lmm.lmm_objects[i].l_ost_idx;
+ ssuId[i] = idx + 4;
+ lnets = (int *) malloc ( LNETS_PER_OST * sizeof ( int ) );
+
+ snprintf ( fgrId, 20, "o2ib100%d", ssu2fgr[ ssuId[i] ] );
+
+ fgr_to_lnets ( fgrId, lnets );
+
+ for ( l = 0; l < LNETS_PER_OST; l++ )
+ nodesList [ i * LNETS_PER_OST + l ] = lnets [ l ];
+
+ free ( lnets );
+ }
+
+ return nLnets;
+ }
+
+
+#endif
+
+
+/*-------------------------------------------------------------------------
+ * Function: distance_to_io_node
+ *
+ * Purpose: Given a rank, determine distance to the nearest IO node.
+ *
+ * Return: Number of hops needed to reach the nearest IO node.
+ *
+ *-------------------------------------------------------------------------
+ */
+int distance_to_io_node ( int src_rank ) {
+#ifdef THETAIO
+
+ /*
+ * On Theta/LUSTRE, each OST will be served by 7 different IO nodes. This
+ * means the writing/reading of a specific stripe (on a specific OST) will
+ * require the aggregator to interact with up to 7 different IO nodes
+ * during the course of a collective I/O operation.
+ *
+ * The following code cannot be used to actually determine the exact/avg
+ * IO-node distance, but it is included in case we get ideas later. :)
+ */
+ int nodesList[MAX_IONODES];
+ int n_lnets, i;
+ n_lnets = io_nodes_per_file ( "/lus/theta-fs0/projects/datascience/rzamora/topology/1D-ARRAY-00000000.dat", nodesList );
+ for ( i = 0; i < n_lnets; i++ ) {
+ fprintf (stdout, "%d ", nodesList[i]);
+ }
+
+ /*
+ * Note: This function needs to be improved to actually calculate the distance to IO nodes...
+ * For now, just setting the distance to 1:
+ */
+ return 1;
+
+#elif defined (BGQ)
+ return MPIX_IO_distance ();
+
+#endif
+
+ /* Default Value */
+ return 1;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: topology_aware_list_serial
+ *
+ * Purpose: Given a `tally` array (of bytes needed to/from each of nb_aggr
+ * aggregators) on each rank, determine optimal list of ranks
+ * to act as the aggregators (agg_list).
+ *
+ * Return: 0 == Success. Populates int* agg_list
+ *
+ * Note: "Serial" label refers to the fact that agg_list must be
+ * populated one at a time (for now).
+ *
+ *-------------------------------------------------------------------------
+ */
+int topology_aware_list_serial ( int64_t* tally, int64_t nb_aggr, int* agg_list, int ppn, int pps, MPI_Comm comm )
+{
+ int i, r, agg_ind, aggr_nprocs, nprocs, latency, bandwidth, distance_to_io, distance, rank;
+ int agg_to_calc, aggr_comm_rank, aggr_comm_size, ranks_per_agg, ind_i, ind_m, ind_f;
+ MPI_Comm_rank ( comm, &rank );
+ MPI_Comm_size ( comm, &nprocs );
+ int64_t *data_distribution;
+ double base_cost_penalty = 1;
+ int trim_thresh = 1;
+ int min_stride;
+ int *world_ranks;
+ cost aggr_cost, min_cost;
+
+ latency = network_latency (); /* hard-coded network latency (ms) */
+ bandwidth = network_bandwidth (); /* hard-coded network bandwidth (bytes/ms) */
+ data_distribution = (int64_t *) malloc (nprocs * sizeof(int64_t));
+ world_ranks = (int *) malloc (nprocs * sizeof(int));
+ min_stride = nprocs / nb_aggr;
+
+ /* Loop through the aggregators (this is the `serial` part) */
+ for (agg_ind=0; agg_ind<nb_aggr; agg_ind++ ) {
+
+ aggr_cost.cost = base_cost_penalty;
+ aggr_cost.rank = rank;
+ min_cost.cost = 0.0;
+ min_cost.rank = 0;
+
+ /* All-reduce the structures needed to calculate cost for sending data */
+ MPI_Allgather ( &tally[ agg_ind ], 1, MPI_LONG_LONG, data_distribution, 1, MPI_LONG_LONG, comm );
+ MPI_Allgather ( &rank, 1, MPI_INT, world_ranks, 1, MPI_INT, comm );
+
+ /*
+ * Now we can trim the data a bit.
+ * Note: This is currently turned off.. (see if(0))
+ */
+ aggr_nprocs = nprocs;
+ if (0 && trim_thresh > 0) {
+ for (r = nprocs-1; r >= 0; r-- ) {
+ if (data_distribution[r] < trim_thresh) {
+ aggr_nprocs--;
+ for (i = r; i < aggr_nprocs; i++) {
+ data_distribution[i] = data_distribution[i+1];
+ world_ranks[i] = world_ranks[i+1];
+ }
+ }
+ }
+ }
+
+ /* Compute the cost of aggregating data from the other ranks */
+ for (r = 0; r < aggr_nprocs; r++ ) {
+ if ( (rank != world_ranks[r]) && (data_distribution[r] > 0)) {
+ distance = distance_between_ranks ( rank, world_ranks[r], ppn, pps );
+ //printf("Rank %d - agg_ind = %d r = %d distance = %d \n", rank, agg_ind, r , distance);
+ aggr_cost.cost += ( distance * latency + data_distribution[r] / bandwidth );
+ }
+ }
+ distance_to_io = distance_to_io_node ( rank );
+ aggr_cost.cost += distance_to_io * latency;
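+
+ /*
+ * In summary, the cost evaluated on each candidate rank is:
+ * cost(rank) = penalty + sum_r( d(rank,r)*latency + bytes_r/bandwidth )
+ * + d_io(rank)*latency
+ * where d() is the hop distance and bytes_r is the data that rank r
+ * would exchange with this aggregator.
+ */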
+
+ /* If "this" rank was selected as the "strided" initial list, give it slight preference */
+ if ( aggr_cost.rank == agg_list[ agg_ind ]) aggr_cost.cost-=latency;
+
+ /* Determine the aggr with minimum cost */
+ //printf("agg_ind = %d aggr_cost.rank = %d aggr_cost.cost = %f\n",agg_ind,aggr_cost.rank,aggr_cost.cost);
+ MPI_Allreduce ( &aggr_cost, &min_cost, 1, MPI_DOUBLE_INT, MPI_MINLOC, comm );
+ agg_list[ agg_ind ] = min_cost.rank;
+ //if (rank == 0) printf("agg_ind = %d min_cost.rank = %d min_cost.cost = %f\n",agg_ind,min_cost.rank,min_cost.cost);
+
+ //printf("agg_ind = %d rank = %d base_cost_penalty = %f aggr_cost.cost = %f\n",agg_ind,rank,base_cost_penalty,aggr_cost.cost);
+
+ /*
+ * Increase `base_cost_penalty` for the rank that was just chosen.
+ * Add a smaller penalty if this rank is topologically very close
+ * (distance < 1 hop) to the last selection.
+ */
+ if (min_cost.rank == rank)
+ base_cost_penalty += LARGE_PENALTY;
+ else {
+ distance = distance_between_ranks ( rank, min_cost.rank, ppn, pps );
+ if (distance < 1)
+ base_cost_penalty += SMALL_PENALTY;
+ }
+
+ }
+
+ free(data_distribution);
+ free(world_ranks);
+ return 0;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: add_chunk
+ *
+ * Purpose: Helper function to recursively populate an array of byte
+ * quantities, with each index corresponding to the aggregator
+ * index where that quantity of data will be read from or written to.
+ *
+ * Return: 0 == Success. Populates int64_t* tally
+ *
+ *-------------------------------------------------------------------------
+ */
+int add_chunk ( int64_t datalen, int64_t offset, int64_t buffer_size, int64_t nb_aggr, int64_t* tally )
+{
+
+ int64_t round_offset = 0;
+ int64_t agg_offset = 0;
+ int64_t amount_add = 0;
+ int64_t amount_left = 0;
+
+ round_offset = offset % (nb_aggr * buffer_size) ; /* Position of the start of the chunk wrt the round */
+ agg_offset = round_offset / buffer_size ; /* Which aggregator owns the start of this chunk */
+ amount_add = buffer_size - (round_offset % buffer_size) ; /* How many bytes belong in the starting agg */
+ if (amount_add > datalen) { /* chunk ends within the same agg */
+ amount_add = datalen;
+ } else {
+ amount_left = datalen - amount_add;
+ if (amount_left > 0) {
+ offset += amount_add;
+ datalen-= amount_add;
+ add_chunk ( datalen, offset, buffer_size, nb_aggr, tally );
+ }
+ }
+ tally[ agg_offset ] += amount_add;
+ return 0;
+}
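+
+/*
+ * Example of the intended mapping: with buffer_size = 16 MiB and nb_aggr = 2,
+ * a 24 MiB chunk starting at offset 8 MiB contributes 8 MiB to aggregator 0
+ * (the remainder of its buffer for that round) and 16 MiB to aggregator 1.
+ */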
+
+/*-------------------------------------------------------------------------
+ * Function: get_cb_config_list
+ *
+ * Purpose: Generate a ROMIO cb_config_list hint string to select an
+ * optimal list of aggregator nodes.
+ *
+ * Return: 0 == Success. Populates char* hint_str
+ *
+ * Note: This will only work if there is one rank per node, and fewer
+ * aggregators than nodes.
+ * (and if the cb_config_list hint is not ignored)
+ *
+ * Assumption:
+ * ---------------------------------------------------------------
+ * | agg 1 round 1 | agg 2 round 1 | agg 1 round 2 | agg 2 round 2 |
+ * ---------------------------------------------------------------
+ *
+ *-------------------------------------------------------------------------
+ */
+int get_cb_config_list ( int64_t* data_lens, int64_t* offsets, int data_len, char* hint_str, int64_t buffer_size, int64_t nb_aggr, MPI_Comm comm )
+{
+ int rank, nprocs, i, r, resultlen;
+ int* agg_list;
+ int64_t *data_to_send_per_aggr;
+ char name[MPI_MAX_PROCESSOR_NAME];
+ char name_buf[MPI_MAX_PROCESSOR_NAME];
+ char* cb_reverse = getenv("HDF5_CB_REV");
+ MPI_Comm_rank ( comm, &rank );
+ MPI_Comm_size ( comm, &nprocs );
+ MPI_Get_processor_name( name, &resultlen );
+ int ppn = CountProcsPerNode(nprocs, rank, comm);
+ int pps = ppn;
+
+ /* Tally data quantities associated with each aggregator */
+ data_to_send_per_aggr = (int64_t *) calloc (nb_aggr, sizeof (int64_t));
+ for ( r = 0; r < data_len; r++ ) {
+ add_chunk ( data_lens[r], offsets[r], buffer_size, nb_aggr, data_to_send_per_aggr );
+ }
+
+ /* Generate topology-aware list of aggregators */
+ agg_list = (int *) calloc (nprocs, sizeof (int));
+ topology_aware_list_serial( data_to_send_per_aggr, nb_aggr, agg_list, ppn, pps, comm );
+
+ /* Optionally reverse the order of the agg list (HDF5_CB_REV=yes) */
+ if ( cb_reverse && (strcmp(cb_reverse,"yes") == 0) ) {
+ for (i=0, r=nb_aggr-1; i<r; i++,r--) {
+ int tmp0 = agg_list[i];
+ agg_list[i] = agg_list[r];
+ agg_list[r] = tmp0;
+ }
+ }
+
+ for ( r = 0; r < nb_aggr; r++ ) {
+ if (rank==0) printf("agg_list[%d] = %d\n",r,agg_list[r]);
+ strcpy(name_buf, name);
+ MPI_Bcast ( name_buf, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, agg_list[r], comm );
+ if (r==0)
+ sprintf(hint_str,"%s",name_buf);
+ else {
+ /* strcat avoids passing hint_str as both source and destination of sprintf */
+ strcat(hint_str,",");
+ strcat(hint_str,name_buf);
+ }
+ }
+ free(agg_list);
+ free(data_to_send_per_aggr);
+
+ return 0;
+}
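+
+/*
+ * The resulting hint_str is a comma-separated list of the selected aggregator
+ * host names, e.g. "nid00012,nid00044" (illustrative names), in the form
+ * expected for ROMIO's cb_config_list hint.
+ */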
+
+
+/*-------------------------------------------------------------------------
+ * Function: get_ranklist_spread
+ *
+ * Purpose: Just start with a simple rank-based spacing between aggs, then
+ * try to increase actual minimum distance between any two aggs.
+ *
+ * Return: 0 == Success. Populates int* agg_list
+ *
+ *-------------------------------------------------------------------------
+ */
+int get_ranklist_spread ( int64_t nb_aggr, int* agg_list, int ppn, int pps, MPI_Comm comm )
+{
+ int i, r, agg_ind, distance, nprocs, rank, stride;
+ cost aggr_cost, max_cost;
+
+ MPI_Comm_rank ( comm, &rank );
+ MPI_Comm_size ( comm, &nprocs );
+
+ /* Start with a simple aggregator placement */
+ stride = nprocs / nb_aggr;
+ for (agg_ind=0; agg_ind<nb_aggr; agg_ind++ ) {
+ agg_list[ agg_ind ] = agg_ind * stride;
+ }
+
+ /*
+ * Loop through the aggs and adjust each one to MAXIMIZE the
+ * minimum distance to another aggregator...
+ */
+ for (agg_ind=0; agg_ind<nb_aggr; agg_ind++ ) {
+
+ aggr_cost.cost = -1;
+ aggr_cost.rank = rank;
+ max_cost.cost = 0.0;
+ max_cost.rank = 0;
+ //if ((rank == 0)) printf("starting with agg_rank = %d for agg_ind = %d.\n", agg_list[ agg_ind ], agg_ind);
+
+ /* Compute the "cost" (min distance) to other aggs */
+ for (r = 0; r < nb_aggr; r++ ) {
+ if (r != agg_ind) {
+ if (aggr_cost.rank == agg_list[ r ]) {
+ aggr_cost.cost = 0;
+ //printf("agg_ind = %d, rank %d cannot be chosen because its agg_ind %d already.\n", agg_ind, rank , r);
+ break;
+ }
+ distance = distance_between_ranks ( aggr_cost.rank, agg_list[ r ], ppn, pps );
+ //printf("agg_ind = %d, rank %d is distance=%d from agg %d.\n", agg_ind, aggr_cost.rank, distance, r);
+ if (aggr_cost.cost < 0)
+ aggr_cost.cost = distance;
+ else
+ aggr_cost.cost = TMIN( distance, aggr_cost.cost );
+ }
+ }
+ if (aggr_cost.rank == agg_list[ agg_ind ]) aggr_cost.cost++; // Don't change agg if we are already good...
+
+ /* Determine the aggr with MAXIMUM cost (we want to maximize min distance) */
+ MPI_Allreduce ( &aggr_cost, &max_cost, 1, MPI_DOUBLE_INT, MPI_MAXLOC, comm );
+ agg_list[ agg_ind ] = max_cost.rank;
+ //printf("agg_ind = %d aggr_cost.rank = %d aggr_cost.cost = %f, max_cost.rank = %d, max_cost.cost = %f \n", agg_ind, aggr_cost.rank, aggr_cost.cost, max_cost.rank, max_cost.cost);
+
+ }
+
+ return 0;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: get_ranklist_random
+ *
+ * Purpose: Just return a random selection of ranks
+ *
+ * Return: 0 == Success. Populates int* agg_list
+ *
+ *-------------------------------------------------------------------------
+ */
+int get_ranklist_random ( int64_t nb_aggr, int* agg_list, MPI_Comm comm )
+{
+ int i, r, good, agg_ind, rank, nprocs;
+ MPI_Comm_rank ( comm, &rank );
+ MPI_Comm_size ( comm, &nprocs );
+
+ /* Use rank-0 to create a random agg placement */
+ if (rank == 0) {
+ srand(time(NULL)); /* Initialization, should only be called once. */
+ for (agg_ind=0; agg_ind<nb_aggr; agg_ind++ ) {
+ while (1) {
+ good = 1;
+ r = rand() % nprocs;
+ for (i=0; i<agg_ind; i++) {
+ if ((r == agg_list[i]) || (r < 0) || (r > (nprocs-1))) {
+ good = 0;
+ break;
+ }
+ }
+ if (good == 1) break;
+ }
+ agg_list[ agg_ind ] = r;
+ }
+ }
+ /* Bcast random list to other ranks */
+ MPI_Bcast(&agg_list[0], nb_aggr, MPI_INT, 0, comm);
+ return 0;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: get_ranklist_strided
+ *
+ * Purpose: Just return a strided selection of ranks
+ *
+ * Return: 0 == Success. Populates int* agg_list
+ *
+ *-------------------------------------------------------------------------
+ */
+int get_ranklist_strided ( int64_t nb_aggr, int* agg_list, int stride, MPI_Comm comm )
+{
+ int agg_ind, nprocs;
+ MPI_Comm_size ( comm, &nprocs );
+ if (stride < 1) stride = nprocs / nb_aggr;
+ /* Simple strided agg placement (no rank-0 step needed) */
+ for (agg_ind=0; agg_ind<nb_aggr; agg_ind++ ) {
+ agg_list[ agg_ind ] = agg_ind * stride;
+ }
+ return 0;
+}
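+
+/*
+ * Example: with nprocs = 16, nb_aggr = 4 and stride < 1, the stride defaults
+ * to 16/4 = 4 and agg_list becomes { 0, 4, 8, 12 }.
+ */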
+
+/*-------------------------------------------------------------------------
+ * Function: topology_aware_ranklist
+ *
+ * Purpose: Given a data pattern to read or write, generate an optimal
+ * list of aggregator ranks. Actual routine depends on the
+ * 'AGGSelect select_type' argument...
+ *
+ * DATA -> Try to maximize data-movement bandwidth
+ * SPREAD -> Spread out aggregators using topology information
+ * STRIDED -> Spread out aggregators according to a given stride (using the rank IDs)
+ * RANDOM -> Use random rank selection for aggregator placement
+ *
+ *
+ * fd_mapping == 0 Assumption:
+ * ---------------------------------------------------------------
+ * | agg 1 round 1 | agg 2 round 1 | agg 1 round 2 | agg 2 round 2 |
+ * ---------------------------------------------------------------
+ *
+ * fd_mapping == 1 Assumption (Use contiguous file domains for each agg):
+ * ---------------------------------------------------------------
+ * | agg 1 round 1 | agg 1 round 2 | agg 2 round 1 | agg 2 round 2 |
+ * ---------------------------------------------------------------
+ *
+ * Return: 0 == Success. Populates int *ranklist
+ *
+ *-------------------------------------------------------------------------
+ */
+int topology_aware_ranklist ( int64_t* data_lens, int64_t* offsets, int data_len,
+ int *ranklist, int64_t buffer_size, int64_t nb_aggr, int ppn, int pps,
+ int stride, MPI_Comm comm, enum AGGSelect select_type, int fd_mapping )
+{
+ int r;
+ int64_t *data_to_send_per_aggr;
+ double min_off_g, max_off_g, min_off_l, max_off_l;
+ int64_t min_off, max_off, off;
+ int64_t st_agg, in_0, in_1;
+
+#ifdef topo_debug
+ int rank, myrank, nprocs;
+ MPI_Comm_rank ( comm, &myrank );
+ MPI_Comm_size ( comm, &nprocs );
+#endif
+
+ switch(select_type) {
+
+ case DATA :
+ {
+ /* Tally data quantities associated with each aggregator */
+ data_to_send_per_aggr = (int64_t *) calloc (nb_aggr, sizeof (int64_t));
+
+ if (fd_mapping==1) { /* GPFS-style mapping */
+
+ /* get local min and max offsets */
+
+ min_off = offsets[0];
+ max_off = offsets[0] + data_lens[0];
+ for ( r = 1; r < data_len; r++ ) {
+ if (offsets[r] < min_off)
+ min_off = offsets[r];
+ off = offsets[r] + data_lens[r];
+ if (off > max_off)
+ max_off = off;
+ }
+ min_off_l = (double) min_off;
+ max_off_l = (double) max_off;
+
+ /* Use allreduce to get global min and max offsets */
+ MPI_Allreduce ( &min_off_l, &min_off_g, 1, MPI_DOUBLE, MPI_MIN, comm );
+ MPI_Allreduce ( &max_off_l, &max_off_g, 1, MPI_DOUBLE, MPI_MAX, comm );
+
+ min_off = (int64_t) min_off_g;
+ max_off = (int64_t) max_off_g;
+
+#ifdef topo_debug
+ if (DBGRANKS > 0) {
+ for (rank=0;rank<TMIN(nprocs,DBGRANKS);rank++) {
+ if (rank == myrank) {
+ printf("Rank %d - (min_off_l=%ld, max_off_l=%ld) (min_off=%ld, max_off=%ld):", myrank, (int64_t) min_off_l, (int64_t) max_off_l, min_off, max_off);
+ for (r=0;r<data_len;r++)
+ printf(" [%ld -> %ld]", offsets[r], offsets[r]+data_lens[r] );
+ printf("\n");
+ fflush(stdout);
+ }
+ MPI_Barrier(comm);
+ }
+ }
+#endif
+
+ /* Loop through data to add counts to known file domains */
+ int64_t fd_size = (max_off - min_off) / nb_aggr;
+ for ( r = 0; r < data_len; r++ ) {
+ st_agg = (offsets[r] - min_off) / fd_size;
+ //in_0 = (offsets[r]-min_off) % fd_size;
+ in_0 = ((st_agg + 1) * fd_size) - (offsets[r] - min_off);
+ in_0 = TMIN ( in_0, data_lens[r] );
+ in_1 = TMAX(0, data_lens[r] - in_0);
+ data_to_send_per_aggr[ (int) st_agg ] += in_0;
+ data_to_send_per_aggr[ (int) ((st_agg+1)%nb_aggr) ] += in_1;
+ }
+
+ } else { /* LUSTRE-style mapping */
+ for ( r = 0; r < data_len; r++ ) {
+ add_chunk ( data_lens[r], offsets[r], buffer_size, nb_aggr, data_to_send_per_aggr );
+ }
+ }
+
+ /* Generate topology-aware list of aggregators */
+ topology_aware_list_serial( data_to_send_per_aggr, nb_aggr, ranklist, ppn, pps, comm );
+ break;
+ }
+ case SPREAD :
+ {
+ /* Generate spread-out list of aggregators */
+ get_ranklist_spread ( nb_aggr, ranklist, ppn, pps, comm );
+ break;
+ }
+ case STRIDED :
+ {
+ /* Generate constant-strided list of aggregators */
+ get_ranklist_strided ( nb_aggr, ranklist, stride, comm );
+ break;
+ }
+ case RANDOM :
+ {
+ /* Generate random list of aggregators */
+ get_ranklist_random ( nb_aggr, ranklist, comm );
+ break;
+ }
+ default :
+ {
+ /* Default: generate a simple strided list of aggregators */
+ get_ranklist_strided ( nb_aggr, ranklist, 0, comm );
+ }
+
+ }
+
+#ifdef topo_debug
+ if (myrank == 0) {
+ printf("Topology-aware CB Selection (type %d): nb_aggr is %d, and ranklist is:", select_type, nb_aggr);
+ for (r=0;r<nb_aggr;r++)
+ printf(" %d", ranklist[r]);
+ printf("\n");
+ }
+ MPI_Barrier(comm);
+ if ((select_type == DATA) && (DBGRANKS > 0)) {
+ for (rank=0;rank<TMIN(nprocs,DBGRANKS);rank++) {
+ if (rank == myrank) {
+ printf("Rank %d - Data distribution: ", myrank);
+ for (r=0;r<nb_aggr;r++)
+ printf(" %d", data_to_send_per_aggr[r]);
+ printf("\n");
+ fflush(stdout);
+ }
+ MPI_Barrier(comm);
+ }
+ }
+#endif
+
+ if (select_type == DATA) free(data_to_send_per_aggr);
+ return 0;
+}
diff --git a/src/H5FDmulti.c b/src/H5FDmulti.c
index aa1b118..03cad1f 100644
--- a/src/H5FDmulti.c
+++ b/src/H5FDmulti.c
@@ -171,6 +171,8 @@ static const H5FD_class_t H5FD_multi_g = {
H5FD_multi_get_handle, /*get_handle */
H5FD_multi_read, /*read */
H5FD_multi_write, /*write */
+ NULL, /*select_read */
+ NULL, /*select_write */
H5FD_multi_flush, /*flush */
H5FD_multi_truncate, /*truncate */
H5FD_multi_lock, /*lock */
diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h
index c79d676..0534deb 100644
--- a/src/H5FDprivate.h
+++ b/src/H5FDprivate.h
@@ -145,6 +145,8 @@ H5_DLL herr_t H5FD_set_feature_flags(H5FD_t *file, unsigned long feature_flags);
H5_DLL herr_t H5FD_get_fs_type_map(const H5FD_t *file, H5FD_mem_t *type_map);
H5_DLL herr_t H5FD_read(H5FD_t *file, H5FD_mem_t type, haddr_t addr, size_t size, void *buf/*out*/);
H5_DLL herr_t H5FD_write(H5FD_t *file, H5FD_mem_t type, haddr_t addr, size_t size, const void *buf);
+H5_DLL herr_t H5FD_select_read(H5FD_t *file, H5FD_mem_t type, hid_t file_space, hid_t mem_space, size_t elmt_size, haddr_t addr, void *buf/*out*/);
+H5_DLL herr_t H5FD_select_write(H5FD_t *file, H5FD_mem_t type, hid_t file_space, hid_t mem_space, size_t elmt_size, haddr_t addr, const void *buf);
H5_DLL herr_t H5FD_flush(H5FD_t *file, hbool_t closing);
H5_DLL herr_t H5FD_truncate(H5FD_t *file, hbool_t closing);
H5_DLL herr_t H5FD_lock(H5FD_t *file, hbool_t rw);
@@ -178,4 +180,3 @@ H5_DLL herr_t H5FD_get_mpi_info(H5FD_t *file, void** file_info);
#endif /* H5_HAVE_PARALLEL */
#endif /* !_H5FDprivate_H */
-
diff --git a/src/H5FDpublic.h b/src/H5FDpublic.h
index 514d1bf..76f167d 100644
--- a/src/H5FDpublic.h
+++ b/src/H5FDpublic.h
@@ -207,7 +207,7 @@ typedef enum H5F_mem_t H5FD_mem_t;
* the handle for the VFD (returned with the 'get_handle' callback) is
* of type 'int' and is compatible with POSIX I/O calls.
*/
-#define H5FD_FEAT_POSIX_COMPAT_HANDLE 0x00000080
+#define H5FD_FEAT_POSIX_COMPAT_HANDLE 0x00000080
/*
* Defining H5FD_FEAT_HAS_MPI for a VFL driver means that
* the driver makes use of MPI communication and code may retrieve
@@ -220,7 +220,7 @@ typedef enum H5F_mem_t H5FD_mem_t;
* instead of the default H5D_ALLOC_TIME_LATE
*/
#define H5FD_FEAT_ALLOCATE_EARLY 0x00000200
- /*
+ /*
* Defining H5FD_FEAT_ALLOW_FILE_IMAGE for a VFL driver means that
* the driver is able to use a file image in the fapl as the initial
* contents of a file.
@@ -269,8 +269,7 @@ typedef struct H5FD_class_t {
H5F_close_degree_t fc_degree;
herr_t (*terminate)(void);
hsize_t (*sb_size)(H5FD_t *file);
- herr_t (*sb_encode)(H5FD_t *file, char *name/*out*/,
- unsigned char *p/*out*/);
+ herr_t (*sb_encode)(H5FD_t *file, char *name/*out*/, unsigned char *p/*out*/);
herr_t (*sb_decode)(H5FD_t *f, const char *name, const unsigned char *p);
size_t fapl_size;
void * (*fapl_get)(H5FD_t *file);
@@ -279,23 +278,21 @@ typedef struct H5FD_class_t {
size_t dxpl_size;
void * (*dxpl_copy)(const void *dxpl);
herr_t (*dxpl_free)(void *dxpl);
- H5FD_t *(*open)(const char *name, unsigned flags, hid_t fapl,
- haddr_t maxaddr);
+ H5FD_t *(*open)(const char *name, unsigned flags, hid_t fapl, haddr_t maxaddr);
herr_t (*close)(H5FD_t *file);
int (*cmp)(const H5FD_t *f1, const H5FD_t *f2);
herr_t (*query)(const H5FD_t *f1, unsigned long *flags);
herr_t (*get_type_map)(const H5FD_t *file, H5FD_mem_t *type_map);
haddr_t (*alloc)(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, hsize_t size);
- herr_t (*free)(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id,
- haddr_t addr, hsize_t size);
+ herr_t (*free)(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, hsize_t size);
haddr_t (*get_eoa)(const H5FD_t *file, H5FD_mem_t type);
herr_t (*set_eoa)(H5FD_t *file, H5FD_mem_t type, haddr_t addr);
haddr_t (*get_eof)(const H5FD_t *file, H5FD_mem_t type);
herr_t (*get_handle)(H5FD_t *file, hid_t fapl, void**file_handle);
- herr_t (*read)(H5FD_t *file, H5FD_mem_t type, hid_t dxpl,
- haddr_t addr, size_t size, void *buffer);
- herr_t (*write)(H5FD_t *file, H5FD_mem_t type, hid_t dxpl,
- haddr_t addr, size_t size, const void *buffer);
+ herr_t (*read)(H5FD_t *file, H5FD_mem_t type, hid_t dxpl, haddr_t addr, size_t size, void *buffer);
+ herr_t (*write)(H5FD_t *file, H5FD_mem_t type, hid_t dxpl, haddr_t addr, size_t size, const void *buffer);
+ herr_t (*select_read)(H5FD_t *file, H5FD_mem_t type, hid_t dxpl, hid_t file_space, hid_t mem_space, size_t elmt_size, haddr_t addr, void *buffer);
+ herr_t (*select_write)(H5FD_t *file, H5FD_mem_t type, hid_t dxpl, hid_t file_space, hid_t mem_space, size_t elmt_size, haddr_t addr, const void *buffer);
herr_t (*flush)(H5FD_t *file, hid_t dxpl_id, hbool_t closing);
herr_t (*truncate)(H5FD_t *file, hid_t dxpl_id, hbool_t closing);
herr_t (*lock)(H5FD_t *file, hbool_t rw);
@@ -332,7 +329,7 @@ struct H5FD_t {
/* Define enum for the source of file image callbacks */
typedef enum {
H5FD_FILE_IMAGE_OP_NO_OP,
- H5FD_FILE_IMAGE_OP_PROPERTY_LIST_SET,
+ H5FD_FILE_IMAGE_OP_PROPERTY_LIST_SET,
H5FD_FILE_IMAGE_OP_PROPERTY_LIST_COPY,
H5FD_FILE_IMAGE_OP_PROPERTY_LIST_GET,
H5FD_FILE_IMAGE_OP_PROPERTY_LIST_CLOSE,
@@ -343,13 +340,13 @@ typedef enum {
/* Define structure to hold file image callbacks */
typedef struct {
- void *(*image_malloc)(size_t size, H5FD_file_image_op_t file_image_op,
+ void *(*image_malloc)(size_t size, H5FD_file_image_op_t file_image_op,
void *udata);
void *(*image_memcpy)(void *dest, const void *src, size_t size,
H5FD_file_image_op_t file_image_op, void *udata);
- void *(*image_realloc)(void *ptr, size_t size,
+ void *(*image_realloc)(void *ptr, size_t size,
H5FD_file_image_op_t file_image_op, void *udata);
- herr_t (*image_free)(void *ptr, H5FD_file_image_op_t file_image_op,
+ herr_t (*image_free)(void *ptr, H5FD_file_image_op_t file_image_op,
void *udata);
void *(*udata_copy)(void *udata);
herr_t (*udata_free)(void *udata);
@@ -391,4 +388,3 @@ H5_DLL herr_t H5FDdriver_query(hid_t driver_id, unsigned long *flags/*out*/);
}
#endif
#endif
-
diff --git a/src/H5FDsec2.c b/src/H5FDsec2.c
index 06c008d..c380d11 100644
--- a/src/H5FDsec2.c
+++ b/src/H5FDsec2.c
@@ -167,6 +167,8 @@ static const H5FD_class_t H5FD_sec2_g = {
H5FD_sec2_get_handle, /* get_handle */
H5FD_sec2_read, /* read */
H5FD_sec2_write, /* write */
+ NULL, /* select_read */
+ NULL, /* select_write */
NULL, /* flush */
H5FD_sec2_truncate, /* truncate */
H5FD_sec2_lock, /* lock */
diff --git a/src/H5FDstdio.c b/src/H5FDstdio.c
index 861c6a6..0adf470 100644
--- a/src/H5FDstdio.c
+++ b/src/H5FDstdio.c
@@ -205,6 +205,8 @@ static const H5FD_class_t H5FD_stdio_g = {
H5FD_stdio_get_handle, /* get_handle */
H5FD_stdio_read, /* read */
H5FD_stdio_write, /* write */
+ NULL, /* select_read */
+ NULL, /* select_write */
H5FD_stdio_flush, /* flush */
H5FD_stdio_truncate, /* truncate */
H5FD_stdio_lock, /* lock */
diff --git a/src/H5Fio.c b/src/H5Fio.c
index 69e6bb2..297c9d5 100644
--- a/src/H5Fio.c
+++ b/src/H5Fio.c
@@ -38,6 +38,7 @@
#include "H5FDprivate.h" /* File drivers */
#include "H5Iprivate.h" /* IDs */
#include "H5PBprivate.h" /* Page Buffer */
+#include "H5Sprivate.h" /* Dataspaces */
/****************/
@@ -166,11 +167,133 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5F_block_write() */
-
+/*-------------------------------------------------------------------------
+ * Function: H5F_select_write
+ *
+ * Purpose: Writes selected data from memory to a file/server/etc.
+ * The address is relative to the base address for the file.
+ * The file and memory selection determine the elements to write
+ * to the file, and the elmt_size is the size of each element
+ * in bytes. The address in the file and the buffer in memory
+ * are assumed to point at the multidimensional array in the
+ * file and memory spaces, respectively.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Quincey Koziol
+ * koziol@lbl.gov
+ * Nov 8 2017
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_select_write(const H5F_t *f, H5FD_mem_t type, hid_t file_space,
+ hid_t mem_space, size_t elmt_size, haddr_t addr, const void *buf)
+{
+ H5FD_mem_t map_type; /* Mapped memory type */
+ H5S_t *fspace; /* File dataspace */
+ hsize_t npoints; /* # of elements in dataspace */
+ hssize_t snpoints; /* # of elements in dataspace (signed) */
+ H5FD_t *file_ptr; /* H5FD file pointer */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* Sanity checks */
+ HDassert(f);
+ HDassert(f->shared);
+ HDassert(H5F_INTENT(f) & H5F_ACC_RDWR);
+ HDassert(H5F_addr_defined(addr));
+ HDassert(buf);
+
+ /* Check for attempting I/O on 'temporary' file address */
+ if(NULL == (fspace = (H5S_t *)H5I_object_verify(file_space, H5I_DATASPACE)))
+ HGOTO_ERROR(H5E_IO, H5E_BADTYPE, FAIL, "file dataspace ID not valid")
+ if (!(fspace==NULL)){
+ if((snpoints = H5S_GET_EXTENT_NPOINTS(fspace)) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "can't retrieve # of elements for file dataspace")
+ npoints = (hsize_t)snpoints;
+ if(H5F_addr_le(f->shared->tmp_addr, (addr + (npoints * elmt_size))))
+ HGOTO_ERROR(H5E_IO, H5E_BADRANGE, FAIL, "attempting I/O in temporary file space")
+ }
+
+ /* Treat global heap as raw data */
+ map_type = (type == H5FD_MEM_GHEAP) ? H5FD_MEM_DRAW : type;
+ file_ptr = (f->shared->lf);
+
+ /* Pass I/O down to next layer */
+ if(H5FD_select_write(file_ptr, map_type, file_space, mem_space, elmt_size, addr, buf) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "selection write failed")
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5F_select_write() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5F_select_read
+ *
+ * Purpose: Reads selected data from a file/server/etc into a buffer.
+ * The address is relative to the base address for the file.
+ * The file and memory selection determine the elements to read
+ * from the file, and the elmt_size is the size of each element
+ * in bytes. The address in the file and the buffer in memory
+ * are assumed to point at the multidimensional array in the
+ * file and memory spaces, respectively.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Quincey Koziol
+ * koziol@lbl.gov
+ * Nov 8 2017
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_select_read(const H5F_t *f, H5FD_mem_t type, hid_t file_space,
+ hid_t mem_space, size_t elmt_size, haddr_t addr, void *buf /*out*/)
+{
+ H5FD_mem_t map_type; /* Mapped memory type */
+ H5S_t *fspace; /* File dataspace */
+ hsize_t npoints; /* # of elements in dataspace */
+ hssize_t snpoints; /* # of elements in dataspace (signed) */
+ H5FD_t *file_ptr; /* H5FD file pointer */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* Sanity checks */
+ HDassert(f);
+ HDassert(f->shared);
+ HDassert(H5F_addr_defined(addr));
+ HDassert(buf);
+
+ /* Check for attempting I/O on 'temporary' file address */
+ if(NULL == (fspace = (H5S_t *)H5I_object_verify(file_space, H5I_DATASPACE)))
+ HGOTO_ERROR(H5E_IO, H5E_BADTYPE, FAIL, "file dataspace ID not valid")
+ if (!(fspace==NULL)){
+ if((snpoints = H5S_GET_EXTENT_NPOINTS(fspace)) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "can't retrieve # of elements for file dataspace")
+ npoints = (hsize_t)snpoints;
+ if(H5F_addr_le(f->shared->tmp_addr, (addr + (npoints * elmt_size))))
+ HGOTO_ERROR(H5E_IO, H5E_BADRANGE, FAIL, "attempting I/O in temporary file space")
+ }
+
+ /* Treat global heap as raw data */
+ map_type = (type == H5FD_MEM_GHEAP) ? H5FD_MEM_DRAW : type;
+ file_ptr = (f->shared->lf);
+
+ /* Pass I/O down to next layer */
+ if(H5FD_select_read(file_ptr, map_type, file_space, mem_space, elmt_size, addr, buf) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "selection read failed")
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5F_select_read() */
+
/*-------------------------------------------------------------------------
* Function: H5F_flush_tagged_metadata
*
- * Purpose: Flushes metadata with specified tag in the metadata cache
+ * Purpose: Flushes metadata with specified tag in the metadata cache
* to disk.
*
* Return: Non-negative on success/Negative on failure
@@ -327,4 +450,3 @@ H5F_get_checksums(const uint8_t *buf, size_t buf_size, uint32_t *s_chksum/*out*/
FUNC_LEAVE_NOAPI(SUCCEED)
} /* end H5F_get_chksums() */
-
diff --git a/src/H5Fprivate.h b/src/H5Fprivate.h
index 83513a5..b6878b2 100644
--- a/src/H5Fprivate.h
+++ b/src/H5Fprivate.h
@@ -479,11 +479,11 @@ typedef struct H5F_t H5F_t;
#define H5F_ACS_SDATA_BLOCK_SIZE_NAME "sdata_block_size" /* Minimum "small data" allocation block size (when aggregating "small" raw data allocations) */
#define H5F_ACS_GARBG_COLCT_REF_NAME "gc_ref" /* Garbage-collect references */
#define H5F_ACS_FILE_DRV_NAME "vfd_info" /* File driver ID & info */
-#define H5F_ACS_VOL_CONN_NAME "vol_connector_info" /* VOL connector ID & info */
+#define H5F_ACS_VOL_DRV_NAME "vol_driver_info" /* VOL driver ID & info */
#define H5F_ACS_CLOSE_DEGREE_NAME "close_degree" /* File close degree */
#define H5F_ACS_FAMILY_OFFSET_NAME "family_offset" /* Offset position in file for family file driver */
#define H5F_ACS_FAMILY_NEWSIZE_NAME "family_newsize" /* New member size of family driver. (private property only used by h5repart) */
-#define H5F_ACS_FAMILY_TO_SINGLE_NAME "family_to_single" /* Whether to convert family to a single-file driver. (private property only used by h5repart) */
+#define H5F_ACS_FAMILY_TO_SEC2_NAME "family_to_sec2" /* Whether to convert family to sec2 driver. (private property only used by h5repart) */
#define H5F_ACS_MULTI_TYPE_NAME "multi_type" /* Data type in multi file driver */
#define H5F_ACS_LIBVER_LOW_BOUND_NAME "libver_low_bound" /* 'low' bound of library format versions */
#define H5F_ACS_LIBVER_HIGH_BOUND_NAME "libver_high_bound" /* 'high' bound of library format versions */
@@ -716,7 +716,6 @@ typedef enum H5F_prefix_open_t {
/* Private functions */
H5_DLL H5F_t *H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id);
H5_DLL herr_t H5F_try_close(H5F_t *f, hbool_t *was_closed/*out*/);
-H5_DLL hid_t H5F_get_file_id(hid_t obj_id, H5I_type_t id_type);
/* Functions that retrieve values from the file struct */
H5_DLL H5F_libver_t H5F_get_low_bound(const H5F_t *f);
@@ -735,7 +734,7 @@ H5_DLL H5F_t *H5F_get_parent(const H5F_t *f);
H5_DLL unsigned H5F_get_nmounts(const H5F_t *f);
H5_DLL unsigned H5F_get_read_attempts(const H5F_t *f);
H5_DLL hid_t H5F_get_access_plist(H5F_t *f, hbool_t app_ref);
-H5_DLL hid_t H5F_get_id(H5F_t *file);
+H5_DLL hid_t H5F_get_id(H5F_t *file, hbool_t app_ref);
H5_DLL herr_t H5F_get_obj_count(const H5F_t *f, unsigned types, hbool_t app_ref, size_t *obj_id_count_ptr);
H5_DLL herr_t H5F_get_obj_ids(const H5F_t *f, unsigned types, size_t max_objs, hid_t *oid_list, hbool_t app_ref, size_t *obj_id_count_ptr);
H5_DLL hsize_t H5F_get_pgend_meta_thres(const H5F_t *f);
@@ -797,6 +796,14 @@ H5_DLL herr_t H5F_flush_mounts(H5F_t *f);
H5_DLL herr_t H5F_block_read(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, void *buf/*out*/);
H5_DLL herr_t H5F_block_write(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, const void *buf);
+/* Functions that operate on selections of elements wrt super block */
+H5_DLL herr_t H5F_select_read(const H5F_t *f, H5FD_mem_t type,
+ hid_t file_space, hid_t mem_space, size_t elmt_size,
+ haddr_t addr, void *buf/*out*/);
+H5_DLL herr_t H5F_select_write(const H5F_t *f, H5FD_mem_t type,
+ hid_t file_space, hid_t mem_space, size_t elmt_size,
+ haddr_t addr, const void *buf);
+
/* Functions that flush or evict */
H5_DLL herr_t H5F_flush_tagged_metadata(H5F_t *f, haddr_t tag);
H5_DLL herr_t H5F_evict_tagged_metadata(H5F_t *f, haddr_t tag);
@@ -856,4 +863,3 @@ H5_DLL herr_t H5F_cwfs_remove_heap(H5F_file_t *shared, struct H5HG_heap_t *heap)
H5_DLL herr_t H5F_debug(H5F_t *f, FILE * stream, int indent, int fwidth);
#endif /* _H5Fprivate_H */
-
diff --git a/src/H5Sall.c b/src/H5Sall.c
index 3b77b98..45a8846 100644
--- a/src/H5Sall.c
+++ b/src/H5Sall.c
@@ -31,9 +31,9 @@
/* Selection callbacks */
static herr_t H5S__all_copy(H5S_t *dst, const H5S_t *src, hbool_t share_selection);
-static herr_t H5S__all_get_seq_list(const H5S_t *space, unsigned flags,
- H5S_sel_iter_t *iter, size_t maxseq, size_t maxbytes,
- size_t *nseq, size_t *nbytes, hsize_t *off, size_t *len);
+//static herr_t H5S__all_get_seq_list(const H5S_t *space, unsigned flags,
+// H5S_sel_iter_t *iter, size_t maxseq, size_t maxbytes,
+// size_t *nseq, size_t *nbytes, hsize_t *off, size_t *len);
static herr_t H5S__all_release(H5S_t *space);
static htri_t H5S__all_is_valid(const H5S_t *space);
static hssize_t H5S__all_serial_size(const H5S_t *space);
@@ -987,7 +987,7 @@ done:
EXAMPLES
REVISION LOG
--------------------------------------------------------------------------*/
-static herr_t
+herr_t
H5S__all_get_seq_list(const H5S_t H5_ATTR_UNUSED *space, unsigned H5_ATTR_UNUSED flags, H5S_sel_iter_t *iter,
size_t H5_ATTR_UNUSED maxseq, size_t maxelem, size_t *nseq, size_t *nelem,
hsize_t *off, size_t *len)
@@ -1028,4 +1028,3 @@ H5S__all_get_seq_list(const H5S_t H5_ATTR_UNUSED *space, unsigned H5_ATTR_UNUSED
FUNC_LEAVE_NOAPI(SUCCEED)
} /* end H5S__all_get_seq_list() */
-
diff --git a/src/H5Shyper.c b/src/H5Shyper.c
index ed3fa45..c989083 100644
--- a/src/H5Shyper.c
+++ b/src/H5Shyper.c
@@ -75,9 +75,9 @@ static hsize_t H5S__hyper_get_clip_extent_real(const H5S_t *clip_space,
/* Selection callbacks */
static herr_t H5S__hyper_copy(H5S_t *dst, const H5S_t *src, hbool_t share_selection);
-static herr_t H5S__hyper_get_seq_list(const H5S_t *space, unsigned flags,
- H5S_sel_iter_t *iter, size_t maxseq, size_t maxbytes,
- size_t *nseq, size_t *nbytes, hsize_t *off, size_t *len);
+//static herr_t H5S__hyper_get_seq_list(const H5S_t *space, unsigned flags,
+// H5S_sel_iter_t *iter, size_t maxseq, size_t maxbytes,
+// size_t *nseq, size_t *nbytes, hsize_t *off, size_t *len);
static herr_t H5S__hyper_release(H5S_t *space);
static htri_t H5S__hyper_is_valid(const H5S_t *space);
static hssize_t H5S__hyper_serial_size(const H5S_t *space);
@@ -4522,22 +4522,22 @@ H5S__hyper_project_simple(const H5S_t *base_space, H5S_t *new_space, hsize_t *of
/* Copy the diminfo */
while(base_space_dim < base_space->extent.rank) {
- new_space->select.sel_info.hslab->app_diminfo[new_space_dim].start =
+ new_space->select.sel_info.hslab->app_diminfo[new_space_dim].start =
base_space->select.sel_info.hslab->app_diminfo[base_space_dim].start;
- new_space->select.sel_info.hslab->app_diminfo[new_space_dim].stride =
+ new_space->select.sel_info.hslab->app_diminfo[new_space_dim].stride =
base_space->select.sel_info.hslab->app_diminfo[base_space_dim].stride;
- new_space->select.sel_info.hslab->app_diminfo[new_space_dim].count =
+ new_space->select.sel_info.hslab->app_diminfo[new_space_dim].count =
base_space->select.sel_info.hslab->app_diminfo[base_space_dim].count;
- new_space->select.sel_info.hslab->app_diminfo[new_space_dim].block =
+ new_space->select.sel_info.hslab->app_diminfo[new_space_dim].block =
base_space->select.sel_info.hslab->app_diminfo[base_space_dim].block;
- new_space->select.sel_info.hslab->opt_diminfo[new_space_dim].start =
+ new_space->select.sel_info.hslab->opt_diminfo[new_space_dim].start =
base_space->select.sel_info.hslab->opt_diminfo[base_space_dim].start;
new_space->select.sel_info.hslab->opt_diminfo[new_space_dim].stride =
base_space->select.sel_info.hslab->opt_diminfo[base_space_dim].stride;
- new_space->select.sel_info.hslab->opt_diminfo[new_space_dim].count =
+ new_space->select.sel_info.hslab->opt_diminfo[new_space_dim].count =
base_space->select.sel_info.hslab->opt_diminfo[base_space_dim].count;
- new_space->select.sel_info.hslab->opt_diminfo[new_space_dim].block =
+ new_space->select.sel_info.hslab->opt_diminfo[new_space_dim].block =
base_space->select.sel_info.hslab->opt_diminfo[base_space_dim].block;
/* Advance to next dimensions */
@@ -7224,7 +7224,7 @@ H5S_select_hyperslab (H5S_t *space, H5S_seloper_t op,
/* Check for unlimited dimension */
for(u = 0; u<space->extent.rank; u++)
if((count[u] == H5S_UNLIMITED) || (block[u] == H5S_UNLIMITED)) {
- if(unlim_dim >= 0)
+ if(unlim_dim >= 0)
HGOTO_ERROR(H5E_DATASPACE, H5E_UNSUPPORTED, FAIL, "cannot have more than one unlimited dimension in selection")
else {
if(count[u] == block[u] /* == H5S_UNLIMITED */)
@@ -9086,7 +9086,7 @@ H5S__hyper_get_seq_list_single(const H5S_t *space, H5S_sel_iter_t *iter,
EXAMPLES
REVISION LOG
--------------------------------------------------------------------------*/
-static herr_t
+herr_t
H5S__hyper_get_seq_list(const H5S_t *space, unsigned H5_ATTR_UNUSED flags, H5S_sel_iter_t *iter,
size_t maxseq, size_t maxelem, size_t *nseq, size_t *nelem,
hsize_t *off, size_t *len)
@@ -9314,7 +9314,7 @@ H5S__hyper_project_intersection(const H5S_t *src_space, const H5S_t *dst_space,
HDassert(dst_space);
HDassert(src_intersect_space);
HDassert(proj_space);
-
+
/* Assert that src_space and src_intersect_space have same extent and there
* are no point selections */
HDassert(H5S_GET_EXTENT_NDIMS(src_space)
@@ -9483,7 +9483,7 @@ H5S__hyper_project_intersection(const H5S_t *src_space, const H5S_t *dst_space,
* selection and advance any sequences we complete */
if(ss_off[ss_i] >= sis_off[sis_i])
int_sel_off = ss_sel_off;
- else
+ else
int_sel_off = sis_off[sis_i] - ss_off[ss_i] + ss_sel_off;
if((ss_off[ss_i] + (hsize_t)ss_len[ss_i]) <= (sis_off[sis_i]
+ (hsize_t)sis_len[sis_i])) {
@@ -10429,4 +10429,3 @@ H5Sget_regular_hyperslab(hid_t spaceid, hsize_t start[], hsize_t stride[],
done:
FUNC_LEAVE_API(ret_value)
} /* H5Sget_regular_hyperslab() */
-
diff --git a/src/H5Smpio.c b/src/H5Smpio.c
index 935d279..78d1b4f 100644
--- a/src/H5Smpio.c
+++ b/src/H5Smpio.c
@@ -41,12 +41,12 @@ static herr_t H5S_mpio_all_type(const H5S_t *space, size_t elmt_size,
MPI_Datatype *new_type, int *count, hbool_t *is_derived_type);
static herr_t H5S_mpio_none_type(MPI_Datatype *new_type, int *count,
hbool_t *is_derived_type);
-static herr_t H5S_mpio_create_point_datatype(size_t elmt_size, hsize_t num_points,
+static herr_t H5S_mpio_create_point_datatype(size_t elmt_size, hsize_t num_points,
MPI_Aint *disp, MPI_Datatype *new_type);
static herr_t H5S_mpio_point_type(const H5S_t *space, size_t elmt_size,
MPI_Datatype *new_type, int *count, hbool_t *is_derived_type,
hbool_t do_permute, hsize_t **permute_map, hbool_t *is_permuted);
-static herr_t H5S_mpio_permute_type(const H5S_t *space, size_t elmt_size,
+static herr_t H5S_mpio_permute_type(const H5S_t *space, size_t elmt_size,
hsize_t **permute_map, MPI_Datatype *new_type, int *count,
hbool_t *is_derived_type);
static herr_t H5S_mpio_hyper_type(const H5S_t *space, size_t elmt_size,
@@ -198,9 +198,9 @@ H5S_mpio_none_type(MPI_Datatype *new_type, int *count, hbool_t *is_derived_type)
*
*-------------------------------------------------------------------------
*/
-static herr_t
+static herr_t
H5S_mpio_create_point_datatype (size_t elmt_size, hsize_t num_points,
- MPI_Aint *disp, MPI_Datatype *new_type)
+ MPI_Aint *disp, MPI_Datatype *new_type)
{
MPI_Datatype elmt_type; /* MPI datatype for individual element */
hbool_t elmt_type_created = FALSE; /* Whether the element MPI datatype was created */
@@ -239,7 +239,7 @@ H5S_mpio_create_point_datatype (size_t elmt_size, hsize_t num_points,
if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(new_type)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
}
- else {
+ else {
/* use LARGE_DATATYPE::
* We'll create an hindexed_block type for every 2G point count and then combine
* those and any remaining points into a single large datatype.
@@ -373,7 +373,7 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5S_mpio_point_type(const H5S_t *space, size_t elmt_size, MPI_Datatype *new_type,
+H5S_mpio_point_type(const H5S_t *space, size_t elmt_size, MPI_Datatype *new_type,
int *count, hbool_t *is_derived_type, hbool_t do_permute, hsize_t **permute,
hbool_t *is_permuted)
{
@@ -410,19 +410,19 @@ H5S_mpio_point_type(const H5S_t *space, size_t elmt_size, MPI_Datatype *new_type
disp[u] = H5VM_array_offset(space->extent.rank, space->extent.size, curr->pnt);
disp[u] *= elmt_size;
- /* This is a File Space used to set the file view, so adjust the displacements
+ /* This is a File Space used to set the file view, so adjust the displacements
* to have them monotonically non-decreasing.
- * Generate the permutation array by indicating at each point being selected,
- * the position it will shifted in the new displacement. Example:
- * Suppose 4 points with corresponding are selected
- * Pt 1: disp=6 ; Pt 2: disp=3 ; Pt 3: disp=0 ; Pt 4: disp=4
+     * Generate the permutation array by indicating, for each point being selected,
+     * the position it will be shifted to in the new displacement. Example:
+     * Suppose 4 points are selected with the corresponding displacements:
+ * Pt 1: disp=6 ; Pt 2: disp=3 ; Pt 3: disp=0 ; Pt 4: disp=4
* The permute map to sort the displacements in order will be:
     * point 1: map[0] = num_points (4 here), indicating that this point is not moved (1st point selected)
- * point 2: map[1] = 0, indicating that this point is moved to the first position,
+ * point 2: map[1] = 0, indicating that this point is moved to the first position,
* since disp_pt1(6) > disp_pt2(3)
- * point 3: map[2] = 0, move to position 0, bec it has the lowest disp between
+     * point 3: map[2] = 0, move to position 0, because it has the lowest disp among
* the points selected so far.
- * point 4: map[3] = 2, move the 2nd position since point 1 has a higher disp,
+     * point 4: map[3] = 2, move to the 2nd position since point 1 has a higher disp,
* but points 2 and 3 have lower displacements.
*/
if(do_permute) {
@@ -447,7 +447,7 @@ H5S_mpio_point_type(const H5S_t *space, size_t elmt_size, MPI_Datatype *new_type
HDmemmove(disp + m + 1, disp + m, (u - m) * sizeof(MPI_Aint));
disp[m] = temp;
} /* end if */
- (*permute)[u] = m;
+ (*permute)[u] = m;
} /* end if */
else
(*permute)[u] = num_points;
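
The comment in the hunk above walks through how the permute map is generated while the file-space displacements are insertion-sorted. A minimal standalone sketch of that bookkeeping follows; build_permute_map and its arguments are illustrative names, not HDF5 API, and only the num_points sentinel convention comes from the code above.

#include <mpi.h>
#include <stddef.h>
#include <string.h>

/* Sketch only: insertion-sort 'disp' while recording, in 'map', where each
 * point moved.  Entries equal to 'num_points' mark points that never moved,
 * matching the (*permute)[u] = num_points convention in the hunk above. */
static void
build_permute_map(MPI_Aint *disp, size_t num_points, size_t *map)
{
    size_t u, m;

    for(u = 0; u < num_points; u++) {
        /* Walk back past every earlier (already sorted) displacement larger than disp[u] */
        m = u;
        while(m > 0 && disp[m - 1] > disp[u])
            m--;

        if(m < u) {
            MPI_Aint temp = disp[u];

            /* Shift the larger displacements up one slot and insert disp[u] at position m */
            memmove(disp + m + 1, disp + m, (u - m) * sizeof(MPI_Aint));
            disp[m] = temp;
            map[u] = m;
        } /* end if */
        else
            map[u] = num_points;    /* Point u was not moved */
    } /* end for */
}

/* For disp = {6, 3, 0, 4} this yields map = {4, 0, 0, 2}, matching the example in the comment above. */
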
@@ -508,7 +508,7 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5S_mpio_permute_type(const H5S_t *space, size_t elmt_size, hsize_t **permute,
+H5S_mpio_permute_type(const H5S_t *space, size_t elmt_size, hsize_t **permute,
MPI_Datatype *new_type, int *count, hbool_t *is_derived_type)
{
MPI_Aint *disp = NULL; /* Datatype displacement for each point*/
@@ -571,12 +571,12 @@ H5S_mpio_permute_type(const H5S_t *space, size_t elmt_size, hsize_t **permute,
/* Set the displacement of the current point */
disp[u] = curr_off;
- /* This is a memory displacement, so for each point selected,
+ /* This is a memory displacement, so for each point selected,
* apply the map that was generated by the file selection */
if((*permute)[u] != num_points) {
MPI_Aint temp = disp[u];
- HDmemmove(disp + (*permute)[u] + 1, disp + (*permute)[u],
+ HDmemmove(disp + (*permute)[u] + 1, disp + (*permute)[u],
(u - (*permute)[u]) * sizeof(MPI_Aint));
disp[(*permute)[u]] = temp;
} /* end if */
@@ -795,7 +795,7 @@ H5S_mpio_hyper_type(const H5S_t *space, size_t elmt_size,
#endif
/* LARGE_DATATYPE::
- * Check if the number of elements to form the inner type fits into a 32 bit integer.
+ * Check if the number of elements to form the inner type fits into a 32 bit integer.
* If yes then just create the innertype with MPI_Type_contiguous.
* Otherwise create a compound datatype by iterating as many times as needed
* for the innertype to be created.
@@ -848,8 +848,8 @@ H5S_mpio_hyper_type(const H5S_t *space, size_t elmt_size,
HMPI_GOTO_ERROR(FAIL, "couldn't create MPI vector type", mpi_code)
}
else {
- /* Things get a bit more complicated and require LARGE_DATATYPE processing
- * There are two MPI datatypes that need to be created:
+ /* Things get a bit more complicated and require LARGE_DATATYPE processing
+ * There are two MPI datatypes that need to be created:
* 1) an internal contiguous block; and
* 2) a collection of elements where an element is a contiguous block(1).
* Remember that the input arguments to the MPI-IO functions use integer
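
Both LARGE_DATATYPE comments above describe the same idea: when an element or block count no longer fits in a 32-bit int, the MPI datatype is assembled from 32-bit-safe pieces and then combined. Below is a hedged sketch of that construction, assuming a hypothetical helper name and chunk size; it is not the H5S_mpio_create_large_type routine defined later in this file, and error checking is omitted for brevity.

#include <mpi.h>

/* Sketch only: describe 'nelem' contiguous elements of 'elmt_type' when
 * 'nelem' may exceed INT_MAX, by combining a body of full 2^30-element
 * chunks with a remainder block.  'nelem / CHUNK_ELEMS' is assumed to
 * fit in an int. */
#define CHUNK_ELEMS ((unsigned long long)1 << 30)

static int
make_large_contig_type(unsigned long long nelem, MPI_Datatype elmt_type, MPI_Datatype *out)
{
    MPI_Datatype chunk_type, body_type, tail_type;
    MPI_Datatype types[2];
    int          blocklens[2] = {1, 1};
    MPI_Aint     lb, extent, displs[2];
    int          nchunks = (int)(nelem / CHUNK_ELEMS);
    int          rem     = (int)(nelem % CHUNK_ELEMS);

    /* Inner chunk of CHUNK_ELEMS elements, then 'nchunks' of them back to back */
    MPI_Type_contiguous((int)CHUNK_ELEMS, elmt_type, &chunk_type);
    MPI_Type_contiguous(nchunks, chunk_type, &body_type);

    /* Remainder elements, placed immediately after the body */
    MPI_Type_contiguous(rem, elmt_type, &tail_type);
    MPI_Type_get_extent(elmt_type, &lb, &extent);
    types[0]  = body_type;
    types[1]  = tail_type;
    displs[0] = 0;
    displs[1] = (MPI_Aint)nchunks * (MPI_Aint)CHUNK_ELEMS * extent;
    MPI_Type_create_struct(2, blocklens, displs, types, out);
    MPI_Type_commit(out);

    /* The committed type keeps its own references; the building blocks can be freed */
    MPI_Type_free(&chunk_type);
    MPI_Type_free(&body_type);
    MPI_Type_free(&tail_type);
    return 0;
}
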
@@ -863,30 +863,23 @@ H5S_mpio_hyper_type(const H5S_t *space, size_t elmt_size,
MPI_Datatype block_type;
/* create a contiguous datatype inner_type x number of BLOCKS.
- * Again we need to check that the number of BLOCKS can fit into
+ * Again we need to check that the number of BLOCKS can fit into
* a 32 bit integer */
if (bigio_count < d[i].block) {
- if (H5S_mpio_create_large_type(d[i].block, 0, inner_type,
+ if (H5S_mpio_create_large_type(d[i].block, 0, inner_type,
&block_type) < 0) {
HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,
"couldn't ccreate a large block datatype in hyper selection")
}
}
else {
- if(MPI_SUCCESS != (mpi_code = MPI_Type_contiguous((int)d[i].block,
- inner_type,
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_contiguous((int)d[i].block,
+ inner_type,
&block_type)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code)
}
- /* As of version 4.0, OpenMPI now turns off MPI-1 API calls by default,
- * so we're using the MPI-2 version even though we don't need the lb
- * value.
- */
- {
- MPI_Aint unused_lb_arg;
- MPI_Type_get_extent(inner_type, &unused_lb_arg, &inner_extent);
- }
+ MPI_Type_extent (inner_type, &inner_extent);
stride_in_bytes = inner_extent * (MPI_Aint)d[i].strid;
/* If the element count is larger than what a 32 bit integer can hold,
@@ -979,7 +972,42 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5S_mpio_hyper_type() */
-
+
+/* This function allows the rank and total number of elements in the extent of the space
+ * to be accessed from the H5S_t structure by modules (like the file drivers) outside of
+ * the dataspace module.
+ */
+herr_t
+H5S_mpio_return_space_rank_and_extent(const H5S_t *space, unsigned *rank, hsize_t *extent)
+{
+    unsigned u;                 /* Local index variable */
+
+    *rank = space->extent.rank;
+    *extent = 1;
+    for(u = 0; u < *rank; u++)
+        *extent *= space->extent.size[u];
+
+    return SUCCEED;
+}
+
+/* This function allows the extent type and selection type of the space to be retrieved
+ * from the H5S_t structure by modules (like the file drivers) outside of the dataspace
+ * module.  Note: *is_permuted is not modified by this routine.
+ */
+herr_t
+H5S_mpio_return_space_extent_and_select_type(const H5S_t *space, hbool_t H5_ATTR_UNUSED *is_permuted,
+    hbool_t *is_regular, H5S_class_t *space_extent_type, H5S_sel_type *space_sel_type)
+{
+    FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+    *space_extent_type = H5S_GET_EXTENT_TYPE(space);
+    *space_sel_type = H5S_GET_SELECT_TYPE(space);
+    *is_regular = H5S_SELECT_IS_REGULAR(space);
+
+    FUNC_LEAVE_NOAPI(SUCCEED)
+}
+
+
/*-------------------------------------------------------------------------
* Function: H5S_mpio_span_hyper_type
*
@@ -997,7 +1025,7 @@ done:
*
* Modifications:
* Mohamad Chaarawi
- * Adding support for large datatypes (beyond the limit of a
+ * Adding support for large datatypes (beyond the limit of a
 * 32 bit integer).
*-------------------------------------------------------------------------
*/
@@ -1156,8 +1184,8 @@ H5S_obtain_datatype(const hsize_t *down, H5S_hyper_span_t *span,
}
}
else {
- if(MPI_SUCCESS != (mpi_code = MPI_Type_contiguous((int)blocklen[i],
- *elmt_type,
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_contiguous((int)blocklen[i],
+ *elmt_type,
&temp_type)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code)
}
@@ -1181,11 +1209,11 @@ H5S_obtain_datatype(const hsize_t *down, H5S_hyper_span_t *span,
*span_type = outer_type;
}
- if (outer_type != MPI_DATATYPE_NULL)
+ if (outer_type != MPI_DATATYPE_NULL)
MPI_Type_free(&outer_type);
/* temp_type shouldn't be freed here...
* Note that we have simply copied it above (not MPI_Type_dup)
- * into the 'span_type' argument of the caller.
+ * into the 'span_type' argument of the caller.
* The caller needs to deal with it there!
*/
}
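
The note above about not freeing temp_type is an MPI handle-ownership rule: plain assignment copies only the handle, so exactly one side should eventually call MPI_Type_free on it, whereas MPI_Type_dup creates an independent datatype. A small hedged illustration follows; handle_ownership_demo is not HDF5 code.

#include <mpi.h>

/* Sketch only: contrast duplicating a datatype (free it locally) with sharing
 * a handle by assignment (the receiving side must do the eventual free). */
static void
handle_ownership_demo(MPI_Datatype inner, MPI_Datatype *span_type_out)
{
    MPI_Datatype private_copy;

    /* MPI_Type_dup creates an independent datatype: free it here when done */
    MPI_Type_dup(inner, &private_copy);
    MPI_Type_free(&private_copy);       /* safe: does not invalidate 'inner' */

    /* Plain assignment shares the same datatype object: do NOT also free
     * 'inner' here, or the caller's copy of the handle becomes invalid. */
    *span_type_out = inner;
}
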
@@ -1312,7 +1340,7 @@ done:
*-------------------------------------------------------------------------
*/
herr_t
-H5S_mpio_space_type(const H5S_t *space, size_t elmt_size, MPI_Datatype *new_type,
+H5S_mpio_space_type(const H5S_t *space, size_t elmt_size, MPI_Datatype *new_type,
int *count, hbool_t *is_derived_type, hbool_t do_permute, hsize_t **permute_map,
hbool_t *is_permuted)
{
@@ -1333,7 +1361,7 @@ H5S_mpio_space_type(const H5S_t *space, size_t elmt_size, MPI_Datatype *new_type
* out-of-order point selection, then permute this selection which
* should be a memory selection to match the file space permutation.
*/
- if(TRUE == *is_permuted) {
+ if(TRUE == *is_permuted) {
switch(H5S_GET_SELECT_TYPE(space)) {
case H5S_SEL_NONE:
if(H5S_mpio_none_type(new_type, count, is_derived_type) < 0)
@@ -1409,7 +1437,7 @@ done:
/*-------------------------------------------------------------------------
* Function: H5S_mpio_create_large_type
*
- * Purpose: Create a large datatype of size larger than what a 32 bit integer
+ * Purpose: Create a large datatype of size larger than what a 32 bit integer
* can hold.
*
* Return: non-negative on success, negative on failure.
@@ -1507,14 +1535,7 @@ static herr_t H5S_mpio_create_large_type (hsize_t num_elements,
}
}
- /* As of version 4.0, OpenMPI now turns off MPI-1 API calls by default,
- * so we're using the MPI-2 version even though we don't need the lb
- * value.
- */
- {
- MPI_Aint unused_lb_arg;
- MPI_Type_get_extent(old_type, &unused_lb_arg, &old_extent);
- }
+ MPI_Type_extent (old_type, &old_extent);
/* Set up the arguments for MPI_Type_struct constructor */
type[0] = outer_type;
@@ -1546,4 +1567,3 @@ done:
} /* end H5S_mpio_create_large_type */
#endif /* H5_HAVE_PARALLEL */
-
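
The two H5S_mpio_return_* helpers added to H5Smpio.c above let code outside the dataspace module (the MPIO VFD in particular) query a space's rank, total element count, extent class, and selection class without reaching into H5S_t directly. A hedged sketch of a driver-side caller follows; the name H5FD__ccio_can_flatten and its decision logic are assumptions for illustration, and only the two helper signatures come from this patch.

/* Sketch only; assumes the usual HDF5 private headers (H5private.h, H5Sprivate.h)
 * are already included. */
static herr_t
H5FD__ccio_can_flatten(const H5S_t *file_space, hbool_t *use_flatbuf)
{
    unsigned     rank;                    /* Dataspace rank */
    hsize_t      nelem;                   /* Total elements in the extent */
    hbool_t      is_permuted = FALSE;     /* Currently not set by the helper */
    hbool_t      is_regular = FALSE;      /* Regular hyperslab selection? */
    H5S_class_t  extent_type;             /* Scalar / simple / null extent */
    H5S_sel_type sel_type;                /* All / hyperslab / point / none */

    /* Rank and total element count of the extent */
    if(H5S_mpio_return_space_rank_and_extent(file_space, &rank, &nelem) < 0)
        return FAIL;

    /* Extent class and selection class, plus regularity of the selection */
    if(H5S_mpio_return_space_extent_and_select_type(file_space, &is_permuted,
            &is_regular, &extent_type, &sel_type) < 0)
        return FAIL;

    /* A simple extent with an 'all' selection or a regular hyperslab is the
     * easy case for building a flattened (offset/length) description. */
    *use_flatbuf = (hbool_t)(extent_type == H5S_SIMPLE && nelem > 0 &&
            (sel_type == H5S_SEL_ALL ||
            (sel_type == H5S_SEL_HYPERSLABS && is_regular)));

    return SUCCEED;
}
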
diff --git a/src/H5Spoint.c b/src/H5Spoint.c
index 9924920..bd03d68 100644
--- a/src/H5Spoint.c
+++ b/src/H5Spoint.c
@@ -33,9 +33,9 @@
/* Selection callbacks */
static herr_t H5S_point_copy(H5S_t *dst, const H5S_t *src, hbool_t share_selection);
-static herr_t H5S_point_get_seq_list(const H5S_t *space, unsigned flags,
- H5S_sel_iter_t *iter, size_t maxseq, size_t maxbytes,
- size_t *nseq, size_t *nbytes, hsize_t *off, size_t *len);
+//static herr_t H5S_point_get_seq_list(const H5S_t *space, unsigned flags,
+// H5S_sel_iter_t *iter, size_t maxseq, size_t maxbytes,
+// size_t *nseq, size_t *nbytes, hsize_t *off, size_t *len);
static herr_t H5S_point_release(H5S_t *space);
static htri_t H5S_point_is_valid(const H5S_t *space);
static hssize_t H5S_point_serial_size(const H5S_t *space);
@@ -461,7 +461,7 @@ done:
/* Release possible linked list of nodes */
while(top) {
- curr = top->next;
+ curr = top->next;
H5MM_xfree(top->pnt);
top = H5FL_FREE(H5S_pnt_node_t, top);
top = curr;
@@ -1433,7 +1433,7 @@ H5S_point_project_scalar(const H5S_t *space, hsize_t *offset)
HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE, FAIL, "point selection of one element has more than one node!")
/* Calculate offset of selection in projected buffer */
- *offset = H5VM_array_offset(space->extent.rank, space->extent.size, node->pnt);
+ *offset = H5VM_array_offset(space->extent.rank, space->extent.size, node->pnt);
done:
FUNC_LEAVE_NOAPI(ret_value)
@@ -1487,7 +1487,7 @@ H5S_point_project_simple(const H5S_t *base_space, H5S_t *new_space, hsize_t *off
/* Calculate offset of selection in projected buffer */
HDmemset(block, 0, sizeof(block));
HDmemcpy(block, base_space->select.sel_info.pnt_lst->head->pnt, sizeof(hsize_t) * rank_diff);
- *offset = H5VM_array_offset(base_space->extent.rank, base_space->extent.size, block);
+ *offset = H5VM_array_offset(base_space->extent.rank, base_space->extent.size, block);
/* Iterate through base space's point nodes, copying the point information */
base_node = base_space->select.sel_info.pnt_lst->head;
@@ -1659,7 +1659,7 @@ done:
EXAMPLES
REVISION LOG
--------------------------------------------------------------------------*/
-static herr_t
+herr_t
H5S_point_get_seq_list(const H5S_t *space, unsigned flags, H5S_sel_iter_t *iter,
size_t maxseq, size_t maxelem, size_t *nseq, size_t *nelem,
hsize_t *off, size_t *len)
@@ -1764,4 +1764,3 @@ H5S_point_get_seq_list(const H5S_t *space, unsigned flags, H5S_sel_iter_t *iter,
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5S_point_get_seq_list() */
-
diff --git a/src/H5Sprivate.h b/src/H5Sprivate.h
index 32ac51a..899a2c9 100644
--- a/src/H5Sprivate.h
+++ b/src/H5Sprivate.h
@@ -314,10 +314,35 @@ H5_DLL herr_t H5S_mpio_space_type(const H5S_t *space, size_t elmt_size,
/* out: */ MPI_Datatype *new_type,
int *count,
hbool_t *is_derived_type,
- hbool_t do_permute,
+ hbool_t do_permute,
hsize_t **permute_map,
hbool_t * is_permuted);
+
+/*
+ * Buffer-flattening ("flatbuf") struct describing a selection as a list of
+ * contiguous blocks, used when building derived MPI datatypes.
+ * All offsets and lengths are in bytes.
+ */
+typedef struct H5S_flatbuf_t {
+    hsize_t count;       /* Number of contiguous blocks */
+    size_t *blocklens;   /* Array of contiguous block lengths (bytes) */
+    hsize_t *indices;    /* Array of byte offsets of each block */
+    hsize_t extent;      /* Offset range covered by one instance of this flatbuf */
+    hsize_t size;        /* Total number of bytes of block data */
+} H5S_flatbuf_t;
+H5_DLL herr_t H5S__hyper_get_seq_list(const H5S_t *space, unsigned flags, H5S_sel_iter_t *iter,
+ size_t maxseq, size_t maxelem, size_t *nseq, size_t *nelem,
+ hsize_t *off, size_t *len);
+H5_DLL herr_t H5S__all_get_seq_list(const H5S_t *space, unsigned flags,
+ H5S_sel_iter_t *iter, size_t maxseq, size_t maxbytes,
+ size_t *nseq, size_t *nbytes, hsize_t *off, size_t *len);
+H5_DLL herr_t H5S_point_get_seq_list(const H5S_t *space, unsigned flags, H5S_sel_iter_t *iter,
+ size_t maxseq, size_t maxelem, size_t *nseq, size_t *nelem,
+ hsize_t *off, size_t *len);
+
+H5_DLL herr_t H5S_mpio_return_space_rank_and_extent(const H5S_t *space, unsigned *rank, hsize_t *extent);
+
+H5_DLL herr_t H5S_mpio_return_space_extent_and_select_type(const H5S_t *space, hbool_t *is_permuted,
+    hbool_t *is_regular, H5S_class_t *space_extent_type, H5S_sel_type *space_sel_type);
+
#endif /* H5_HAVE_PARALLEL */
#endif /* _H5Sprivate_H */
-
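
The H5S_flatbuf_t structure added to H5Sprivate.h above is the flattened (offset/length) description that the CCIO paths consume, and the *_get_seq_list routines exported in the same header are what produce those offset/length pairs. Below is a hedged sketch of filling a flatbuf from a dataspace selection; fill_flatbuf_from_selection, FLAT_NSEQ, and the assumption that the caller pre-allocated the arrays are illustrative, not part of this patch.

/* Sketch only; assumes the usual HDF5 private headers (H5private.h, H5Sprivate.h)
 * are already included and that flatbuf->indices / flatbuf->blocklens are large
 * enough to hold every sequence in the selection. */
#define FLAT_NSEQ 256                       /* Sequences fetched per iteration (illustrative) */

static herr_t
fill_flatbuf_from_selection(const H5S_t *space, size_t elmt_size, H5S_flatbuf_t *flatbuf)
{
    H5S_sel_iter_t iter;                    /* Selection iterator */
    hsize_t  off[FLAT_NSEQ];                /* Byte offsets of each sequence */
    size_t   len[FLAT_NSEQ];                /* Byte lengths of each sequence */
    size_t   nseq, nelem, u;                /* Sequences/elements returned, loop index */
    hsize_t  nelmts_left;                   /* Elements not yet flattened */
    herr_t   ret_value = SUCCEED;           /* Return value */

    if(H5S_select_iter_init(&iter, space, elmt_size) < 0)
        return FAIL;
    nelmts_left = (hsize_t)H5S_GET_SELECT_NPOINTS(space);

    flatbuf->count = 0;
    flatbuf->size = 0;
    while(nelmts_left > 0 && ret_value == SUCCEED) {
        /* Next batch of contiguous (offset, length) sequences, in bytes */
        if(H5S_SELECT_GET_SEQ_LIST(space, 0, &iter, (size_t)FLAT_NSEQ,
                (size_t)nelmts_left, &nseq, &nelem, off, len) < 0)
            ret_value = FAIL;
        else {
            for(u = 0; u < nseq; u++) {
                flatbuf->indices[flatbuf->count]   = off[u];
                flatbuf->blocklens[flatbuf->count] = len[u];
                flatbuf->size += len[u];
                flatbuf->count++;
            } /* end for */
            nelmts_left -= nelem;
        } /* end else */
    } /* end while */

    H5S_SELECT_ITER_RELEASE(&iter);
    return ret_value;
}
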