summary | refs | log | tree | commit | diff | stats
path: root/src/H5FDsubfiling/H5FDsubfile_int.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/H5FDsubfiling/H5FDsubfile_int.c')
-rw-r--r-- src/H5FDsubfiling/H5FDsubfile_int.c 186
1 files changed, 96 insertions, 90 deletions
diff --git a/src/H5FDsubfiling/H5FDsubfile_int.c b/src/H5FDsubfiling/H5FDsubfile_int.c
index 22a5bd0..c089509 100644
--- a/src/H5FDsubfiling/H5FDsubfile_int.c
+++ b/src/H5FDsubfiling/H5FDsubfile_int.c
@@ -30,11 +30,11 @@
* Note: This code should be moved -- most likely to the IOC
* code files.
*
- * Purpose: Apply a truncate operation to the sub-files.
+ * Purpose: Apply a truncate operation to the subfiles.
*
* In the context of the I/O concentrators, the eof must be
* translated into the appropriate value for each of the
- * sub-files, and then applied to same.
+ * subfiles, and then applied to same.
*
* Further, we must ensure that all prior I/O requests complete
* before the truncate is applied.
@@ -44,7 +44,7 @@
* 1) Run a barrier on entry.
*
* 2) Determine if this rank is a IOC. If it is, compute
- * the correct EOF for this sub-file, and send a truncate
+ * the correct EOF for this subfile, and send a truncate
* request to the IOC.
*
* 3) On the IOC thread, allow all pending I/O requests
@@ -72,50 +72,61 @@
herr_t
H5FD__subfiling__truncate_sub_files(hid_t context_id, int64_t logical_file_eof, MPI_Comm comm)
{
- int mpi_code; /* MPI return code */
subfiling_context_t *sf_context = NULL;
- int64_t msg[3] = {
- 0,
- };
- herr_t ret_value = SUCCEED; /* Return value */
+ int64_t msg[3] = {0};
+ int mpi_size;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
/* Barrier on entry */
- if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+ if (mpi_size > 1)
+ if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
if (NULL == (sf_context = (subfiling_context_t *)H5_get_subfiling_object(context_id)))
H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "can't get subfile context");
- /* Test to see if this rank is running an I/O concentrator. */
-
if (sf_context->topology->rank_is_ioc) {
-
- int i;
- int64_t subfile_eof;
int64_t num_full_stripes;
+ int64_t num_leftover_stripes;
int64_t partial_stripe_len;
-#ifndef NDEBUG
- int64_t test_file_eof;
-#endif /* NDEBUG */
- /* if it is, first compute the sub-file EOF */
+ num_full_stripes = logical_file_eof / sf_context->sf_blocksize_per_stripe;
+ partial_stripe_len = logical_file_eof % sf_context->sf_blocksize_per_stripe;
+ num_leftover_stripes = partial_stripe_len / sf_context->sf_stripe_size;
- num_full_stripes = logical_file_eof / sf_context->sf_blocksize_per_stripe;
- partial_stripe_len = logical_file_eof % sf_context->sf_blocksize_per_stripe;
+ /* Compute the EOF for each subfile this IOC owns */
+ for (int i = 0; i < sf_context->sf_num_fids; i++) {
+ int64_t subfile_eof = num_full_stripes * sf_context->sf_stripe_size;
+ int64_t global_subfile_idx;
- subfile_eof = num_full_stripes * sf_context->sf_stripe_size;
+ global_subfile_idx =
+ (i * sf_context->topology->n_io_concentrators) + sf_context->topology->ioc_idx;
- if (sf_context->topology->subfile_rank < (partial_stripe_len / sf_context->sf_stripe_size)) {
+ if (global_subfile_idx < num_leftover_stripes) {
+ subfile_eof += sf_context->sf_stripe_size;
+ }
+ else if (global_subfile_idx == num_leftover_stripes) {
+ subfile_eof += partial_stripe_len % sf_context->sf_stripe_size;
+ }
- subfile_eof += sf_context->sf_stripe_size;
- }
- else if (sf_context->topology->subfile_rank == (partial_stripe_len / sf_context->sf_stripe_size)) {
+ /* Direct the IOC to truncate this subfile to the correct EOF */
+ msg[0] = subfile_eof;
+ msg[1] = i;
+ msg[2] = -1; /* padding -- not used in this message */
- subfile_eof += partial_stripe_len % sf_context->sf_stripe_size;
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type,
+ sf_context->topology->io_concentrators[sf_context->topology->ioc_idx],
+ TRUNC_OP, sf_context->sf_msg_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code);
}
/* sanity check -- compute the file eof using the same mechanism used to
- * compute the sub-file eof. Assert that the computed value and the
+ * compute the subfile eof. Assert that the computed value and the
* actual value match.
*
* Do this only for debug builds -- probably delete this before release.
@@ -124,40 +135,29 @@ H5FD__subfiling__truncate_sub_files(hid_t context_id, int64_t logical_file_eof,
*/
#ifndef NDEBUG
- test_file_eof = 0;
-
- for (i = 0; i < sf_context->topology->n_io_concentrators; i++) {
-
- test_file_eof += num_full_stripes * sf_context->sf_stripe_size;
-
- if (i < (partial_stripe_len / sf_context->sf_stripe_size)) {
-
- test_file_eof += sf_context->sf_stripe_size;
+ {
+ int64_t test_file_eof = 0;
+
+ for (int i = 0; i < sf_context->sf_num_subfiles; i++) {
+ test_file_eof += num_full_stripes * sf_context->sf_stripe_size;
+
+ if (i < num_leftover_stripes) {
+ test_file_eof += sf_context->sf_stripe_size;
+ }
+ else if (i == num_leftover_stripes) {
+ test_file_eof += partial_stripe_len % sf_context->sf_stripe_size;
+ }
}
- else if (i == (partial_stripe_len / sf_context->sf_stripe_size)) {
- test_file_eof += partial_stripe_len % sf_context->sf_stripe_size;
- }
+ HDassert(test_file_eof == logical_file_eof);
}
- HDassert(test_file_eof == logical_file_eof);
#endif /* NDEBUG */
-
- /* then direct the IOC to truncate the sub-file to the correct EOF */
-
- msg[0] = subfile_eof;
- msg[1] = 0; /* padding -- not used in this message */
- msg[2] = context_id;
-
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Send(msg, 3, MPI_INT64_T,
- sf_context->topology->io_concentrators[sf_context->topology->subfile_rank],
- TRUNC_OP, sf_context->sf_msg_comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code);
}
/* Barrier on exit */
- if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+ if (mpi_size > 1)
+ if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
done:
@@ -176,9 +176,10 @@ done:
* Do this as follows:
*
* 1) allocate an array of int64_t of length equal to the
- * the number of IOCs, and initialize all fields to -1.
+ * number of subfiles, and initialize all fields to -1.
*
- * 2) Send each IOC a message requesting that sub-file's EOF.
+ * 2) Send each subfile's IOC a message requesting that
+ * subfile's EOF.
*
* 3) Await reply from each IOC, storing the reply in
* the appropriate entry in the array allocated in 1.
@@ -197,13 +198,13 @@ done:
* than for the more traditional HDF5 file implementations.
* This statement derives from the fact that unlike "normal"
* HDF5 files, subfiling introduces a multi-file representation
- * of a single HDF5 file. The plurality of sub-files represents
- * a software RAID-0 based HDF5 file. As such, each sub-file
+ * of a single HDF5 file. The plurality of subfiles represents
+ * a software RAID-0 based HDF5 file. As such, each subfile
* contains a designated portion of the address space of the
* virtual HDF5 storage. We have no notion of HDF5 datatypes,
* datasets, metadata, or other HDF5 structures; only BYTES.
*
- * The organization of the bytes within sub-files is consistent
+ * The organization of the bytes within subfiles is consistent
* with the RAID-0 striping, i.e. there are IO Concentrators
* (IOCs) which correspond to a stripe-count (in Lustre) as
* well as a stripe_size. The combination of these two
@@ -220,7 +221,7 @@ done:
* follows.
* 1. At file creation, each IOC is assigned a rank value
* (0 to N-1, where N is the total number of IOCs) and
- * a 'sf_base_addr' = 'subfile_rank' * 'sf_stripe_size')
+ * a 'sf_base_addr' (= 'ioc_idx' * 'sf_stripe_size')
* we also determine the 'sf_blocksize_per_stripe' which
* is simply the 'sf_stripe_size' * 'n_ioc_concentrators'
*
@@ -263,9 +264,10 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr)
int64_t msg[3] = {0, 0, 0};
int64_t logical_eof = 0;
int64_t sf_logical_eof;
- int n_io_concentrators = 0; /* copy of value in topology */
- int mpi_code; /* MPI return code */
- herr_t ret_value = SUCCEED; /* Return value */
+ int n_io_concentrators = 0;
+ int num_subfiles = 0;
+ int mpi_code; /* MPI return code */
+ herr_t ret_value = SUCCEED; /* Return value */
HDassert(logical_eof_ptr);
@@ -275,56 +277,60 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr)
HDassert(sf_context->topology);
n_io_concentrators = sf_context->topology->n_io_concentrators;
+ num_subfiles = sf_context->sf_num_subfiles;
HDassert(n_io_concentrators > 0);
+ HDassert(num_subfiles >= n_io_concentrators);
- if (NULL == (sf_eofs = HDmalloc((size_t)n_io_concentrators * sizeof(int64_t))))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate sub-file EOFs array");
- if (NULL == (recv_reqs = HDmalloc((size_t)n_io_concentrators * sizeof(*recv_reqs))))
+ if (NULL == (sf_eofs = HDmalloc((size_t)num_subfiles * sizeof(int64_t))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate subfile EOFs array");
+ if (NULL == (recv_reqs = HDmalloc((size_t)num_subfiles * sizeof(*recv_reqs))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate receive requests array");
- if (NULL == (recv_msg = HDmalloc((size_t)n_io_concentrators * 3 * sizeof(*recv_msg))))
+ if (NULL == (recv_msg = HDmalloc((size_t)num_subfiles * sizeof(msg))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate message array");
- for (int i = 0; i < n_io_concentrators; i++) {
+ for (int i = 0; i < num_subfiles; i++) {
sf_eofs[i] = -1;
recv_reqs[i] = MPI_REQUEST_NULL;
}
- /* Post early non-blocking receives for replies from each IOC */
- for (int i = 0; i < n_io_concentrators; i++) {
- int ioc_rank = sf_context->topology->io_concentrators[i];
+ /* Post early non-blocking receives for the EOF of each subfile */
+ for (int i = 0; i < num_subfiles; i++) {
+ int ioc_rank = sf_context->topology->io_concentrators[i % n_io_concentrators];
- if (MPI_SUCCESS != (mpi_code = MPI_Irecv(&recv_msg[3 * i], 3, MPI_INT64_T, ioc_rank,
+ if (MPI_SUCCESS != (mpi_code = MPI_Irecv(&recv_msg[3 * i], 1, H5_subfiling_rpc_msg_type, ioc_rank,
GET_EOF_COMPLETED, sf_context->sf_eof_comm, &recv_reqs[i])))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Irecv", mpi_code);
}
- /* Send each IOC a message requesting that subfile's EOF */
+ /* Send each subfile's IOC a message requesting that subfile's EOF */
+
+ msg[1] = -1; /* padding -- not used in this message */
+ msg[2] = -1; /* padding -- not used in this message */
- msg[0] = 0; /* padding -- not used in this message */
- msg[1] = 0; /* padding -- not used in this message */
- msg[2] = context_id;
+ for (int i = 0; i < num_subfiles; i++) {
+ int ioc_rank = sf_context->topology->io_concentrators[i % n_io_concentrators];
- for (int i = 0; i < n_io_concentrators; i++) {
- int ioc_rank = sf_context->topology->io_concentrators[i];
+ /* Set subfile index for receiving IOC */
+ msg[0] = i / n_io_concentrators;
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Send(msg, 3, MPI_INT64_T, ioc_rank, GET_EOF_OP, sf_context->sf_msg_comm)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type, ioc_rank, GET_EOF_OP,
+ sf_context->sf_msg_comm)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send", mpi_code);
}
/* Wait for EOF communication to complete */
- if (MPI_SUCCESS != (mpi_code = MPI_Waitall(n_io_concentrators, recv_reqs, MPI_STATUSES_IGNORE)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Waitall(num_subfiles, recv_reqs, MPI_STATUSES_IGNORE)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Waitall", mpi_code);
- for (int i = 0; i < n_io_concentrators; i++) {
+ for (int i = 0; i < num_subfiles; i++) {
int ioc_rank = (int)recv_msg[3 * i];
HDassert(ioc_rank >= 0);
HDassert(ioc_rank < n_io_concentrators);
- HDassert(sf_eofs[ioc_rank] == -1);
+ HDassert(sf_eofs[i] == -1);
- sf_eofs[ioc_rank] = recv_msg[(3 * i) + 1];
+ sf_eofs[i] = recv_msg[(3 * i) + 1];
}
/* 4) After all IOCs have replied, compute the offset of
@@ -333,21 +339,21 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr)
* EOF.
*/
- for (int i = 0; i < n_io_concentrators; i++) {
+ for (int i = 0; i < num_subfiles; i++) {
/* compute number of complete stripes */
sf_logical_eof = sf_eofs[i] / sf_context->sf_stripe_size;
/* multiply by stripe size */
- sf_logical_eof *= sf_context->sf_stripe_size * n_io_concentrators;
+ sf_logical_eof *= sf_context->sf_stripe_size * num_subfiles;
- /* if the sub-file doesn't end on a stripe size boundary, must add in a partial stripe */
+ /* if the subfile doesn't end on a stripe size boundary, must add in a partial stripe */
if (sf_eofs[i] % sf_context->sf_stripe_size > 0) {
/* add in the size of the partial stripe up to but not including this subfile */
sf_logical_eof += i * sf_context->sf_stripe_size;
- /* finally, add in the number of bytes in the last partial stripe depth in the sub-file */
+ /* finally, add in the number of bytes in the last partial stripe depth in the subfile */
sf_logical_eof += sf_eofs[i] % sf_context->sf_stripe_size;
}
@@ -365,7 +371,7 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr)
done:
if (ret_value < 0) {
- for (int i = 0; i < n_io_concentrators; i++) {
+ for (int i = 0; i < num_subfiles; i++) {
if (recv_reqs && (recv_reqs[i] != MPI_REQUEST_NULL)) {
if (MPI_SUCCESS != (mpi_code = MPI_Cancel(&recv_reqs[i])))
H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Cancel", mpi_code);