diff options
Diffstat (limited to 'src/H5FDsubfiling/H5FDsubfile_int.c')
-rw-r--r-- | src/H5FDsubfiling/H5FDsubfile_int.c | 186 |
1 files changed, 96 insertions, 90 deletions
diff --git a/src/H5FDsubfiling/H5FDsubfile_int.c b/src/H5FDsubfiling/H5FDsubfile_int.c index 22a5bd0..c089509 100644 --- a/src/H5FDsubfiling/H5FDsubfile_int.c +++ b/src/H5FDsubfiling/H5FDsubfile_int.c @@ -30,11 +30,11 @@ * Note: This code should be moved -- most likely to the IOC * code files. * - * Purpose: Apply a truncate operation to the sub-files. + * Purpose: Apply a truncate operation to the subfiles. * * In the context of the I/O concentrators, the eof must be * translated into the appropriate value for each of the - * sub-files, and then applied to same. + * subfiles, and then applied to same. * * Further, we must ensure that all prior I/O requests complete * before the truncate is applied. @@ -44,7 +44,7 @@ * 1) Run a barrier on entry. * * 2) Determine if this rank is a IOC. If it is, compute - * the correct EOF for this sub-file, and send a truncate + * the correct EOF for this subfile, and send a truncate * request to the IOC. * * 3) On the IOC thread, allow all pending I/O requests @@ -72,50 +72,61 @@ herr_t H5FD__subfiling__truncate_sub_files(hid_t context_id, int64_t logical_file_eof, MPI_Comm comm) { - int mpi_code; /* MPI return code */ subfiling_context_t *sf_context = NULL; - int64_t msg[3] = { - 0, - }; - herr_t ret_value = SUCCEED; /* Return value */ + int64_t msg[3] = {0}; + int mpi_size; + int mpi_code; + herr_t ret_value = SUCCEED; + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); /* Barrier on entry */ - if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + if (mpi_size > 1) + if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); if (NULL == (sf_context = (subfiling_context_t *)H5_get_subfiling_object(context_id))) H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "can't get subfile context"); - /* Test to see if this 
rank is running an I/O concentrator. */ - if (sf_context->topology->rank_is_ioc) { - - int i; - int64_t subfile_eof; int64_t num_full_stripes; + int64_t num_leftover_stripes; int64_t partial_stripe_len; -#ifndef NDEBUG - int64_t test_file_eof; -#endif /* NDEBUG */ - /* if it is, first compute the sub-file EOF */ + num_full_stripes = logical_file_eof / sf_context->sf_blocksize_per_stripe; + partial_stripe_len = logical_file_eof % sf_context->sf_blocksize_per_stripe; + num_leftover_stripes = partial_stripe_len / sf_context->sf_stripe_size; - num_full_stripes = logical_file_eof / sf_context->sf_blocksize_per_stripe; - partial_stripe_len = logical_file_eof % sf_context->sf_blocksize_per_stripe; + /* Compute the EOF for each subfile this IOC owns */ + for (int i = 0; i < sf_context->sf_num_fids; i++) { + int64_t subfile_eof = num_full_stripes * sf_context->sf_stripe_size; + int64_t global_subfile_idx; - subfile_eof = num_full_stripes * sf_context->sf_stripe_size; + global_subfile_idx = + (i * sf_context->topology->n_io_concentrators) + sf_context->topology->ioc_idx; - if (sf_context->topology->subfile_rank < (partial_stripe_len / sf_context->sf_stripe_size)) { + if (global_subfile_idx < num_leftover_stripes) { + subfile_eof += sf_context->sf_stripe_size; + } + else if (global_subfile_idx == num_leftover_stripes) { + subfile_eof += partial_stripe_len % sf_context->sf_stripe_size; + } - subfile_eof += sf_context->sf_stripe_size; - } - else if (sf_context->topology->subfile_rank == (partial_stripe_len / sf_context->sf_stripe_size)) { + /* Direct the IOC to truncate this subfile to the correct EOF */ + msg[0] = subfile_eof; + msg[1] = i; + msg[2] = -1; /* padding -- not used in this message */ - subfile_eof += partial_stripe_len % sf_context->sf_stripe_size; + if (MPI_SUCCESS != + (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type, + sf_context->topology->io_concentrators[sf_context->topology->ioc_idx], + TRUNC_OP, sf_context->sf_msg_comm))) + 
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code); } /* sanity check -- compute the file eof using the same mechanism used to - * compute the sub-file eof. Assert that the computed value and the + * compute the subfile eof. Assert that the computed value and the * actual value match. * * Do this only for debug builds -- probably delete this before release. @@ -124,40 +135,29 @@ H5FD__subfiling__truncate_sub_files(hid_t context_id, int64_t logical_file_eof, */ #ifndef NDEBUG - test_file_eof = 0; - - for (i = 0; i < sf_context->topology->n_io_concentrators; i++) { - - test_file_eof += num_full_stripes * sf_context->sf_stripe_size; - - if (i < (partial_stripe_len / sf_context->sf_stripe_size)) { - - test_file_eof += sf_context->sf_stripe_size; + { + int64_t test_file_eof = 0; + + for (int i = 0; i < sf_context->sf_num_subfiles; i++) { + test_file_eof += num_full_stripes * sf_context->sf_stripe_size; + + if (i < num_leftover_stripes) { + test_file_eof += sf_context->sf_stripe_size; + } + else if (i == num_leftover_stripes) { + test_file_eof += partial_stripe_len % sf_context->sf_stripe_size; + } } - else if (i == (partial_stripe_len / sf_context->sf_stripe_size)) { - test_file_eof += partial_stripe_len % sf_context->sf_stripe_size; - } + HDassert(test_file_eof == logical_file_eof); } - HDassert(test_file_eof == logical_file_eof); #endif /* NDEBUG */ - - /* then direct the IOC to truncate the sub-file to the correct EOF */ - - msg[0] = subfile_eof; - msg[1] = 0; /* padding -- not used in this message */ - msg[2] = context_id; - - if (MPI_SUCCESS != - (mpi_code = MPI_Send(msg, 3, MPI_INT64_T, - sf_context->topology->io_concentrators[sf_context->topology->subfile_rank], - TRUNC_OP, sf_context->sf_msg_comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code); } /* Barrier on exit */ - if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + if (mpi_size > 1) + if (MPI_SUCCESS != 
(mpi_code = MPI_Barrier(comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); done: @@ -176,9 +176,10 @@ done: * Do this as follows: * * 1) allocate an array of int64_t of length equal to the - * the number of IOCs, and initialize all fields to -1. + * number of subfiles, and initialize all fields to -1. * - * 2) Send each IOC a message requesting that sub-file's EOF. + * 2) Send each subfile's IOC a message requesting that + * subfile's EOF. * * 3) Await reply from each IOC, storing the reply in * the appropriate entry in the array allocated in 1. @@ -197,13 +198,13 @@ done: * than for the more traditional HDF5 file implementations. * This statement derives from the fact that unlike "normal" * HDF5 files, subfiling introduces a multi-file representation - * of a single HDF5 file. The plurality of sub-files represents - * a software RAID-0 based HDF5 file. As such, each sub-file + * of a single HDF5 file. The plurality of subfiles represents + * a software RAID-0 based HDF5 file. As such, each subfile * contains a designated portion of the address space of the * virtual HDF5 storage. We have no notion of HDF5 datatypes, * datasets, metadata, or other HDF5 structures; only BYTES. * - * The organization of the bytes within sub-files is consistent + * The organization of the bytes within subfiles is consistent * with the RAID-0 striping, i.e. there are IO Concentrators * (IOCs) which correspond to a stripe-count (in Lustre) as * well as a stripe_size. The combination of these two @@ -220,7 +221,7 @@ done: * follows. * 1. 
At file creation, each IOC is assigned a rank value * (0 to N-1, where N is the total number of IOCs) and - * a 'sf_base_addr' = 'subfile_rank' * 'sf_stripe_size') + * a 'sf_base_addr' = 'ioc_idx' * 'sf_stripe_size') * we also determine the 'sf_blocksize_per_stripe' which * is simply the 'sf_stripe_size' * 'n_ioc_concentrators' * @@ -263,9 +264,10 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr) int64_t msg[3] = {0, 0, 0}; int64_t logical_eof = 0; int64_t sf_logical_eof; - int n_io_concentrators = 0; /* copy of value in topology */ - int mpi_code; /* MPI return code */ - herr_t ret_value = SUCCEED; /* Return value */ + int n_io_concentrators = 0; + int num_subfiles = 0; + int mpi_code; /* MPI return code */ + herr_t ret_value = SUCCEED; /* Return value */ HDassert(logical_eof_ptr); @@ -275,56 +277,60 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr) HDassert(sf_context->topology); n_io_concentrators = sf_context->topology->n_io_concentrators; + num_subfiles = sf_context->sf_num_subfiles; HDassert(n_io_concentrators > 0); + HDassert(num_subfiles >= n_io_concentrators); - if (NULL == (sf_eofs = HDmalloc((size_t)n_io_concentrators * sizeof(int64_t)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate sub-file EOFs array"); - if (NULL == (recv_reqs = HDmalloc((size_t)n_io_concentrators * sizeof(*recv_reqs)))) + if (NULL == (sf_eofs = HDmalloc((size_t)num_subfiles * sizeof(int64_t)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate subfile EOFs array"); + if (NULL == (recv_reqs = HDmalloc((size_t)num_subfiles * sizeof(*recv_reqs)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate receive requests array"); - if (NULL == (recv_msg = HDmalloc((size_t)n_io_concentrators * 3 * sizeof(*recv_msg)))) + if (NULL == (recv_msg = HDmalloc((size_t)num_subfiles * sizeof(msg)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't 
allocate message array"); - for (int i = 0; i < n_io_concentrators; i++) { + for (int i = 0; i < num_subfiles; i++) { sf_eofs[i] = -1; recv_reqs[i] = MPI_REQUEST_NULL; } - /* Post early non-blocking receives for replies from each IOC */ - for (int i = 0; i < n_io_concentrators; i++) { - int ioc_rank = sf_context->topology->io_concentrators[i]; + /* Post early non-blocking receives for the EOF of each subfile */ + for (int i = 0; i < num_subfiles; i++) { + int ioc_rank = sf_context->topology->io_concentrators[i % n_io_concentrators]; - if (MPI_SUCCESS != (mpi_code = MPI_Irecv(&recv_msg[3 * i], 3, MPI_INT64_T, ioc_rank, + if (MPI_SUCCESS != (mpi_code = MPI_Irecv(&recv_msg[3 * i], 1, H5_subfiling_rpc_msg_type, ioc_rank, GET_EOF_COMPLETED, sf_context->sf_eof_comm, &recv_reqs[i]))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Irecv", mpi_code); } - /* Send each IOC a message requesting that subfile's EOF */ + /* Send each subfile's IOC a message requesting that subfile's EOF */ + + msg[1] = -1; /* padding -- not used in this message */ + msg[2] = -1; /* padding -- not used in this message */ - msg[0] = 0; /* padding -- not used in this message */ - msg[1] = 0; /* padding -- not used in this message */ - msg[2] = context_id; + for (int i = 0; i < num_subfiles; i++) { + int ioc_rank = sf_context->topology->io_concentrators[i % n_io_concentrators]; - for (int i = 0; i < n_io_concentrators; i++) { - int ioc_rank = sf_context->topology->io_concentrators[i]; + /* Set subfile index for receiving IOC */ + msg[0] = i / n_io_concentrators; - if (MPI_SUCCESS != - (mpi_code = MPI_Send(msg, 3, MPI_INT64_T, ioc_rank, GET_EOF_OP, sf_context->sf_msg_comm))) + if (MPI_SUCCESS != (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type, ioc_rank, GET_EOF_OP, + sf_context->sf_msg_comm))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send", mpi_code); } /* Wait for EOF communication to complete */ - if (MPI_SUCCESS != (mpi_code = MPI_Waitall(n_io_concentrators, recv_reqs, MPI_STATUSES_IGNORE))) + if 
(MPI_SUCCESS != (mpi_code = MPI_Waitall(num_subfiles, recv_reqs, MPI_STATUSES_IGNORE))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Waitall", mpi_code); - for (int i = 0; i < n_io_concentrators; i++) { + for (int i = 0; i < num_subfiles; i++) { int ioc_rank = (int)recv_msg[3 * i]; HDassert(ioc_rank >= 0); HDassert(ioc_rank < n_io_concentrators); - HDassert(sf_eofs[ioc_rank] == -1); + HDassert(sf_eofs[i] == -1); - sf_eofs[ioc_rank] = recv_msg[(3 * i) + 1]; + sf_eofs[i] = recv_msg[(3 * i) + 1]; } /* 4) After all IOCs have replied, compute the offset of @@ -333,21 +339,21 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr) * EOF. */ - for (int i = 0; i < n_io_concentrators; i++) { + for (int i = 0; i < num_subfiles; i++) { /* compute number of complete stripes */ sf_logical_eof = sf_eofs[i] / sf_context->sf_stripe_size; /* multiply by stripe size */ - sf_logical_eof *= sf_context->sf_stripe_size * n_io_concentrators; + sf_logical_eof *= sf_context->sf_stripe_size * num_subfiles; - /* if the sub-file doesn't end on a stripe size boundary, must add in a partial stripe */ + /* if the subfile doesn't end on a stripe size boundary, must add in a partial stripe */ if (sf_eofs[i] % sf_context->sf_stripe_size > 0) { /* add in the size of the partial stripe up to but not including this subfile */ sf_logical_eof += i * sf_context->sf_stripe_size; - /* finally, add in the number of bytes in the last partial stripe depth in the sub-file */ + /* finally, add in the number of bytes in the last partial stripe depth in the subfile */ sf_logical_eof += sf_eofs[i] % sf_context->sf_stripe_size; } @@ -365,7 +371,7 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr) done: if (ret_value < 0) { - for (int i = 0; i < n_io_concentrators; i++) { + for (int i = 0; i < num_subfiles; i++) { if (recv_reqs && (recv_reqs[i] != MPI_REQUEST_NULL)) { if (MPI_SUCCESS != (mpi_code = MPI_Cancel(&recv_reqs[i]))) H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Cancel", 
mpi_code); |