From 0b6016a3fc6f7cf5b60160e638e3ae7280952106 Mon Sep 17 00:00:00 2001 From: Jordan Henderson Date: Wed, 25 Jan 2017 19:34:10 -0600 Subject: Fix bug with chunk data updating --- src/H5Dmpio.c | 135 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 68 insertions(+), 67 deletions(-) diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c index 70a859e..39a5e15 100644 --- a/src/H5Dmpio.c +++ b/src/H5Dmpio.c @@ -2661,12 +2661,12 @@ H5D__construct_filtered_io_info_list(const H5D_io_info_t *io_info, const H5D_typ H5D_filtered_collective_io_info_t *local_info_array = NULL; H5D_filtered_collective_io_info_t *overlap_info_array = NULL; H5S_sel_iter_t *mem_iter = NULL; + unsigned char *mod_data = NULL; H5SL_node_t *chunk_node; MPI_Request *send_requests = NULL; MPI_Status *send_statuses = NULL; hbool_t no_overlap = FALSE; hbool_t mem_iter_init = FALSE; - uint8_t *mod_data = NULL; size_t num_send_requests; size_t num_chunks_selected; size_t overlap_info_array_num_entries; @@ -2680,6 +2680,8 @@ H5D__construct_filtered_io_info_list(const H5D_io_info_t *io_info, const H5D_typ HDassert(type_info); HDassert(fm); HDassert(chunk_list); + HDassert(num_entries); + HDassert(_num_chunks_selected_array); if ((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file)) < 0) HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank") @@ -2723,10 +2725,7 @@ H5D__construct_filtered_io_info_list(const H5D_io_info_t *io_info, const H5D_typ HDfprintf(debug_file, "| - Chunk Num Elmts Sel.: %zd\n", H5S_GET_SELECT_NPOINTS(local_info_array[j].chunk_info.mspace)); } HDfprintf(debug_file, "------------------------------\n\n"); -#endif - -#ifdef PARALLEL_COMPRESS_DEBUG HDfprintf(debug_file, "Testing mem/file space addresses:\n"); HDfprintf(debug_file, "-----------------------------------\n"); @@ -2805,19 +2804,23 @@ H5D__construct_filtered_io_info_list(const H5D_io_info_t *io_info, const H5D_typ overlap_info_array[num_chunks_selected++] = chunk_entry; } /* end if */ else { + unsigned char *mod_data_p = NULL; hssize_t iter_nelmts; - hssize_t mod_data_size; - uint8_t *mod_data_p = NULL; + size_t mod_data_size; /* XXX: Need some way of checking chunk entry to validate that this process * is actually contributing some data for this chunk update */ - /* Determine size of serialized chunk selection plus the size + /* Determine size of serialized chunk memory dataspace plus the size * of the data being written */ - if ((mod_data_size = H5S_SELECT_SERIAL_SIZE(chunk_entry.chunk_info.mspace)) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTENCODE, FAIL, "unable to check dataspace selection size") + /* XXX: Using the file dataspace seems to work here, presumably because the file + * space reflects how the data is actually stored in the file, not in memory. e.g. + * it reflects the chunking setup + */ + if (H5S_encode(chunk_entry.chunk_info.fspace, &mod_data, &mod_data_size) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTENCODE, FAIL, "unable to get encoded dataspace size") if ((iter_nelmts = H5S_GET_SELECT_NPOINTS(chunk_entry.chunk_info.mspace)) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTCOUNT, FAIL, "dataspace is invalid") @@ -2828,16 +2831,16 @@ H5D__construct_filtered_io_info_list(const H5D_io_info_t *io_info, const H5D_typ mod_data_size += iter_nelmts * type_info->src_type_size; #ifdef PARALLEL_COMPRESS_DEBUG - HDfprintf(debug_file, "Allocing %zd bytes for mod. data buffer.\n", (size_t) mod_data_size); + HDfprintf(debug_file, "| Allocing %zd bytes for mod. data buffer.\n", (size_t) mod_data_size); #endif - if (NULL == (mod_data = (uint8_t *) H5MM_malloc(mod_data_size))) + if (NULL == (mod_data = (unsigned char *) H5MM_malloc(mod_data_size))) HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk modification send buffer") - /* Serialize the current selection into the buffer */ + /* Serialize the chunk's file dataspace into the buffer */ mod_data_p = mod_data; - if (H5S_SELECT_SERIALIZE(chunk_entry.chunk_info.mspace, &mod_data_p) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTENCODE, FAIL, "unable to serialize dataspace selection") + if (H5S_encode(chunk_entry.chunk_info.fspace, &mod_data_p, &mod_data_size) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTENCODE, FAIL, "unable to encode dataspace") /* Initialize iterator for memory selection */ if (H5S_select_iter_init(mem_iter, chunk_entry.chunk_info.mspace, type_info->src_type_size) < 0) @@ -2845,7 +2848,7 @@ H5D__construct_filtered_io_info_list(const H5D_io_info_t *io_info, const H5D_typ mem_iter_init = TRUE; #ifdef PARALLEL_COMPRESS_DEBUG - HDfprintf(debug_file, "Iterating over %lld elements.\n", iter_nelmts); + HDfprintf(debug_file, "| Iterating over %lld elements.\n", iter_nelmts); #endif /* Collect the modification data into the buffer */ @@ -2862,27 +2865,28 @@ H5D__construct_filtered_io_info_list(const H5D_io_info_t *io_info, const H5D_typ } HDfprintf(debug_file, "]\n|\n"); - HDfprintf(debug_file, "Sending modification data for chunk at address %a to process %d.\n", chunk_entry.old_chunk.offset, new_owner); + HDfprintf(debug_file, "| Sending modification data for chunk at address %a to process %d.\n", chunk_entry.old_chunk.offset, new_owner); #endif /* Send modification data to new owner */ - H5_CHECK_OVERFLOW(mod_data_size, hssize_t, int) + H5_CHECK_OVERFLOW(mod_data_size, size_t, int) if (MPI_SUCCESS != (mpi_code = MPI_Isend(mod_data, (int) mod_data_size, MPI_BYTE, new_owner, chunk_entry.chunk_info.index, io_info->comm, &send_requests[num_send_requests++]))) HMPI_GOTO_ERROR(FAIL, "MPI_Isend failed", mpi_code) #ifdef PARALLEL_COMPRESS_DEBUG - HDfprintf(debug_file, "Mod. data sent.\n"); + HDfprintf(debug_file, "| Mod. data sent.\n|\n"); #endif if (mod_data) - mod_data = (uint8_t *) H5MM_free(mod_data); + mod_data = (unsigned char *) H5MM_free(mod_data); if (mem_iter_init && H5S_SELECT_ITER_RELEASE(mem_iter) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "couldn't release selection iterator") + mem_iter_init = FALSE; } /* end else */ #ifdef PARALLEL_COMPRESS_DEBUG - HDfprintf(debug_file, "Chunk at address %a re-assigned to process %d.\n", chunk_addr, new_owner); + HDfprintf(debug_file, "| Chunk at address %a re-assigned to process %d.\n|\n", chunk_addr, new_owner); #endif } /* end for */ @@ -2902,9 +2906,9 @@ H5D__construct_filtered_io_info_list(const H5D_io_info_t *io_info, const H5D_typ HDfprintf(debug_file, "| Chunk Entry %zd:\n", j); HDfprintf(debug_file, "| - Chunk Address: %a\n", local_info_array[j].old_chunk.offset); HDfprintf(debug_file, "| - Chunk Length: %zd\n", local_info_array[j].old_chunk.length); - HDfprintf(debug_file, "| - Address of mspace: %x\n", local_info_array[j].chunk_info.mspace); - HDfprintf(debug_file, "| - Chunk Selection Type: %d\n", H5S_GET_SELECT_TYPE(local_info_array[j].chunk_info.mspace)); - HDfprintf(debug_file, "| - Chunk Num Elmts Sel.: %zd\n", H5S_GET_SELECT_NPOINTS(local_info_array[j].chunk_info.mspace)); + HDfprintf(debug_file, "| - Address of mspace: %x\n", local_info_array[j].chunk_info.fspace); + HDfprintf(debug_file, "| - Chunk Selection Type: %d\n", H5S_GET_SELECT_TYPE(local_info_array[j].chunk_info.fspace)); + HDfprintf(debug_file, "| - Chunk Num Elmts Sel.: %zd\n", H5S_GET_SELECT_NPOINTS(local_info_array[j].chunk_info.fspace)); } HDfprintf(debug_file, "------------------------------\n\n"); #endif @@ -2947,6 +2951,8 @@ done: H5MM_free(send_statuses); if (mod_data) H5MM_free(mod_data); + if (mem_iter_init && H5S_SELECT_ITER_RELEASE(mem_iter) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "couldn't release selection iterator") if (mem_iter) H5MM_free(mem_iter); @@ -3076,11 +3082,13 @@ H5D__filtered_collective_chunk_entry_io(H5D_filtered_collective_io_info_t *chunk const H5D_io_info_t *io_info, const H5D_type_info_t *type_info) { H5S_sel_iter_t *mem_iter = NULL; + unsigned char *mod_data = NULL; unsigned filter_mask = 0; hssize_t iter_nelmts; hbool_t full_overwrite = FALSE; hbool_t mem_iter_init = FALSE; size_t buf_size; + H5S_t *dataspace = NULL; herr_t ret_value = SUCCEED; FUNC_ENTER_STATIC @@ -3179,12 +3187,10 @@ H5D__filtered_collective_chunk_entry_io(H5D_filtered_collective_io_info_t *chunk /* Update the chunk data with any modifications from other processes */ while (chunk_entry->num_writers > 1) { - MPI_Status status; - uint8_t *buf = NULL; - uint8_t *buf_p = NULL; - H5S_t *selection = NULL; - int count; - int mpi_code; + unsigned char *mod_data_p = NULL; + MPI_Status status; + int count; + int mpi_code; /* XXX: Since the receive tag needs to be an int, it is possible that a chunk's index * may fall outside the range of an int and cause an overflow problem when casting down @@ -3196,7 +3202,7 @@ H5D__filtered_collective_chunk_entry_io(H5D_filtered_collective_io_info_t *chunk HMPI_GOTO_ERROR(FAIL, "MPI_Probe failed", mpi_code) #ifdef PARALLEL_COMPRESS_DEBUG - HDfprintf(debug_file, "Found message from source %d with tag %d.\n", status.MPI_SOURCE, status.MPI_TAG); + HDfprintf(debug_file, "| - Found message from source %d with tag %d.\n", status.MPI_SOURCE, status.MPI_TAG); #endif /* Retrieve the message size */ @@ -3204,81 +3210,73 @@ H5D__filtered_collective_chunk_entry_io(H5D_filtered_collective_io_info_t *chunk HMPI_GOTO_ERROR(FAIL, "MPI_Get_count failed", mpi_code) #ifdef PARALLEL_COMPRESS_DEBUG - HDfprintf(debug_file, "Message size is %d bytes.\n", count); + HDfprintf(debug_file, "| - Message size is %d bytes.\n", count); #endif - if (NULL == (buf = (uint8_t *) H5MM_malloc(count))) + if (NULL == (mod_data = (unsigned char *) H5MM_malloc(count))) HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate modification data receive buffer") - buf_p = buf; + mod_data_p = mod_data; - if (MPI_SUCCESS != (mpi_code = MPI_Recv(buf, count, MPI_BYTE, MPI_ANY_SOURCE, + if (MPI_SUCCESS != (mpi_code = MPI_Recv(mod_data, count, MPI_BYTE, MPI_ANY_SOURCE, chunk_entry->chunk_info.index, io_info->comm, &status))) HMPI_GOTO_ERROR(FAIL, "MPI_Recv failed", mpi_code) #ifdef PARALLEL_COMPRESS_DEBUG - HDfprintf(debug_file, "Received the message.\n"); + HDfprintf(debug_file, "| - Received the message.\n"); #endif - /* Deserialize the selection in the chunk's dataspace */ - if (H5S_SELECT_DESERIALIZE(&selection, &buf_p) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTDECODE, FAIL, "unable to deserialize dataspace selection") - - /* H5S_extent_copy(selection, chunk_entry->chunk_info.mspace); */ + /* Decode the chunk's memory dataspace */ + if (NULL == (dataspace = H5S_decode(&mod_data_p))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTDECODE, FAIL, "unable to decode dataspace") #ifdef PARALLEL_COMPRESS_DEBUG - HDfprintf(debug_file, "Deserialized selection info:\n"); + HDfprintf(debug_file, "| Deserialized selection info:\n"); HDfprintf(debug_file, "| Mem Space:\n"); - HDfprintf(debug_file, "| - Extent Num elements: %zd\n", H5S_GET_EXTENT_NPOINTS(selection)); - HDfprintf(debug_file, "| - Extent type: %d\n", H5S_GET_EXTENT_TYPE(selection)); - HDfprintf(debug_file, "| - Selection Num elements: %zd\n", H5S_GET_SELECT_NPOINTS(selection)); - HDfprintf(debug_file, "| - Selection type: %d\n", H5S_GET_SELECT_TYPE(selection)); + HDfprintf(debug_file, "| - Extent Num elements: %zd\n", H5S_GET_EXTENT_NPOINTS(dataspace)); + HDfprintf(debug_file, "| - Extent type: %d\n", H5S_GET_EXTENT_TYPE(dataspace)); + HDfprintf(debug_file, "| - Selection Num elements: %zd\n", H5S_GET_SELECT_NPOINTS(dataspace)); + HDfprintf(debug_file, "| - Selection type: %d\n", H5S_GET_SELECT_TYPE(dataspace)); #endif - /* XXX: After receiving the selection information, the extent is not - * set/valid. Possibly copy the selection information directly into - * chunk entry mem space, or copy extent into received selection H5S type - */ - - - if (H5S_select_iter_init(mem_iter, selection, type_info->dst_type_size) < 0) + if (H5S_select_iter_init(mem_iter, dataspace, type_info->dst_type_size) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to initialize memory selection information") mem_iter_init = TRUE; - if ((iter_nelmts = H5S_GET_SELECT_NPOINTS(selection)) < 0) + if ((iter_nelmts = H5S_GET_SELECT_NPOINTS(dataspace)) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTCOUNT, FAIL, "dataspace is invalid") #ifdef PARALLEL_COMPRESS_DEBUG - HDfprintf(debug_file, "Contents of message:\n["); + HDfprintf(debug_file, "| Contents of message:\n| ["); for (size_t j = 0; j < iter_nelmts; j++) { if (j > 0) HDfprintf(debug_file, ", "); - HDfprintf(debug_file, "%lld", ((long *) buf_p)[j]); + HDfprintf(debug_file, "%lld", ((long *) mod_data_p)[j]); } HDfprintf(debug_file, "]\n"); #endif #ifdef PARALLEL_COMPRESS_DEBUG - HDfprintf(debug_file, "Iter nelmts=%lld.\n", iter_nelmts); - HDfprintf(debug_file, "Mem space selected points: %zd.\n", H5S_GET_SELECT_NPOINTS(selection)); + HDfprintf(debug_file, "| Iter nelmts=%lld.\n", iter_nelmts); + HDfprintf(debug_file, "| Mem space selected points: %zd.\n| \n", H5S_GET_SELECT_NPOINTS(dataspace)); #endif /* Update the chunk data with the received modification data */ - /* if (!H5D__gather_mem(buf_p, selection, mem_iter, (size_t) iter_nelmts, - io_info->dxpl_cache, chunk_entry->buf)) - HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "couldn't gather to write buffer") */ - if (H5D__scatter_mem(buf_p, selection, mem_iter, (size_t) iter_nelmts, + if (H5D__scatter_mem(mod_data_p, dataspace, mem_iter, (size_t) iter_nelmts, io_info->dxpl_cache, chunk_entry->buf) < 0) HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "couldn't scatter to write buffer") chunk_entry->num_writers--; - if (buf) - H5MM_free(buf); - if (H5S_SELECT_ITER_RELEASE(mem_iter) < 0) + if (mod_data) + H5MM_free(mod_data); + if (mem_iter_init && H5S_SELECT_ITER_RELEASE(mem_iter) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "couldn't release selection iterator") - if (selection) - if (H5S_close(selection) < 0) + if (dataspace) { + if (H5S_close(dataspace) < 0) HGOTO_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "can't close dataspace") - } + dataspace = NULL; + } + mem_iter_init = FALSE; + } /* end while */ #ifdef PARALLEL_COMPRESS_DEBUG @@ -3314,6 +3312,9 @@ H5D__filtered_collective_chunk_entry_io(H5D_filtered_collective_io_info_t *chunk done: if (mem_iter_init && H5S_SELECT_ITER_RELEASE(mem_iter) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "couldn't release selection iterator") + if (dataspace) + if (H5S_close(dataspace) < 0) + HGOTO_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "can't close dataspace") if (mem_iter) H5MM_free(mem_iter) -- cgit v0.12