From 4a85877fdc3b6b4a4e87dad27149148c7ac6ebf5 Mon Sep 17 00:00:00 2001 From: John Mainzer Date: Mon, 19 Apr 2004 12:42:34 -0500 Subject: [svn-r8391] Purpose: Checkpoint checkin of FP bug fixes. FP is still quite buggy, but I must go deal with other matters. Description: Fixed two major bugs: 1) H5FPserver.c was clobbering meta data in its care. 2) H5FPserver.c was allocating the same space multiple times, causing both data and meta data corruption. Also made minor fixes, added debugging code, and familiarized myself with the FP code. All development work with FP enabled was done on Eirene. On this platform, FP now passes its test reliably with up to 9 processes. At 10 processes it seg faults every time. I haven't looked into this issue. There are also several known locking bugs which have to be fixed. However, they are of sufficiently low probability that I didn't bother with them on this pass. FP has not been tested with deletions -- this should be done. Also, need to test FP chunked I/O. Solution: 1) Modified cache in H5FPserver.c to merge changes correctly. Found and fixed a bug in H5TB.c in passing. 2) Multiple space allocation was caused by a race condition with set eoa requests. Most of these eoa requests appeared to be superfluous, so I deleted them. Those issued during the superblock read seemed necessary, so I inserted a barrier at the end of the superblock read, to prevent races with allocations. Platforms tested: h5committested --- src/H5AC.c | 12 +- src/H5F.c | 30 ++- src/H5FD.c | 55 ++--- src/H5FPclient.c | 173 +++++++++++++++- src/H5FPprivate.h | 6 + src/H5FPserver.c | 598 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 6 files changed, 814 insertions(+), 60 deletions(-) diff --git a/src/H5AC.c b/src/H5AC.c index fb0bafb..494bf98 100644 --- a/src/H5AC.c +++ b/src/H5AC.c @@ -1283,17 +1283,27 @@ H5AC_protect(H5F_t *f, hid_t dxpl_id, const H5AC_class_t *type, haddr_t addr, if (H5FP_request_lock(H5FD_fphdf5_file_id(lf), addr, rw == H5AC_WRITE ? H5FP_LOCK_WRITE : H5FP_LOCK_READ, TRUE, &req_id, &status) < 0) { +#if 0 + HDfprintf(stdout, "H5AC_protect: Lock failed.\n"); /* * FIXME: Check the status variable. If the lock is got * by some other process, we can loop and wait or bail * out of this function */ -HDfprintf(stderr, "Couldn't get lock for metadata at address %a\n", addr); + HDfprintf(stderr, "Couldn't get lock for metadata at address %a\n", addr); +#endif /* 0 */ HGOTO_ERROR(H5E_FPHDF5, H5E_CANTLOCK, NULL, "can't lock data on SAP!") } /* Load a thing from the SAP. */ if (NULL == (thing = type->load(f, dxpl_id, addr, udata1, udata2))) { +#if 0 + HDfprintf(stdout, + "%s: Load failed. addr = %a, type->id = %d.\n", + "H5AC_protect", + addr, + (int)(type->id)); +#endif /* 0 */ HCOMMON_ERROR(H5E_CACHE, H5E_CANTLOAD, "unable to load object") if (H5FP_request_release_lock(H5FD_fphdf5_file_id(lf), addr, diff --git a/src/H5F.c b/src/H5F.c index 007b2bf..63ebdde 100644 --- a/src/H5F.c +++ b/src/H5F.c @@ -1957,16 +1957,32 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id, hid_t d if (H5F_read_superblock(file, dxpl_id, &root_ent, super_info.addr, buf, (size_t)super_info.size) < 0) HGOTO_ERROR(H5E_FILE, H5E_READERROR, NULL, "unable to read superblock") + } + + /* The following barrier ensures that all set eoa operations + * associated with creating the superblock are complete before + * we attempt any allocations. + * JRM - 4/13/04 + */ + if ( (mrc = MPI_Barrier(H5FP_SAP_BARRIER_COMM)) != MPI_SUCCESS ) + { + HMPI_GOTO_ERROR(NULL, "MPI_Barrier failed", mrc) + } + if (!H5FD_fphdf5_is_captain(lf)) { if (H5G_mkroot(file, dxpl_id, &root_ent) < 0) HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, NULL, "unable to create/open root group") - } /* end if */ + } /* All clients free the buffer used for broadcasting the superblock */ buf = H5MM_xfree (buf); } /* end if */ #endif /* H5_HAVE_FPHDF5 */ } else if (1 == shared->nrefs) { +#ifdef H5_HAVE_FPHDF5 + int mrc; /*MPI return code */ +#endif /* H5_HAVE_FPHDF5 */ + /* Read the superblock if it hasn't been read before. */ if (HADDR_UNDEF == (shared->super_addr = H5F_locate_signature(lf,dxpl_id))) HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, NULL, "unable to find file signature") @@ -1974,6 +1990,18 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id, hid_t d if (H5F_read_superblock(file, dxpl_id, &root_ent, shared->super_addr, NULL, 0) < 0) HGOTO_ERROR(H5E_FILE, H5E_READERROR, NULL, "unable to read superblock") +#ifdef H5_HAVE_FPHDF5 + if (H5FD_is_fphdf5_driver(lf)) { + /* reading the superblock generates lots of set_eoa calls. To avoid + * race conditions with allocations, make sure that everyone is done + * reading the superblock before we proceed. + */ + if ( (mrc = MPI_Barrier(H5FP_SAP_BARRIER_COMM)) != MPI_SUCCESS ) { + HMPI_GOTO_ERROR(NULL, "MPI_Barrier failed", mrc) + } + } +#endif /* H5_HAVE_FPHDF5 */ + /* Make sure we can open the root group */ if (H5G_mkroot(file, dxpl_id, &root_ent) < 0) HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to read root group") diff --git a/src/H5FD.c b/src/H5FD.c index 2ab1666..d7ac465 100644 --- a/src/H5FD.c +++ b/src/H5FD.c @@ -1518,6 +1518,10 @@ done: * Bill Wendling, 2003/02/19 * Added support for FPHDF5. * + * John Mainzer, 2004/04/13 + * Moved much of the FPHDF5 specific code into H5FP_client_alloc(), + * and re-worked it to get rid of a race condition on the eoa. + * *------------------------------------------------------------------------- */ haddr_t @@ -1540,51 +1544,16 @@ H5FD_alloc(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, hsize_t size) * is the SAP executing this code, then skip the send to the SAP and * try to do the actual allocations. */ - if (H5FD_is_fphdf5_driver(file) && !H5FD_fphdf5_is_sap(file)) { - unsigned req_id = 0; - unsigned capt_only = 0; - H5FP_status_t status = H5FP_STATUS_OK; - H5P_genplist_t *plist; - H5FP_alloc_t fp_alloc; - - /* Get the data xfer property list */ - if ((plist = H5I_object(dxpl_id)) == NULL) - HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, HADDR_UNDEF, "not a dataset transfer list") - - if (H5P_exist_plist(plist, H5FD_FPHDF5_CAPTN_ALLOC_ONLY) > 0) - if (H5P_get(plist, H5FD_FPHDF5_CAPTN_ALLOC_ONLY, &capt_only) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, HADDR_UNDEF, "can't retrieve FPHDF5 property") + if ( H5FD_is_fphdf5_driver(file) && !H5FD_fphdf5_is_sap(file) ) { + haddr_t addr; - HDmemset(&fp_alloc, 0, sizeof(fp_alloc)); - - /* - * If the captain is the only one who should allocate resources, - * then do just that... - */ - if (!capt_only || H5FD_fphdf5_is_captain(file)) { - /* Send the request to the SAP */ - if (H5FP_request_allocate(file, type, size, &fp_alloc.addr, - &fp_alloc.eoa, &req_id, &status) != SUCCEED) - /* FIXME: Should we check the "status" variable here? */ - HGOTO_ERROR(H5E_FPHDF5, H5E_CANTALLOC, HADDR_UNDEF, - "server couldn't allocate from file") - } - - if (capt_only) { - int mrc; - - if ((mrc = MPI_Bcast(&fp_alloc, 1, H5FP_alloc, - (int)H5FP_capt_barrier_rank, - H5FP_SAP_BARRIER_COMM)) != MPI_SUCCESS) - HMPI_GOTO_ERROR(HADDR_UNDEF, "MPI_Bcast failed", mrc); + if ( (addr = H5FP_client_alloc(file, type, dxpl_id, size)) + == HADDR_UNDEF) { + HGOTO_ERROR(H5E_FPHDF5, H5E_CANTALLOC, HADDR_UNDEF, + "allocation failed.") + } else { + HGOTO_DONE(addr) } - - - /* Set the EOA for all processes. This doesn't fail. */ - file->cls->set_eoa(file, fp_alloc.eoa); - - /* We've succeeded -- return the value */ - HGOTO_DONE(fp_alloc.addr) } #endif /* H5_HAVE_FPHDF5 */ diff --git a/src/H5FPclient.c b/src/H5FPclient.c index 434fa06..8e52056 100644 --- a/src/H5FPclient.c +++ b/src/H5FPclient.c @@ -45,6 +45,7 @@ static int interface_initialize_g = 0; static unsigned H5FP_gen_request_id(void); static herr_t H5FP_dump_to_file(H5FD_t *file, hid_t dxpl_id); + /* *===----------------------------------------------------------------------=== * Public Library (non-API) Functions @@ -332,7 +333,27 @@ H5FP_request_read_metadata(H5FD_t *file, unsigned file_id, hid_t dxpl_id, HDmemset(*buf, '\0', size); HDmemset(&mpi_status, 0, sizeof(mpi_status)); - if (size < sap_read.md_size) { + /* the following code is a bit odd and doubtless needs a bit + * of explanation. I certainly stumbled over it the first + * time I read it. + * + * For reasons unknown, read requests sent to the SAP only + * include a base address, not a length. Thus the SAP sends + * along the largest contiguous chunk it has starting at the + * specified address. + * + * If the chunk is bigger than we want, we just copy over what + * we want, and discard the rest. + * + * If it is just the right size, we receive it in the provided + * buffer. + * + * if it is too small to fulfil our request, we scream and die. + * + * JRM - 4/13/04 + */ + if (size < sap_read.md_size) + { char *mdata; if (H5FP_read_metadata(&mdata, (int)sap_read.md_size, (int)H5FP_sap_rank) == FAIL) { @@ -347,8 +368,11 @@ HDfprintf(stderr, "Metadata Read Failed!!!!\n"); H5FP_SAP_COMM, &mpi_status)) != MPI_SUCCESS) HMPI_GOTO_ERROR(FAIL, "MPI_Recv failed", mrc); } else { -HDfprintf(stderr, "Buffer not big enough to hold metadata!!!!\n"); -assert(0); + HDfprintf(stdout, + "H5FP_request_read_metadata: size = %d > md_size = %d.\n", + (int)size, (int)(sap_read.md_size)); + HDfprintf(stdout, "Mssg received from SAP is too small!!!!\n"); + assert(0); } break; @@ -628,6 +652,110 @@ done: FUNC_LEAVE_NOAPI(ret_value); } + +/* + * Function: H5FP_client_alloc + * Purpose: Handle the client side of an allocation in the FP case. + * In essence, this is simply a matter of referring the + * request to the SAP, and then returning the reply. + * + * A modified version of this code used to live in H5FD_alloc(), + * but I move it here to encapsulate it and generally tidy up. + * + * One can argue that we should all be done in an alloc + * routine in H5FDfdhdf5.c, but this invlves a smaller + * change to the code, and thus a smaller loss if I missed + * a major gotcha. If things go well, and we don't heave + * the current implementation of FP, I'll probably go that + * route eventually. + * Return: Success: The format address of the new file memory. + * Failure: The undefined address HADDR_UNDEF + * Programmer: JRM - 4/7/04 + * Modifications: + */ +haddr_t +H5FP_client_alloc(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, hsize_t size) +{ + haddr_t ret_value = HADDR_UNDEF; + unsigned req_id = 0; + unsigned capt_only = 0; + H5FP_status_t status = H5FP_STATUS_OK; + H5P_genplist_t *plist; + H5FP_alloc_t fp_alloc; + + FUNC_ENTER_NOAPI(H5FP_client_alloc, HADDR_UNDEF) + + /* check args */ + HDassert(file); + HDassert(file->cls); + HDassert(type >= 0 && type < H5FD_MEM_NTYPES); + HDassert(size > 0); + + /* verify that we are running FP and we are not the SAP. */ + HDassert(H5FD_is_fphdf5_driver(file) && !H5FD_fphdf5_is_sap(file)); + + /* Get the data xfer property list */ + if ( (plist = H5I_object(dxpl_id)) == NULL ) { + HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, HADDR_UNDEF, "not a dataset transfer list") + } + + if ( H5P_exist_plist(plist, H5FD_FPHDF5_CAPTN_ALLOC_ONLY) > 0 ) { + if ( H5P_get(plist, H5FD_FPHDF5_CAPTN_ALLOC_ONLY, &capt_only) < 0 ) { + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, HADDR_UNDEF, "can't retrieve FPHDF5 property") + } + } + + HDmemset(&fp_alloc, 0, sizeof(fp_alloc)); + + /* + * If the captain is the only one who should allocate resources, + * then do just that... + */ + if ( !capt_only || H5FD_fphdf5_is_captain(file) ) { + /* Send the request to the SAP */ + if ( H5FP_request_allocate(file, type, size, &fp_alloc.addr, + &fp_alloc.eoa, &req_id, &status) + != SUCCEED ) { + HGOTO_ERROR(H5E_FPHDF5, H5E_CANTALLOC, HADDR_UNDEF, + "server couldn't allocate from file") + } + } + + /* It should be impossible for this assertion to fail, but then + * that is what assertions are for. + */ + HDassert(status == H5FP_STATUS_OK); + + if ( capt_only ) { + int mrc; + + if ( (mrc = MPI_Bcast(&fp_alloc, 1, H5FP_alloc, + (int)H5FP_capt_barrier_rank, + H5FP_SAP_BARRIER_COMM)) != MPI_SUCCESS ) { + HMPI_GOTO_ERROR(HADDR_UNDEF, "MPI_Bcast failed", mrc); + } + } + + /* we used to send the eoa to the sap here, but that is silly, + * as the sap already knows, and it is possible that another + * interleaving allocation will result in a corrupted eoa. + * + * JRM - 4/7/04 + */ + + /* We've succeeded -- return the value */ + HGOTO_DONE(fp_alloc.addr) + +done: + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5FP_client_alloc() */ + + +/* This function is now called only by H5FP_client_alloc() above. + * Should we make it a private function only accessible from this + * file? JRM - 4/8/04 + */ /* * Function: H5FP_request_allocate * Purpose: Request an allocation of space from the SAP. @@ -739,8 +867,17 @@ H5FP_request_free(H5FD_t *file, H5FD_mem_t mem_type, haddr_t addr, hsize_t size, if (sap_alloc.status != H5FP_STATUS_OK) HGOTO_ERROR(H5E_RESOURCE, H5E_CANTCHANGE, FAIL, "can't free space on server"); +#if 0 /* JRM */ + /* the set_eoa call just sends the eoa we received from the SAP back + * -- with obvious race condition problems if there are interleaving + * calls. Thus I am commenting this call out for now, and will delete + * it in time if I can't find a reason for it. + * + * JRM -- 4/7/04 + */ /* Set the EOA for all processes. This call doesn't fail. */ file->cls->set_eoa(file, sap_alloc.eoa); +#endif /* JRM */ *status = H5FP_STATUS_OK; done: @@ -834,6 +971,24 @@ H5FP_request_set_eoa(H5FD_t *file, haddr_t eoa, unsigned *req_id, H5FP_status_t req.proc_rank = H5FD_mpi_get_rank(file); req.addr = eoa; +#if 0 + /* This is useful debugging code -- lets keep for a while. + * JRM -- 4/13/04 + */ + /* dump stack each time we set the eoa */ + { + int mpi_rank; + + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + HDfprintf(stdout, + "%d: %s: setting eoa: last eoa = %a, new eoa = %a.\n", + mpi_rank, "H5FP_request_set_eoa", last_eoa_received, eoa); + H5FS_print(stdout); + + } +#endif + if ((mrc = MPI_Send(&req, 1, H5FP_request, (int)H5FP_sap_rank, H5FP_TAG_REQUEST, H5FP_SAP_COMM)) != MPI_SUCCESS) HMPI_GOTO_ERROR(FAIL, "MPI_Send failed", mrc); @@ -903,8 +1058,20 @@ H5FP_request_update_eoma_eosda(H5FD_t *file, unsigned *req_id, H5FP_status_t *st H5FP_SAP_BARRIER_COMM)) != MPI_SUCCESS) HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed!", mrc); +#if 0 + /* The following set_eoa just parrots back to the SAP the eoa + * we just received from it. While I don't think it is a problem + * in this case, there are obvious potentials for race conditions, + * and I don't see that it does anything useful. + * + * Thus I am commenting it out for now. I'll delete it completely + * as soon as I am sure that it serves no purpose whatsoever. + * + * JRM - 4/8/04 + */ /* Set the EOA for all processes. This doesn't fail. */ file->cls->set_eoa(file, sap_eoa.eoa); +#endif *status = H5FP_STATUS_OK; done: diff --git a/src/H5FPprivate.h b/src/H5FPprivate.h index aa85969..2c2f8b1 100644 --- a/src/H5FPprivate.h +++ b/src/H5FPprivate.h @@ -316,10 +316,16 @@ extern herr_t H5FP_request_flush_metadata(H5FD_t *file, unsigned file_id, H5FP_status_t *status); extern herr_t H5FP_request_close(H5FD_t *file, unsigned sap_file_id, unsigned *req_id, H5FP_status_t *status); + +/* the following function should probably become a private function + * in H5FPclient.c if H5FP_client_alloc() does the job. -- JRM + */ extern herr_t H5FP_request_allocate(H5FD_t *file, H5FD_mem_t mem_type, hsize_t size, haddr_t *addr, haddr_t *eoa, unsigned *req_id, H5FP_status_t *status); +extern haddr_t H5FP_client_alloc(H5FD_t *file, H5FD_mem_t type, + hid_t dxpl_id, hsize_t size); extern herr_t H5FP_request_free(H5FD_t *file, H5FD_mem_t mem_type, haddr_t addr, hsize_t size, unsigned *req_id, H5FP_status_t *status); diff --git a/src/H5FPserver.c b/src/H5FPserver.c index 1e2ca62..64535fd 100644 --- a/src/H5FPserver.c +++ b/src/H5FPserver.c @@ -454,6 +454,12 @@ H5FP_remove_object_lock_from_list(H5FP_file_info *info, * Return: <0, 0, or >0 * Programmer: Bill Wendling, 27. August, 2002 * Modifications: + * Altered the function to use the H5F_addr_cmp() macro + * from H5Fprivate. This has the effect of reversing + * the direction of the comparison. This in turn + * should make the next and less tree primitives + * behave as expected. + * JRM - 3/22/04 */ static int H5FP_file_mod_cmp(H5FP_mdata_mod *k1, @@ -463,7 +469,7 @@ H5FP_file_mod_cmp(H5FP_mdata_mod *k1, FUNC_ENTER_NOAPI_NOINIT_NOFUNC(H5FP_file_mod_cmp); assert(k1); assert(k2); - FUNC_LEAVE_NOAPI(k2->addr - k1->addr); + FUNC_LEAVE_NOAPI(H5F_addr_cmp((k1->addr), (k2->addr))); } /* @@ -517,6 +523,410 @@ done: } /* + * Function: H5FP_merge_mod_node_with_next + * + * Purpose: Given a node in a mod tree which overlaps with the next + * node in the tree, merge the two. Where the two nodes + * overlap, use the data from the supplied node. + * + * WARNING!!! + * + * This function calls H5TB_rem(), which may not delete the + * node specified in its parameter list -- if the target node + * is internal, it may swap data with a leaf node and delete + * the leaf instead. + * + * This implies that any pointer into the supplied tree may + * be invalid after this functions returns. Thus the calling + * function must re-aquire the address of *node_ptr (and any + * other nodes in *tree_ptr) after this function returns if + * it needs to do anything further with the node. + * + * Return: Success: SUCCEED + * Failure: FAIL + * + * Programmer: JRM - 3/18/04 + * + * Modifications: + * + * None. + */ +static herr_t +H5FP_merge_mod_node_with_next(H5TB_TREE *tree_ptr, H5TB_NODE *node_ptr) +{ + int i; + int j; + int offset; + herr_t ret_value; + H5TB_NODE *next_node_ptr; + H5FP_mdata_mod *mod_ptr; + H5FP_mdata_mod *next_mod_ptr; + H5FP_mdata_mod *key_ptr; + unsigned combined_md_size; + char *combined_metadata_ptr; + + FUNC_ENTER_NOAPI_NOINIT(H5FP_merge_mod_node_with_next); + + ret_value = SUCCEED; + + /* check parameters & do some initializations in passing */ + if ( ( tree_ptr == NULL ) || + ( node_ptr == NULL ) || + ( (mod_ptr = (H5FP_mdata_mod *)(node_ptr->data)) == NULL ) || + ( (next_node_ptr = H5TB_next(node_ptr)) == NULL ) || + ( (next_mod_ptr = next_node_ptr->data) == NULL ) || + ( mod_ptr->addr >= next_mod_ptr->addr ) || + ( (mod_ptr->addr + mod_ptr->md_size) <= next_mod_ptr->addr ) ) { + HGOTO_ERROR(H5E_FPHDF5, H5E_BADVALUE, FAIL, + "One or more bad params detected on entry."); + } + + if ( (mod_ptr->addr + mod_ptr->md_size) < + (next_mod_ptr->addr + next_mod_ptr->md_size) ) { + /* The next node address range is not completely subsumed in + * that of the current node. Must allocate a new buffer, and + * copy over the contents of the two buffers. Where the buffers + * overlap, give precidence to the data from *node_ptr + */ + combined_md_size = (next_mod_ptr->addr + next_mod_ptr->md_size) - + (mod_ptr->addr); + + combined_metadata_ptr = + (char *)H5MM_malloc((size_t)(combined_md_size + 1)); + + if ( combined_metadata_ptr == NULL ) { + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, + "can't allocate buffer for combined node."); + } + + i = 0; /* this is the index into the combined buffer */ + + for ( j = 0; j < mod_ptr->md_size; j++ ) { + combined_metadata_ptr[i++] = (mod_ptr->metadata)[j]; + } + + offset = (int)((mod_ptr->addr + mod_ptr->md_size) - next_mod_ptr->addr); + + for ( j = offset; j < next_mod_ptr->md_size; j++ ) { + combined_metadata_ptr[i++] = (next_mod_ptr->metadata)[j]; + } + + HDassert(i == combined_md_size); + + combined_metadata_ptr[i] = (char)0; + + HDfree(mod_ptr->metadata); + mod_ptr->metadata = combined_metadata_ptr; + mod_ptr->md_size = combined_md_size; + } + + /* We have copied metadata from the next node into the current node + * if this was necessary. All that remains is to delete the next + * node from the tree and free it. + */ + + H5TB_rem(&(tree_ptr->root), next_node_ptr, (void **)(&key_ptr)); + + /* WARNING!!! + * + * node_ptr or any other pointer to a node in *tree_ptr may be invalid + * at this point. Find the associated data in the tree again if you + * have any further need of it. + */ + + HDassert(key_ptr == next_mod_ptr); + H5FP_free_mod_node(next_mod_ptr); + +done: + + FUNC_LEAVE_NOAPI(ret_value); + +} /* H5FP_merge_mod_node_with_next() */ + +/* + * Function: H5FP_merge_mod_node_with_prev + * + * Purpose: Given a node in a mod tree which overlaps with the previous + * node in the tree, merge the two. Where the two nodes + * overlap, use the data from the supplied node. + * + * WARNING!!! + * + * This function calls H5TB_rem() to delete node_ptr from + * the tree pointed to by tree_ptr. H5TB_rem() may not delete + * the node specified in its parameter list -- if the target + * node is internal, it may swap data with a leaf node and + * delete the leaf instead. + * + * This implies that any pointer into the supplied tree may + * be invalid after this functions returns. Thus the calling + * function must re-aquire the address of any node in *tree_ptr + * after this function returns if it needs to do anything + * further with the node in question. + * + * Return: Success: SUCCEED + * Failure: FAIL + * + * Programmer: JRM - 3/19/04 + * + * Modifications: + * + * None. + */ +static herr_t +H5FP_merge_mod_node_with_prev(H5TB_TREE *tree_ptr, H5TB_NODE *node_ptr) +{ + int i; + int j; + int limit; + herr_t ret_value; + H5TB_NODE *prev_node_ptr; + H5FP_mdata_mod *mod_ptr; + H5FP_mdata_mod *prev_mod_ptr; + H5FP_mdata_mod *key_ptr; + unsigned combined_md_size; + char *combined_metadata_ptr; + + FUNC_ENTER_NOAPI_NOINIT(H5FP_merge_mod_node_with_prev); + + ret_value = SUCCEED; + + /* check parameters & do some initializations in passing */ + if ( ( tree_ptr == NULL ) || + ( node_ptr == NULL ) || + ( (mod_ptr = (H5FP_mdata_mod *)(node_ptr->data)) == NULL ) || + ( (prev_node_ptr = H5TB_prev(node_ptr)) == NULL ) || + ( (prev_mod_ptr = (H5FP_mdata_mod *)(prev_node_ptr->data)) == NULL ) || + ( mod_ptr->addr <= prev_mod_ptr->addr ) || + ( (prev_mod_ptr->addr + prev_mod_ptr->md_size) <= mod_ptr->addr ) ) { + HGOTO_ERROR(H5E_FPHDF5, H5E_BADVALUE, FAIL, + "One or more bad params detected on entry."); + } + + if ( (prev_mod_ptr->addr + prev_mod_ptr->md_size) < + (mod_ptr->addr + mod_ptr->md_size) ) { + /* The node address range is not completely subsumed in + * that of the previous node. Must allocate a new buffer, and + * copy over the contents of the two buffers. Where the buffers + * overlap, give precidence to the data from *node_ptr + */ + combined_md_size = (mod_ptr->addr + mod_ptr->md_size) - + (prev_mod_ptr->addr); + + combined_metadata_ptr = + (char *)H5MM_malloc((size_t)(combined_md_size + 1)); + + if ( combined_metadata_ptr == NULL ) { + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, + "can't allocate buffer for combined node."); + } + + i = 0; /* this is the index into the combined buffer */ + + limit = (int)(mod_ptr->addr - prev_mod_ptr->addr); + + HDassert(limit > 0 ); + + for ( j = 0; j < limit; j++ ) { + combined_metadata_ptr[i++] = (prev_mod_ptr->metadata)[j]; + } + + for ( j = 0; j < (int)(mod_ptr->md_size); j++ ) { + combined_metadata_ptr[i++] = (mod_ptr->metadata)[j]; + } + + HDassert(i == combined_md_size); + + combined_metadata_ptr[i] = (char)0; + + HDfree(prev_mod_ptr->metadata); + prev_mod_ptr->metadata = combined_metadata_ptr; + prev_mod_ptr->md_size = combined_md_size; + } else { /* supplied node is completely inside previous node */ + /* no need to allocate a new buffer. Just copy data from + * mod_ptr->metadata to the appropriate locations in + * prev_mod_ptr->metadata. + */ + + i = (int)(mod_ptr->addr - prev_mod_ptr->addr); + + for ( j = 0; j < (int)(mod_ptr->md_size); j++ ) { + (prev_mod_ptr->metadata)[i++] = (mod_ptr->metadata)[j]; + } + + HDassert(i <= prev_mod_ptr->md_size); + } + + /* We have copied metadata from the current node into the previous + * node. All that remains is to delete the current node from the + * tree and free it. + */ + + H5TB_rem(&(tree_ptr->root), node_ptr, (void **)(&key_ptr)); + + /* WARNING!!! + * + * Any pointer to a node in *tree_ptr may be invalid now as a result + * of the above call to H5TB_rem(). Find the associated data in the + * tree again if you have any further need of it. + */ + + HDassert(key_ptr == mod_ptr); + H5FP_free_mod_node(mod_ptr); + +done: + + FUNC_LEAVE_NOAPI(ret_value); + +} /* H5FP_merge_mod_node_with_prev() */ + +/* + * Function: H5FP_mod_node_overlaps_with_next + * + * Purpose: Given a node in a mod tree, see if there is an overlap + * between the address range of the supplied node, and that + * of the next node in the tree (if any). + * + * Return: TRUE if there is an overlap, and FALSE if there + * isn't. + * + * Programmer: JRM - 3/18/04 + * + * Modifications: + * + * None. + */ +static hbool_t +H5FP_mod_node_overlaps_with_next(H5TB_NODE *node_ptr) +{ + hbool_t ret_value; + H5TB_NODE *next_node_ptr; + + FUNC_ENTER_NOAPI_NOINIT(H5FP_mod_node_overlaps_with_next); + + ret_value = FALSE; + + HDassert(node_ptr != NULL); + + next_node_ptr = H5TB_next(node_ptr); + + if ( next_node_ptr != NULL ) { + if ( ( ((H5FP_mdata_mod *)(node_ptr->data))->addr > 100000 ) || + ( (int)(((H5FP_mdata_mod *)(node_ptr->data))->md_size) > 1024 ) ) { + HDfprintf(stdout, "%s: addr = %a, size = %d, mem_type = %d.\n", + "H5FP_mod_node_overlaps_with_next(2)", + (haddr_t)(((H5FP_mdata_mod *)(node_ptr->data))->addr), + (int)(((H5FP_mdata_mod *)(node_ptr->data))->md_size), + (int)(((H5FP_mdata_mod *)(node_ptr->data))->mem_type)); + } + + if ( (((H5FP_mdata_mod *)(node_ptr->data))->addr) + >= + (((H5FP_mdata_mod *)(next_node_ptr->data))->addr) + ) { + HDfprintf(stdout, "%s: addr,len = %a,%d, next_addr,len = %a,%d.\n", + "H5FP_mod_node_overlaps_with_next", + (((H5FP_mdata_mod *)(node_ptr->data))->addr), + (int)(((H5FP_mdata_mod *)(node_ptr->data))->md_size), + (((H5FP_mdata_mod *)(next_node_ptr->data))->addr), + (int)(((H5FP_mdata_mod *)(next_node_ptr->data))->md_size)); + + HDassert((((H5FP_mdata_mod *)(node_ptr->data))->addr) + < + (((H5FP_mdata_mod *)(next_node_ptr->data))->addr) + ); + } + if ( ( (((H5FP_mdata_mod *)(node_ptr->data))->addr) + + + (((H5FP_mdata_mod *)(node_ptr->data))->md_size) + ) + > + (((H5FP_mdata_mod *)(next_node_ptr->data))->addr) + ) { +#if 0 + /* This is useful debugging code -- keep it around for + * a while. JRM -- 4/13/03 + */ + HDfprintf(stdout, + "H5FP_mod_node_overlaps_with_next: addr = %a, next_addr = %a.\n", + (((H5FP_mdata_mod *)(node_ptr->data))->addr), + (((H5FP_mdata_mod *)(next_node_ptr->data))->addr)); +#endif + ret_value = TRUE; + } + } + +done: + + FUNC_LEAVE_NOAPI(ret_value); + +} /* H5FP_mod_node_overlaps_with_next() */ + +/* + * Function: H5FP_mod_node_overlaps_with_prev + * + * Purpose: Givena node in a mod tree, see if there is an overlap + * between the address range of the supplied node, and that + * of the previous node in the tree (if any). + * + * Return: TRUE if there is an overlap, and FALSE if there + * isn't. + * + * Programmer: JRM - 3/18/04 + * + * Modifications: + * + * None. + */ +static hbool_t +H5FP_mod_node_overlaps_with_prev(H5TB_NODE *node_ptr) +{ + hbool_t ret_value; + H5TB_NODE *prev_node_ptr; + + FUNC_ENTER_NOAPI_NOINIT(H5FP_mod_node_overlaps_with_prev); + + ret_value = FALSE; + + HDassert(node_ptr != NULL); + + prev_node_ptr = H5TB_prev(node_ptr); + + + if ( prev_node_ptr != NULL ) + { + HDassert((((H5FP_mdata_mod *)(node_ptr->data))->addr) + > + (((H5FP_mdata_mod *)(prev_node_ptr->data))->addr) + ); + + if ( ( (((H5FP_mdata_mod *)(prev_node_ptr->data))->addr) + + + (((H5FP_mdata_mod *)(prev_node_ptr->data))->md_size) + ) + > + (((H5FP_mdata_mod *)(node_ptr->data))->addr) + ) { +#if 0 + /* This is useful debugging code -- keep it around for + * a while. JRM - 4/13/04 + */ + HDfprintf(stdout, + "H5FP_mod_node_overlaps_with_prev: addr = %a, prev_addr = %a.\n", + (((H5FP_mdata_mod *)(node_ptr->data))->addr), + (((H5FP_mdata_mod *)(prev_node_ptr->data))->addr)); +#endif + ret_value = TRUE; + } + } + +done: + + FUNC_LEAVE_NOAPI(ret_value); + +} /* H5FP_mod_node_overlaps_with_prev() */ + +/* * Function: H5FP_add_file_mod_to_list * Purpose: Add a metadata write to a file ID. If the metadata is * already in the cache, then we just replace it with the @@ -526,12 +936,17 @@ done: * Failure: FAIL * Programmer: Bill Wendling, 02. August, 2002 * Modifications: + * Re-worked code to merge overlapping metadata changes, + * and to avoid discarding metadata if the supplied metadata + * is smaller than that already in the mod list. + * JRM -- 3/29/04 */ static herr_t H5FP_add_file_mod_to_list(H5FP_file_info *info, H5FD_mem_t mem_type, haddr_t addr, unsigned md_size, char *metadata) { + int i; H5FP_mdata_mod *fm, mod; H5TB_NODE *node; herr_t ret_value = FAIL; @@ -542,7 +957,14 @@ H5FP_add_file_mod_to_list(H5FP_file_info *info, H5FD_mem_t mem_type, assert(info); mod.addr = addr; /* This is the key field for the TBBT */ - +#if 0 + /* This is useful debugging code -- keep it around for a + * while. JRM -- 4/13/04 + */ + HDfprintf(stdout, + "H5FP_add_file_mod_to_list: Adding chunk at %a of length %d.\n", + addr, (int)md_size); +#endif if ((node = H5TB_dfind(info->mod_tree, (void *)&mod, NULL)) != NULL) { /* * The metadata is in the cache already. All we have to do is @@ -550,18 +972,89 @@ H5FP_add_file_mod_to_list(H5FP_file_info *info, H5FD_mem_t mem_type, * The only things to change is the metadata and its size. */ fm = (H5FP_mdata_mod *)node->data; - HDfree(fm->metadata); - fm->metadata = metadata; - fm->md_size = md_size; + + if ( fm->md_size > md_size ) { + for ( i = 0; i < md_size; i++ ) + { + (fm->metadata)[i] = metadata[i]; + } + HDfree(metadata); + } else if ( fm->md_size < md_size ) { + HDfree(fm->metadata); + fm->metadata = metadata; + fm->md_size = md_size; + + while ( H5FP_mod_node_overlaps_with_next(node) ) { + if ( H5FP_merge_mod_node_with_next(info->mod_tree, node) + == FAIL ) { + /* Need to define better errors here. -- JRM */ + HGOTO_ERROR(H5E_FPHDF5, H5E_CANTCHANGE, FAIL, + "Can't merge with next."); + } else { + (info->num_mods)--; /* since we just merged */ + + /* H5FP_merge_mod_node_with_next() contains a call + * to H5TB_rem(), which may clobber node. Hence we + * must look it up again before proceeding. + */ + node = H5TB_dfind(info->mod_tree, (void *)&mod, NULL); + HDassert(node != NULL); + HDassert(node->data == fm); + HDassert(node->key == fm); + } + } + } else { /* fm->md_size == md_size */ + HDfree(fm->metadata); + fm->metadata = metadata; + } + HGOTO_DONE(SUCCEED); } - if ((fm = H5FP_new_file_mod_node(mem_type, addr, md_size, metadata)) != NULL) { - if (!H5TB_dins(info->mod_tree, (void *)fm, NULL)) + if ( (fm = H5FP_new_file_mod_node(mem_type, addr, md_size, metadata)) + != NULL) { + if ( (node = H5TB_dins(info->mod_tree, (void *)fm, NULL)) == NULL ) { HGOTO_ERROR(H5E_FPHDF5, H5E_CANTINSERT, FAIL, "can't insert modification into tree"); + } + + (info->num_mods)++; + + /* merge with next as required */ + while ( H5FP_mod_node_overlaps_with_next(node) ) { + if ( H5FP_merge_mod_node_with_next(info->mod_tree, node) == FAIL ) { + /* Need to define better errors here. -- JRM */ + HGOTO_ERROR(H5E_FPHDF5, H5E_CANTCHANGE, FAIL, + "Can't merge new node with next."); + } else { + (info->num_mods)--; /* since we just merged */ + + /* H5FP_merge_mod_node_with_next() contains a call + * to H5TB_rem(), which may clobber node. Hence we + * must look it up again before proceeding. + */ + node = H5TB_dfind(info->mod_tree, (void *)&mod, NULL); + HDassert(node != NULL); + HDassert(node->data == fm); + HDassert(node->key == fm); + } + } + + /* if the tree was valid to begin with, we must merge with at + * most one previous node. + */ + if ( H5FP_mod_node_overlaps_with_prev(node) ) { + if ( H5FP_merge_mod_node_with_prev(info->mod_tree, node) == FAIL ) { + /* Need to define better errors here. -- JRM */ + HGOTO_ERROR(H5E_FPHDF5, H5E_CANTCHANGE, FAIL, + "Can't merge new node with prev."); + } + /* H5FP_merge_mod_node_with_prev() calls H5TB_rem() to delete + * node after it merges with the previous node. Thus node is + * invalid at this point. + */ + } - ++info->num_mods; HGOTO_DONE(SUCCEED); } @@ -1274,6 +1767,12 @@ H5FP_sap_handle_read_request(H5FP_request_t *req) int mrc; FUNC_ENTER_NOAPI_NOINIT(H5FP_sap_handle_read_request); +#if 0 + /* More useful debugging code to keep for a time. JRM - 4/13/04 */ + HDfprintf(stdout, + "H5FP_sap_handle_read_request: req->addr = %a.\n", + req->addr); +#endif r.req_id = req->req_id; r.file_id = req->file_id; @@ -1356,7 +1855,12 @@ H5FP_sap_handle_write_request(H5FP_request_t *req, char *mdata, unsigned md_size herr_t ret_value = SUCCEED; FUNC_ENTER_NOAPI_NOINIT(H5FP_sap_handle_write_request); - +#if 0 + /* Debugging code -- lets keep it for a time. JRM -- 4/13/04 */ + HDfprintf(stdout, + "H5FP_sap_handle_write_request: addr = %a, md_size = %d.\n", + (haddr_t)(req->addr), (int)md_size); +#endif if ((info = H5FP_find_file_info(req->file_id)) != NULL) { if (info->num_mods >= H5FP_MDATA_CACHE_HIGHWATER_MARK) { /* @@ -1544,14 +2048,26 @@ H5FP_sap_handle_alloc_request(H5FP_request_t *req) sap_alloc.eoa = HADDR_UNDEF; sap_alloc.status = H5FP_STATUS_CANT_ALLOC; } - +#if 0 + /* Debugging code -- lets keep it for a time. JRM -- 4/13/04 */ + HDfprintf(stdout, + "%s: req_size = %d, req_type = %d, addr = %a, eoa = %a, status = %d, rp_rank = %d.\n", + "H5FP_sap_handle_alloc_request", + (int)(req->meta_block_size), + (int)(req->mem_type), + sap_alloc.addr, + sap_alloc.eoa, + (int)(sap_alloc.status), + (int)(req->proc_rank)); +#endif done: if ((mrc = MPI_Send(&sap_alloc, 1, H5FP_alloc, (int)req->proc_rank, H5FP_TAG_ALLOC, H5FP_SAP_COMM)) != MPI_SUCCESS) HMPI_DONE_ERROR(FAIL, "MPI_Send failed", mrc); FUNC_LEAVE_NOAPI(ret_value); -} + +} /* H5FP_sap_handle_alloc_request() */ /* * Function: H5FP_sap_handle_free_request @@ -1577,6 +2093,17 @@ H5FP_sap_handle_free_request(H5FP_request_t *req) sap_alloc.status = H5FP_STATUS_OK; sap_alloc.mem_type = req->mem_type; +#if 0 + /* Debugging code -- lets keep it for a time. JRM -- 4/13/04 */ + HDfprintf(stdout, + "%s: addr = %a, block_size = %a, mem_type = %d, rp_rank = %d.\n", + "H5FP_sap_handle_free_request", + req->addr, + req->meta_block_size, + (int)(req->mem_type), + (int)(req->proc_rank)); +#endif + if ((info = H5FP_find_file_info(req->file_id)) != NULL) { if (H5FD_free((H5FD_t*)&info->file, req->mem_type, H5P_DEFAULT, req->addr, req->meta_block_size) != SUCCEED) { @@ -1599,7 +2126,8 @@ done: HMPI_DONE_ERROR(FAIL, "MPI_Send failed", mrc); FUNC_LEAVE_NOAPI(ret_value); -} + +} /* H5FP_sap_handle_free_request() */ /* * Function: H5FP_sap_handle_get_eoa_request @@ -1623,10 +2151,25 @@ H5FP_sap_handle_get_eoa_request(H5FP_request_t *req) sap_eoa.file_id = req->file_id; if ((info = H5FP_find_file_info(req->file_id)) != NULL) { + +#if 0 + /* Debugging code -- lets keep it for a time. JRM -- 4/13/04 */ + HDfprintf(stdout, "%s: eoa = %a, rp_rank = %d.\n", + "H5FP_sap_handle_get_eoa_request", + ((H5FD_fphdf5_t*)&info->file)->eoa, + (int)(req->proc_rank)); +#endif + /* Get the EOA. */ sap_eoa.eoa = ((H5FD_fphdf5_t*)&info->file)->eoa; sap_eoa.status = H5FP_STATUS_OK; } else { +#if 1 + /* Debugging code -- lets keep it for a time. JRM -- 4/13/04 */ + HDfprintf(stdout, "%s: function failed. rp_rank = %d.\n", + "H5FP_sap_handle_get_eoa_request", + (int)(req->proc_rank)); +#endif sap_eoa.eoa = HADDR_UNDEF; sap_eoa.status = H5FP_STATUS_CANT_ALLOC; ret_value = FAIL; @@ -1660,10 +2203,41 @@ H5FP_sap_handle_set_eoa_request(H5FP_request_t *req) FUNC_ENTER_NOAPI_NOINIT_NOFUNC(H5FP_sap_handle_set_eoa_request); if ((info = H5FP_find_file_info(req->file_id)) != NULL) { + +#if 0 + /* Debugging code -- lets keep it for a time. JRM -- 4/13/04 */ + if ( req->addr < ((H5FD_fphdf5_t*)&info->file)->eoa ) { + HDfprintf(stdout, + "%s: old eoa = %a, new eoa = %a, rp_rank = %d. %s\n", + "H5FP_sap_handle_set_eoa_request", + ((H5FD_fphdf5_t*)&info->file)->eoa, + req->addr, + (int)(req->proc_rank), + "<---- eoa reduced!!! -------"); + } +#if 0 + else { + HDfprintf(stdout, + "%s: old eoa = %a, new eoa = %a, rp_rank = %d.\n", + "H5FP_sap_handle_set_eoa_request", + ((H5FD_fphdf5_t*)&info->file)->eoa, + req->addr, + (int)(req->proc_rank)); + } +#endif +#endif + /* Get the EOA. */ ((H5FD_fphdf5_t*)&info->file)->eoa = req->addr; exit_state = H5FP_STATUS_OK; } else { +#if 1 + /* Debugging code -- lets keep it for a time. JRM -- 4/13/04 */ + HDfprintf(stdout, + "%s: Function failed -- Couldn't get info. new eoa = %a.\n", + "H5FP_sap_handle_set_eoa_request", + req->addr); +#endif exit_state = H5FP_STATUS_CANT_ALLOC; ret_value = FAIL; } -- cgit v0.12