diff options
author | Quincey Koziol <koziol@hdfgroup.org> | 2002-07-19 19:27:09 (GMT) |
---|---|---|
committer | Quincey Koziol <koziol@hdfgroup.org> | 2002-07-19 19:27:09 (GMT) |
commit | 99eee6dff9a38a1bab3d74b4b77dd8169e57409c (patch) | |
tree | 5db3dd6e5ed35a694c655337750dc741e30181fb /src | |
parent | 814ea8b962fec329f6a440d1f52f6b682de47524 (diff) | |
download | hdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.zip hdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.tar.gz hdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.tar.bz2 |
[svn-r5814] Purpose:
Bug Fix
Description:
It was possible to create corrupted metadata information (either in memory
or in the file or both) with a parallel I/O program because of the way
metadata writes were being handled for writes out of the metadata cache.
Solution:
Added a dataset transfer property called "block before metadata write"
which is used by the MPI-I/O and MPI-posix drivers to sync up all the
processes before attempting a metadata write. This property is currently
only for metadata writes from the metadata cache.
Platforms tested:
IRIX64 6.5 (modi4) w/parallel
Diffstat (limited to 'src')
-rw-r--r-- | src/H5.c | 1 | ||||
-rw-r--r-- | src/H5AC.c | 139 | ||||
-rw-r--r-- | src/H5ACprivate.h | 11 | ||||
-rw-r--r-- | src/H5B.c | 24 | ||||
-rw-r--r-- | src/H5FDmpio.c | 69 | ||||
-rw-r--r-- | src/H5FDmpiposix.c | 56 | ||||
-rw-r--r-- | src/H5Gnode.c | 24 | ||||
-rw-r--r-- | src/H5HG.c | 26 | ||||
-rw-r--r-- | src/H5HL.c | 31 | ||||
-rw-r--r-- | src/H5O.c | 30 | ||||
-rw-r--r-- | src/H5private.h | 1 |
11 files changed, 304 insertions, 108 deletions
@@ -201,6 +201,7 @@ H5_term_library(void) pending += DOWN(TN); pending += DOWN(T); pending += DOWN(A); + pending += DOWN(AC); pending += DOWN(P); pending += DOWN(I); } while (pending && ntries++ < 100); @@ -31,7 +31,9 @@ #include "H5Eprivate.h" #include "H5Fpkg.h" #include "H5FLprivate.h" /*Free Lists */ +#include "H5Iprivate.h" /* IDs */ #include "H5MMprivate.h" +#include "H5Pprivate.h" /* Property lists */ /* * Sorting the cache by address before flushing is sometimes faster @@ -43,8 +45,16 @@ * Private file-scope variables. */ #define PABLO_MASK H5AC_mask -#define INTERFACE_INIT NULL + +/* Interface initialization */ static int interface_initialize_g = 0; +#define INTERFACE_INIT H5AC_init_interface +static herr_t H5AC_init_interface(void); + +#ifdef H5_HAVE_PARALLEL +/* Dataset transfer property list for flush calls */ +static hid_t H5AC_dxpl_id=(-1); +#endif /* H5_HAVE_PARALLEL */ #ifdef H5AC_SORT_BY_ADDR static H5AC_t *current_cache_g = NULL; /*for sorting */ @@ -66,6 +76,107 @@ H5FL_ARR_DEFINE_STATIC(H5AC_prot_t,-1); /*------------------------------------------------------------------------- + * Function: H5AC_init_interface + * + * Purpose: Initialize interface-specific information + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Quincey Koziol + * Thursday, July 18, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC_init_interface(void) +{ +#ifdef H5_HAVE_PARALLEL + H5P_genclass_t *xfer_pclass; /* Dataset transfer property list class object */ + H5P_genplist_t *xfer_plist; /* Dataset transfer property list object */ + unsigned block_before_meta_write=1; /* Custom value for "block before meta write" property */ +#endif /* H5_HAVE_PARALLEL */ + + FUNC_ENTER_NOINIT(H5AC_init_interface); + +#ifdef H5_HAVE_PARALLEL + /* Sanity check */ + assert(H5P_CLS_DATASET_XFER_g!=(-1)); + + /* Get the dataset transfer property list class object */ + if (H5I_GENPROP_CLS != H5I_get_type(H5P_CLS_DATASET_XFER_g) || NULL == (xfer_pclass = H5I_object(H5P_CLS_DATASET_XFER_g))) + HRETURN_ERROR(H5E_CACHE, H5E_BADATOM, FAIL, "can't get property list class"); + + /* Create a new dataset transfer property list */ + if ((H5AC_dxpl_id=H5P_create_id(xfer_pclass)) < 0) + HRETURN_ERROR(H5E_CACHE, H5E_CANTCREATE, FAIL, "unable to register property list"); + + /* Get the property list object */ + if (NULL == (xfer_plist = H5I_object(H5AC_dxpl_id))) + HRETURN_ERROR(H5E_CACHE, H5E_BADATOM, FAIL, "can't get new property list object"); + + /* Insert 'block before metadata write' property */ + if(H5P_insert(xfer_plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME,H5AC_BLOCK_BEFORE_META_WRITE_SIZE,&block_before_meta_write,NULL,NULL,NULL,NULL,NULL)<0) + HRETURN_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't insert metadata cache dxpl property"); +#endif /* H5_HAVE_PARALLEL */ + + FUNC_LEAVE(SUCCEED); +} /* end H5AC_init_interface() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC_term_interface + * + * Purpose: Terminate this interface. + * + * Return: Success: Positive if anything was done that might + * affect other interfaces; zero otherwise. + * + * Failure: Negative. + * + * Programmer: Quincey Koziol + * Thursday, July 18, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +int +H5AC_term_interface(void) +{ + int n=0; + + FUNC_ENTER_NOINIT(H5AC_term_interface); + + if (interface_initialize_g) { +#ifdef H5_HAVE_PARALLEL + if(H5AC_dxpl_id>0) { + /* Indicate more work to do */ + n = 1; /* H5I */ + + /* Close H5AC dxpl */ + if (H5Pclose(H5AC_dxpl_id) < 0) + H5E_clear(); /*ignore the error*/ + else { + /* Reset static ID */ + H5AC_dxpl_id=(-1); + + /* Reset interface initialization flag */ + interface_initialize_g = 0; + } /* end else */ + } /* end if */ + else +#endif /* H5_HAVE_PARALLEL */ + /* Reset interface initialization flag */ + interface_initialize_g = 0; + } /* end if */ + + FUNC_LEAVE(n); +} /* end H5AC_term_interface() */ + + +/*------------------------------------------------------------------------- * Function: H5AC_create * * Purpose: Initialize the cache just after a file is opened. The @@ -273,7 +384,7 @@ H5AC_find_f(H5F_t *f, const H5AC_class_t *type, haddr_t addr, * Load a new thing. If it can't be loaded, then return an error * without preempting anything. */ - if (NULL == (thing = (type->load)(f, addr, udata1, udata2))) { + if (NULL == (thing = (type->load)(f, H5P_DATASET_XFER_DEFAULT, addr, udata1, udata2))) { HRETURN_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "unable to load object"); } /* @@ -283,13 +394,13 @@ H5AC_find_f(H5F_t *f, const H5AC_class_t *type, haddr_t addr, H5AC_subid_t type_id=(*info)->type->id; /* Remember this for later */ flush = (*info)->type->flush; - status = (flush)(f, TRUE, (*info)->addr, (*info)); + status = (flush)(f, H5AC_dxpl_id, TRUE, (*info)->addr, (*info)); if (status < 0) { /* * The old thing could not be removed from the stack. * Release the new thing and fail. */ - if ((type->flush)(f, TRUE, addr, thing) < 0) { + if ((type->flush)(f, H5AC_dxpl_id, TRUE, addr, thing) < 0) { HRETURN_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "unable to flush just-loaded object"); } @@ -413,6 +524,7 @@ H5AC_flush(H5F_t *f, const H5AC_class_t *type, haddr_t addr, hbool_t destroy) cache = f->shared->cache; if (!H5F_addr_defined(addr)) { + unsigned first_flush=1; /* Indicate if this is the first flush */ #ifdef H5AC_SORT_BY_ADDR /* @@ -457,7 +569,14 @@ H5AC_flush(H5F_t *f, const H5AC_class_t *type, haddr_t addr, hbool_t destroy) H5AC_subid_t type_id=(*info)->type->id; /* Remember this for later */ flush = (*info)->type->flush; - status = (flush)(f, destroy, (*info)->addr, (*info)); + + /* Only block for all the processes on the first piece of metadata */ + if(first_flush) { + status = (flush)(f, H5AC_dxpl_id, destroy, (*info)->addr, (*info)); + first_flush=0; + } /* end if */ + else + status = (flush)(f, H5P_DATASET_XFER_DEFAULT, destroy, (*info)->addr, (*info)); if (status < 0) { #ifdef H5AC_SORT_BY_ADDR map = H5FL_ARR_FREE(int,map); @@ -493,7 +612,7 @@ H5AC_flush(H5F_t *f, const H5AC_class_t *type, haddr_t addr, hbool_t destroy) * Flush just this entry. */ flush = cache->slot[i]->type->flush; - status = (flush)(f, destroy, cache->slot[i]->addr, + status = (flush)(f, H5AC_dxpl_id, destroy, cache->slot[i]->addr, cache->slot[i]); if (status < 0) { HRETURN_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, @@ -567,7 +686,7 @@ H5AC_set(H5F_t *f, const H5AC_class_t *type, haddr_t addr, void *thing) H5AC_subid_t type_id=(*info)->type->id; /* Remember this for later */ flush = (*info)->type->flush; - status = (flush)(f, TRUE, (*info)->addr, (*info)); + status = (flush)(f, H5AC_dxpl_id, TRUE, (*info)->addr, (*info)); if (status < 0) { HRETURN_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush object"); @@ -656,7 +775,7 @@ H5AC_rename(H5F_t *f, const H5AC_class_t *type, haddr_t old_addr, H5AC_subid_t type_id=cache->slot[new_idx]->type->id; /* Remember this for later */ flush = cache->slot[new_idx]->type->flush; - status = (flush)(f, TRUE, cache->slot[new_idx]->addr, + status = (flush)(f, H5AC_dxpl_id, TRUE, cache->slot[new_idx]->addr, cache->slot[new_idx]); if (status < 0) { HRETURN_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, @@ -774,7 +893,7 @@ H5AC_protect(H5F_t *f, const H5AC_class_t *type, haddr_t addr, * without preempting anything. */ cache->diagnostics[type->id].nmisses++; - if (NULL == (thing = (type->load)(f, addr, udata1, udata2))) { + if (NULL == (thing = (type->load)(f, H5P_DATASET_XFER_DEFAULT, addr, udata1, udata2))) { HRETURN_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "unable to load object"); } @@ -860,7 +979,7 @@ H5AC_unprotect(H5F_t *f, const H5AC_class_t *type, haddr_t addr, void *thing) assert(H5F_addr_ne((*info)->addr, addr)); flush = (*info)->type->flush; - status = (flush)(f, TRUE, (*info)->addr, (*info)); + status = (flush)(f, H5AC_dxpl_id, TRUE, (*info)->addr, (*info)); if (status < 0) { HRETURN_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush object"); diff --git a/src/H5ACprivate.h b/src/H5ACprivate.h index d96fb5e..734e56a 100644 --- a/src/H5ACprivate.h +++ b/src/H5ACprivate.h @@ -58,8 +58,8 @@ typedef enum H5AC_subid_t { H5AC_NTYPES = 5 /*THIS MUST BE LAST! */ } H5AC_subid_t; -typedef void *(*H5AC_load_func_t)(H5F_t*, haddr_t addr, const void *udata1, void *udata2); -typedef herr_t (*H5AC_flush_func_t)(H5F_t*, hbool_t dest, haddr_t addr, void *thing); +typedef void *(*H5AC_load_func_t)(H5F_t*, hid_t dxpl_id, haddr_t addr, const void *udata1, void *udata2); +typedef herr_t (*H5AC_flush_func_t)(H5F_t*, hid_t dxpl_id, hbool_t dest, haddr_t addr, void *thing); typedef struct H5AC_class_t { H5AC_subid_t id; @@ -107,6 +107,13 @@ typedef struct H5AC_t { } diagnostics[H5AC_NTYPES]; /*diagnostics for each type of object*/ } H5AC_t; +#ifdef H5_HAVE_PARALLEL +/* Definitions for "block before metadata write" property */ +#define H5AC_BLOCK_BEFORE_META_WRITE_NAME "H5AC_block_before_meta_write" +#define H5AC_BLOCK_BEFORE_META_WRITE_SIZE sizeof(unsigned) +#define H5AC_BLOCK_BEFORE_META_WRITE_DEF 0 +#endif /* H5_HAVE_PARALLEL */ + /* * Library prototypes. */ @@ -118,8 +118,8 @@ static H5B_ins_t H5B_insert_helper(H5F_t *f, haddr_t addr, static herr_t H5B_insert_child(H5F_t *f, const H5B_class_t *type, H5B_t *bt, int idx, haddr_t child, H5B_ins_t anchor, void *md_key); -static herr_t H5B_flush(H5F_t *f, hbool_t destroy, haddr_t addr, H5B_t *b); -static H5B_t *H5B_load(H5F_t *f, haddr_t addr, const void *_type, void *udata); +static herr_t H5B_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B_t *b); +static H5B_t *H5B_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, const void *_type, void *udata); static herr_t H5B_decode_key(H5F_t *f, H5B_t *bt, int idx); static herr_t H5B_decode_keys(H5F_t *f, H5B_t *bt, int idx); static size_t H5B_nodesize(H5F_t *f, const H5B_class_t *type, @@ -137,8 +137,8 @@ static herr_t H5B_assert(H5F_t *f, haddr_t addr, const H5B_class_t *type, /* H5B inherits cache-like properties from H5AC */ static const H5AC_class_t H5AC_BT[1] = {{ H5AC_BT_ID, - (void *(*)(H5F_t*, haddr_t, const void*, void*))H5B_load, - (herr_t (*)(H5F_t*, hbool_t, haddr_t, void*))H5B_flush, + (H5AC_load_func_t)H5B_load, + (H5AC_flush_func_t)H5B_flush, }}; /* Interface initialization? */ @@ -343,10 +343,14 @@ H5B_Kvalue(H5F_t *f, const H5B_class_t *type) * Modifications: * Robb Matzke, 1999-07-28 * The ADDR argument is passed by value. + * + * Quincey Koziol, 2002-7-180 + * Added dxpl parameter to allow more control over I/O from metadata + * cache. *------------------------------------------------------------------------- */ static H5B_t * -H5B_load(H5F_t *f, haddr_t addr, const void *_type, void *udata) +H5B_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, const void *_type, void *udata) { const H5B_class_t *type = (const H5B_class_t *) _type; size_t total_nkey_size; @@ -380,7 +384,7 @@ H5B_load(H5F_t *f, haddr_t addr, const void *_type, void *udata) HGOTO_ERROR (H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed"); } - if (H5F_block_read(f, H5FD_MEM_BTREE, addr, size, H5P_DATASET_XFER_DEFAULT, bt->page)<0) { + if (H5F_block_read(f, H5FD_MEM_BTREE, addr, size, dxpl_id, bt->page)<0) { HGOTO_ERROR(H5E_BTREE, H5E_READERROR, NULL, "can't read B-tree node"); } @@ -456,10 +460,14 @@ H5B_load(H5F_t *f, haddr_t addr, const void *_type, void *udata) * * Robb Matzke, 1999-07-28 * The ADDR argument is passed by value. + * + * Quincey Koziol, 2002-7-180 + * Added dxpl parameter to allow more control over I/O from metadata + * cache. *------------------------------------------------------------------------- */ static herr_t -H5B_flush(H5F_t *f, hbool_t destroy, haddr_t addr, H5B_t *bt) +H5B_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5B_t *bt) { int i; size_t size = 0; @@ -525,7 +533,7 @@ H5B_flush(H5F_t *f, hbool_t destroy, haddr_t addr, H5B_t *bt) * bother writing data for the child entries that don't exist or * for the final unchanged children. */ - if (H5F_block_write(f, H5FD_MEM_BTREE, addr, size, H5P_DATASET_XFER_DEFAULT, bt->page)<0) { + if (H5F_block_write(f, H5FD_MEM_BTREE, addr, size, dxpl_id, bt->page)<0) { HRETURN_ERROR(H5E_BTREE, H5E_CANTFLUSH, FAIL, "unable to save B-tree node to disk"); } diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index df4b95c..023057f 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -8,11 +8,6 @@ * Purpose: This is the MPI-2 I/O driver. * * Limitations: - * H5FD_mpio_read & H5FD_mpio_write - * Eventually these should choose collective or independent i/o - * based on a parameter that is passed down to it from H5Dwrite, - * rather than the access_parms (which are fixed at the open). - * * H5FD_mpio_read * One implementation of MPI/MPI-IO causes MPI_Get_count * to return (incorrectly) a negative count. I (who?) added code @@ -21,6 +16,7 @@ * kluge is activated by #ifdef MPI_KLUGE0202. */ #include "H5private.h" /*library functions */ +#include "H5ACprivate.h" /* Metadata cache */ #include "H5Eprivate.h" /*error handling */ #include "H5Fprivate.h" /*files */ #include "H5FDprivate.h" /*file driver */ @@ -71,9 +67,9 @@ static herr_t H5FD_mpio_query(const H5FD_t *_f1, unsigned long *flags); static haddr_t H5FD_mpio_get_eoa(H5FD_t *_file); static herr_t H5FD_mpio_set_eoa(H5FD_t *_file, haddr_t addr); static haddr_t H5FD_mpio_get_eof(H5FD_t *_file); -static herr_t H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr, +static herr_t H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, void *buf); -static herr_t H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr, +static herr_t H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, const void *buf); static herr_t H5FD_mpio_flush(H5FD_t *_file, unsigned closing); @@ -1210,6 +1206,7 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t dxpl_id, haddr_t add /* Make certain we have the correct type of property list */ assert(H5I_GENPROP_LST==H5I_get_type(dxpl_id)); assert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER)); + assert(buf); /* Portably initialize MPI status variable */ HDmemset(&mpi_stat,0,sizeof(MPI_Status)); @@ -1484,6 +1481,12 @@ done: * if the first I/O was a collective I/O using MPI derived types * and the next I/O was an independent I/O. * + * Quincey Koziol - 2002/07/18 + * Added "block_before_meta_write" dataset transfer flag, which + * is set during writes from a metadata cache flush and indicates + * that all the processes must sync up before (one of them) + * writing metadata. + * *------------------------------------------------------------------------- */ static herr_t @@ -1496,8 +1499,10 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, MPI_Offset mpi_off, mpi_disp; MPI_Status mpi_stat; MPI_Datatype buf_type, file_type; + int mpi_code; /* MPI return code */ int size_i, bytes_written; unsigned use_view_this_time=0; + unsigned block_before_meta_write=0; /* Whether to block before a metadata write */ H5P_genplist_t *plist; /* Property list pointer */ herr_t ret_value=SUCCEED; @@ -1512,6 +1517,7 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, /* Make certain we have the correct type of property list */ assert(H5I_GENPROP_LST==H5I_get_type(dxpl_id)); assert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER)); + assert(buf); /* Portably initialize MPI status variable */ HDmemset(&mpi_stat,0,sizeof(MPI_Status)); @@ -1582,19 +1588,36 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "MPI_File_set_view failed"); } /* end if */ - /* Only p<round> will do the actual write if all procs in comm write same data */ - if ((type!=H5FD_MEM_DRAW) && H5_mpi_1_metawrite_g) { - if (file->mpi_rank != file->mpi_round) { + /* Metadata specific actions */ + if(type!=H5FD_MEM_DRAW) { + /* Check if we need to syncronize all processes before attempting metadata write + * (Prevents race condition where the process writing the metadata goes ahead + * and writes the metadata to the file before all the processes have + * read the data, "transmitting" data from the "future" to the reading + * process. -QAK ) + */ + if(H5P_exist_plist(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME)>0) + if(H5P_get(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME,&block_before_meta_write)<0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get H5AC property"); + + if(block_before_meta_write) + if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + + /* Only p<round> will do the actual write if all procs in comm write same metadata */ + if (H5_mpi_1_metawrite_g) { + if (file->mpi_rank != file->mpi_round) { #ifdef H5FDmpio_DEBUG - if (H5FD_mpio_Debug[(int)'w']) { - fprintf(stdout, - " proc %d: in H5FD_mpio_write (write omitted)\n", - file->mpi_rank ); - } + if (H5FD_mpio_Debug[(int)'w']) { + fprintf(stdout, + " proc %d: in H5FD_mpio_write (write omitted)\n", + file->mpi_rank ); + } #endif - HGOTO_DONE(SUCCEED) /* skip the actual write */ + HGOTO_DONE(SUCCEED) /* skip the actual write */ + } } - } + } /* end if */ /* Write the data. */ assert(H5FD_MPIO_INDEPENDENT==dx->xfer_mode || H5FD_MPIO_COLLECTIVE==dx->xfer_mode); @@ -1681,18 +1704,6 @@ done: /* Round-robin rotate to the next process */ file->mpi_round = (++file->mpi_round)%file->mpi_size; -#ifdef QAK - { - int max,min; - - MPI_Allreduce(&file->mpi_round, &max, 1, MPI_INT, MPI_MAX, file->comm); - MPI_Allreduce(&file->mpi_round, &min, 1, MPI_INT, MPI_MIN, file->comm); - if(max!=file->mpi_round) - printf("%s: rank=%d, round=%d, max=%d\n",FUNC,file->mpi_rank,file->mpi_round,max); - if(min!=file->mpi_round) - printf("%s: rank=%d, round=%d, min=%d\n",FUNC,file->mpi_rank,file->mpi_round,min); - } -#endif /* QAK */ } /* end if */ } /* end if */ diff --git a/src/H5FDmpiposix.c b/src/H5FDmpiposix.c index 82533d0..32e5535 100644 --- a/src/H5FDmpiposix.c +++ b/src/H5FDmpiposix.c @@ -32,6 +32,7 @@ * */ #include "H5private.h" /*library functions */ +#include "H5ACprivate.h" /* Metadata cache */ #include "H5Eprivate.h" /*error handling */ #include "H5Fprivate.h" /*files */ #include "H5FDprivate.h" /*file driver */ @@ -965,21 +966,31 @@ done: * * Modifications: * + * Quincey Koziol - 2002/07/18 + * Added "block_before_meta_write" dataset transfer flag, which + * is set during writes from a metadata cache flush and indicates + * that all the processes must sync up before (one of them) + * writing metadata. + * *------------------------------------------------------------------------- */ static herr_t -H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t UNUSED dxpl_id, haddr_t addr, +H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, const void *buf) { H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file; int mpi_code; /* MPI return code */ ssize_t nbytes; /* Number of bytes written each I/O call */ + H5P_genplist_t *plist; /* Property list pointer */ + unsigned block_before_meta_write=0; /* Whether to block before a metadata write */ herr_t ret_value=SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(H5FD_mpiposix_write, FAIL); assert(file); assert(H5FD_MPIPOSIX==file->pub.driver_id); + assert(H5I_GENPROP_LST==H5I_get_type(dxpl_id)); + assert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER)); assert(buf); /* Check for overflow conditions */ @@ -990,17 +1001,36 @@ H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t UNUSED dxpl_id, haddr_ if (addr+size>file->eoa) HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow"); - /* Only p<round> will do the actual write if all procs in comm write same data */ - if ((type!=H5FD_MEM_DRAW) && H5_mpiposix_1_metawrite_g) { - if (file->mpi_rank != file->mpi_round) - HGOTO_DONE(SUCCEED) /* skip the actual write */ + /* Obtain the data transfer properties */ + if(NULL == (plist = H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list"); + + /* Metadata specific actions */ + if(type!=H5FD_MEM_DRAW) { + /* Check if we need to syncronize all processes before attempting metadata write + * (Prevents race condition where the process writing the metadata goes ahead + * and writes the metadata to the file before all the processes have + * read the data, "transmitting" data from the "future" to the reading + * process. -QAK ) + */ + if(H5P_exist_plist(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME)>0) + if(H5P_get(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME,&block_before_meta_write)<0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get H5AC property"); + + if(block_before_meta_write) + if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + + /* Only p<round> will do the actual write if all procs in comm write same metadata */ + if (H5_mpiposix_1_metawrite_g) + if (file->mpi_rank != file->mpi_round) + HGOTO_DONE(SUCCEED) /* skip the actual write */ } /* end if */ /* Seek to the correct location */ if ((addr!=file->pos || OP_WRITE!=file->op) && file_seek(file->fd, (file_offset_t)addr, SEEK_SET)<0) - HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, - "unable to seek to proper position"); + HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position"); /* * Write the data, being careful of interrupted system calls and partial @@ -1039,18 +1069,6 @@ done: /* Round-robin rotate to the next process */ file->mpi_round = (++file->mpi_round)%file->mpi_size; -#ifdef QAK - { - int max,min; - - MPI_Allreduce(&file->mpi_round, &max, 1, MPI_INT, MPI_MAX, file->comm); - MPI_Allreduce(&file->mpi_round, &min, 1, MPI_INT, MPI_MIN, file->comm); - if(max!=file->mpi_round) - printf("%s: rank=%d, round=%d, max=%d\n",FUNC,file->mpi_rank,file->mpi_round,max); - if(min!=file->mpi_round) - printf("%s: rank=%d, round=%d, min=%d\n",FUNC,file->mpi_rank,file->mpi_round,min); - } -#endif /* QAK */ } /* end if */ } /* end else */ diff --git a/src/H5Gnode.c b/src/H5Gnode.c index 2acd0a4..efb38d0 100644 --- a/src/H5Gnode.c +++ b/src/H5Gnode.c @@ -46,9 +46,9 @@ static size_t H5G_node_size(H5F_t *f); static herr_t H5G_node_create(H5F_t *f, H5B_ins_t op, void *_lt_key, void *_udata, void *_rt_key, haddr_t *addr_p/*out*/); -static herr_t H5G_node_flush(H5F_t *f, hbool_t destroy, haddr_t addr, +static herr_t H5G_node_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5G_node_t *sym); -static H5G_node_t *H5G_node_load(H5F_t *f, haddr_t addr, const void *_udata1, +static H5G_node_t *H5G_node_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, const void *_udata1, void *_udata2); static int H5G_node_cmp2(H5F_t *f, void *_lt_key, void *_udata, void *_rt_key); @@ -70,7 +70,7 @@ static size_t H5G_node_sizeof_rkey(H5F_t *f, const void *_udata); const H5AC_class_t H5AC_SNODE[1] = {{ H5AC_SNODE_ID, (H5AC_load_func_t)H5G_node_load, - (herr_t (*)(H5F_t*, hbool_t, haddr_t, void*))H5G_node_flush, + (H5AC_flush_func_t)H5G_node_flush, }}; /* H5G inherits B-tree like properties from H5B */ @@ -315,10 +315,14 @@ H5G_node_create(H5F_t *f, H5B_ins_t UNUSED op, void *_lt_key, * * Robb Matzke, 1999-07-28 * The ADDR argument is passed by value. + * + * Quincey Koziol, 2002-7-180 + * Added dxpl parameter to allow more control over I/O from metadata + * cache. *------------------------------------------------------------------------- */ static herr_t -H5G_node_flush(H5F_t *f, hbool_t destroy, haddr_t addr, H5G_node_t *sym) +H5G_node_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5G_node_t *sym) { uint8_t *buf = NULL, *p = NULL; size_t size; @@ -376,7 +380,7 @@ H5G_node_flush(H5F_t *f, hbool_t destroy, haddr_t addr, H5G_node_t *sym) H5G_ent_encode_vec(f, &p, sym->entry, sym->nsyms); HDmemset(p, 0, size - (p - buf)); - status = H5F_block_write(f, H5FD_MEM_BTREE, addr, size, H5P_DATASET_XFER_DEFAULT, buf); + status = H5F_block_write(f, H5FD_MEM_BTREE, addr, size, dxpl_id, buf); if (status < 0) HRETURN_ERROR(H5E_SYM, H5E_WRITEERROR, FAIL, "unable to write symbol table node to the file"); @@ -415,10 +419,14 @@ H5G_node_flush(H5F_t *f, hbool_t destroy, haddr_t addr, H5G_node_t *sym) * Modifications: * Robb Matzke, 1999-07-28 * The ADDR argument is passed by value. + * + * Quincey Koziol, 2002-7-180 + * Added dxpl parameter to allow more control over I/O from metadata + * cache. *------------------------------------------------------------------------- */ static H5G_node_t * -H5G_node_load(H5F_t *f, haddr_t addr, const void * UNUSED _udata1, +H5G_node_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, const void * UNUSED _udata1, void * UNUSED _udata2) { H5G_node_t *sym = NULL; @@ -450,9 +458,9 @@ H5G_node_load(H5F_t *f, haddr_t addr, const void * UNUSED _udata1, HGOTO_ERROR (H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed"); } - if (H5F_block_read(f, H5FD_MEM_BTREE, addr, size, H5P_DATASET_XFER_DEFAULT, buf) < 0) { + if (H5F_block_read(f, H5FD_MEM_BTREE, addr, size, dxpl_id, buf) < 0) { HGOTO_ERROR(H5E_SYM, H5E_READERROR, NULL, - "unabel to read symbol table node"); + "unable to read symbol table node"); } /* magic */ if (HDmemcmp(p, H5G_NODE_MAGIC, H5G_NODE_SIZEOF_MAGIC)) { @@ -56,9 +56,9 @@ struct H5HG_heap_t { }; /* PRIVATE PROTOTYPES */ -static H5HG_heap_t *H5HG_load(H5F_t *f, haddr_t addr, const void *udata1, +static H5HG_heap_t *H5HG_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, const void *udata1, void *udata2); -static herr_t H5HG_flush(H5F_t *f, hbool_t dest, haddr_t addr, +static herr_t H5HG_flush(H5F_t *f, hid_t dxpl_id, hbool_t dest, haddr_t addr, H5HG_heap_t *heap); /* @@ -66,8 +66,8 @@ static herr_t H5HG_flush(H5F_t *f, hbool_t dest, haddr_t addr, */ static const H5AC_class_t H5AC_GHEAP[1] = {{ H5AC_GHEAP_ID, - (void *(*)(H5F_t*, haddr_t, const void*, void*))H5HG_load, - (herr_t (*)(H5F_t*, hbool_t, haddr_t, void*))H5HG_flush, + (H5AC_load_func_t)H5HG_load, + (H5AC_flush_func_t)H5HG_flush, }}; /* Interface initialization */ @@ -221,10 +221,14 @@ H5HG_create (H5F_t *f, size_t size) * Modifications: * Robb Matzke, 1999-07-28 * The ADDR argument is passed by value. + * + * Quincey Koziol, 2002-7-180 + * Added dxpl parameter to allow more control over I/O from metadata + * cache. *------------------------------------------------------------------------- */ static H5HG_heap_t * -H5HG_load (H5F_t *f, haddr_t addr, const void * UNUSED udata1, +H5HG_load (H5F_t *f, hid_t dxpl_id, haddr_t addr, const void * UNUSED udata1, void * UNUSED udata2) { H5HG_heap_t *heap = NULL; @@ -251,7 +255,7 @@ H5HG_load (H5F_t *f, haddr_t addr, const void * UNUSED udata1, HGOTO_ERROR (H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed"); } - if (H5F_block_read(f, H5FD_MEM_GHEAP, addr, H5HG_MINSIZE, H5P_DATASET_XFER_DEFAULT, + if (H5F_block_read(f, H5FD_MEM_GHEAP, addr, H5HG_MINSIZE, dxpl_id, heap->chunk)<0) { HGOTO_ERROR (H5E_HEAP, H5E_READERROR, NULL, "unable to read global heap collection"); @@ -288,7 +292,7 @@ H5HG_load (H5F_t *f, haddr_t addr, const void * UNUSED udata1, "memory allocation failed"); } if (H5F_block_read (f, H5FD_MEM_GHEAP, next_addr, (heap->size-H5HG_MINSIZE), - H5P_DATASET_XFER_DEFAULT, heap->chunk+H5HG_MINSIZE)<0) { + dxpl_id, heap->chunk+H5HG_MINSIZE)<0) { HGOTO_ERROR (H5E_HEAP, H5E_READERROR, NULL, "unable to read global heap collection"); } @@ -397,10 +401,14 @@ H5HG_load (H5F_t *f, haddr_t addr, const void * UNUSED udata1, * Modifications: * Robb Matzke, 1999-07-28 * The ADDR argument is passed by value. + * + * Quincey Koziol, 2002-7-180 + * Added dxpl parameter to allow more control over I/O from metadata + * cache. *------------------------------------------------------------------------- */ static herr_t -H5HG_flush (H5F_t *f, hbool_t destroy, haddr_t addr, H5HG_heap_t *heap) +H5HG_flush (H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HG_heap_t *heap) { int i; @@ -414,7 +422,7 @@ H5HG_flush (H5F_t *f, hbool_t destroy, haddr_t addr, H5HG_heap_t *heap) if (heap->dirty) { if (H5F_block_write (f, H5FD_MEM_GHEAP, addr, heap->size, - H5P_DATASET_XFER_DEFAULT, heap->chunk)<0) { + dxpl_id, heap->chunk)<0) { HRETURN_ERROR (H5E_HEAP, H5E_WRITEERROR, FAIL, "unable to write global heap collection to file"); } @@ -52,17 +52,17 @@ typedef struct H5HL_t { } H5HL_t; /* PRIVATE PROTOTYPES */ -static H5HL_t *H5HL_load(H5F_t *f, haddr_t addr, const void *udata1, +static H5HL_t *H5HL_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, const void *udata1, void *udata2); -static herr_t H5HL_flush(H5F_t *f, hbool_t dest, haddr_t addr, H5HL_t *heap); +static herr_t H5HL_flush(H5F_t *f, hid_t dxpl_id, hbool_t dest, haddr_t addr, H5HL_t *heap); /* * H5HL inherits cache-like properties from H5AC */ static const H5AC_class_t H5AC_LHEAP[1] = {{ H5AC_LHEAP_ID, - (void *(*)(H5F_t*, haddr_t, const void*, void*))H5HL_load, - (herr_t (*)(H5F_t*, hbool_t, haddr_t, void*))H5HL_flush, + (H5AC_load_func_t)H5HL_load, + (H5AC_flush_func_t)H5HL_flush, }}; /* Interface initialization */ @@ -195,10 +195,14 @@ H5HL_create(H5F_t *f, size_t size_hint, haddr_t *addr_p/*out*/) * Modifications: * Robb Matzke, 1999-07-28 * The ADDR argument is passed by value. + * + * Quincey Koziol, 2002-7-180 + * Added dxpl parameter to allow more control over I/O from metadata + * cache. *------------------------------------------------------------------------- */ static H5HL_t * -H5HL_load(H5F_t *f, haddr_t addr, const void * UNUSED udata1, +H5HL_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, const void * UNUSED udata1, void * UNUSED udata2) { uint8_t hdr[52]; @@ -217,7 +221,7 @@ H5HL_load(H5F_t *f, haddr_t addr, const void * UNUSED udata1, assert(!udata1); assert(!udata2); - if (H5F_block_read(f, H5FD_MEM_LHEAP, addr, H5HL_SIZEOF_HDR(f), H5P_DATASET_XFER_DEFAULT, + if (H5F_block_read(f, H5FD_MEM_LHEAP, addr, H5HL_SIZEOF_HDR(f), dxpl_id, hdr) < 0) { HRETURN_ERROR(H5E_HEAP, H5E_READERROR, NULL, "unable to read heap header"); @@ -258,7 +262,7 @@ H5HL_load(H5F_t *f, haddr_t addr, const void * UNUSED udata1, } if (heap->disk_alloc && H5F_block_read(f, H5FD_MEM_LHEAP, heap->addr, heap->disk_alloc, - H5P_DATASET_XFER_DEFAULT, heap->chunk + H5HL_SIZEOF_HDR(f)) < 0) { + dxpl_id, heap->chunk + H5HL_SIZEOF_HDR(f)) < 0) { HGOTO_ERROR(H5E_HEAP, H5E_CANTLOAD, NULL, "unable to read heap data"); } @@ -323,10 +327,14 @@ H5HL_load(H5F_t *f, haddr_t addr, const void * UNUSED udata1, * * Robb Matzke, 1999-07-28 * The ADDR argument is passed by value. + * + * Quincey Koziol, 2002-7-180 + * Added dxpl parameter to allow more control over I/O from metadata + * cache. *------------------------------------------------------------------------- */ static herr_t -H5HL_flush(H5F_t *f, hbool_t destroy, haddr_t addr, H5HL_t *heap) +H5HL_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5HL_t *heap) { uint8_t *p = heap->chunk; H5HL_free_t *fl = heap->freelist; @@ -394,19 +402,18 @@ H5HL_flush(H5F_t *f, hbool_t destroy, haddr_t addr, H5HL_t *heap) /* The header and data are contiguous */ if (H5F_block_write(f, H5FD_MEM_LHEAP, addr, (H5HL_SIZEOF_HDR(f)+heap->disk_alloc), - H5P_DATASET_XFER_DEFAULT, heap->chunk) < 0) { + dxpl_id, heap->chunk) < 0) { HRETURN_ERROR(H5E_HEAP, H5E_WRITEERROR, FAIL, "unable to write heap header and data to file"); } } else { if (H5F_block_write(f, H5FD_MEM_LHEAP, addr, H5HL_SIZEOF_HDR(f), - H5P_DATASET_XFER_DEFAULT, heap->chunk)<0) { + dxpl_id, heap->chunk)<0) { HRETURN_ERROR(H5E_HEAP, H5E_WRITEERROR, FAIL, "unable to write heap header to file"); } if (H5F_block_write(f, H5FD_MEM_LHEAP, heap->addr, heap->disk_alloc, - H5P_DATASET_XFER_DEFAULT, - heap->chunk + H5HL_SIZEOF_HDR(f)) < 0) { + dxpl_id, heap->chunk + H5HL_SIZEOF_HDR(f)) < 0) { HRETURN_ERROR(H5E_HEAP, H5E_WRITEERROR, FAIL, "unable to write heap data to file"); } @@ -30,8 +30,8 @@ #define PABLO_MASK H5O_mask /* PRIVATE PROTOTYPES */ -static herr_t H5O_flush(H5F_t *f, hbool_t destroy, haddr_t addr, H5O_t *oh); -static H5O_t *H5O_load(H5F_t *f, haddr_t addr, const void *_udata1, +static herr_t H5O_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5O_t *oh); +static H5O_t *H5O_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, const void *_udata1, void *_udata2); static unsigned H5O_find_in_ohdr(H5F_t *f, haddr_t addr, const H5O_class_t **type_p, int sequence); @@ -44,8 +44,8 @@ static herr_t H5O_touch_oh(H5F_t *f, H5O_t *oh, hbool_t force); /* H5O inherits cache-like properties from H5AC */ static const H5AC_class_t H5AC_OHDR[1] = {{ H5AC_OHDR_ID, - (void *(*)(H5F_t *, haddr_t, const void *, void *)) H5O_load, - (herr_t (*)(H5F_t *, hbool_t, haddr_t, void *)) H5O_flush, + (H5AC_load_func_t)H5O_load, + (H5AC_flush_func_t)H5O_flush, }}; /* Interface initialization */ @@ -339,10 +339,14 @@ H5O_close(H5G_entry_t *obj_ent) * * Robb Matzke, 1999-07-28 * The ADDR argument is passed by value. + * + * Quincey Koziol, 2002-7-180 + * Added dxpl parameter to allow more control over I/O from metadata + * cache. *------------------------------------------------------------------------- */ static H5O_t * -H5O_load(H5F_t *f, haddr_t addr, const void * UNUSED _udata1, +H5O_load(H5F_t *f, hid_t dxpl_id, haddr_t addr, const void * UNUSED _udata1, void * UNUSED _udata2) { H5O_t *oh = NULL; @@ -376,7 +380,7 @@ H5O_load(H5F_t *f, haddr_t addr, const void * UNUSED _udata1, /* read fixed-lenth part of object header */ hdr_size = H5O_SIZEOF_HDR(f); assert(hdr_size<=sizeof(buf)); - if (H5F_block_read(f, H5FD_MEM_OHDR, addr, hdr_size, H5P_DATASET_XFER_DEFAULT, buf) < 0) { + if (H5F_block_read(f, H5FD_MEM_OHDR, addr, hdr_size, dxpl_id, buf) < 0) { HGOTO_ERROR(H5E_OHDR, H5E_READERROR, NULL, "unable to read object header"); } @@ -434,7 +438,7 @@ H5O_load(H5F_t *f, haddr_t addr, const void * UNUSED _udata1, HGOTO_ERROR (H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed"); } - if (H5F_block_read(f, H5FD_MEM_OHDR, chunk_addr, chunk_size, H5P_DATASET_XFER_DEFAULT, + if (H5F_block_read(f, H5FD_MEM_OHDR, chunk_addr, chunk_size, dxpl_id, oh->chunk[chunkno].image) < 0) { HGOTO_ERROR(H5E_OHDR, H5E_READERROR, NULL, "unable to read object header data"); @@ -537,10 +541,14 @@ done: * * Robb Matzke, 1999-07-28 * The ADDR argument is passed by value. + * + * Quincey Koziol, 2002-7-180 + * Added dxpl parameter to allow more control over I/O from metadata + * cache. *------------------------------------------------------------------------- */ static herr_t -H5O_flush(H5F_t *f, hbool_t destroy, haddr_t addr, H5O_t *oh) +H5O_flush(H5F_t *f, hid_t dxpl_id, hbool_t destroy, haddr_t addr, H5O_t *oh) { uint8_t buf[16], *p; int id; @@ -586,7 +594,7 @@ H5O_flush(H5F_t *f, hbool_t destroy, haddr_t addr, H5O_t *oh) } /* end if */ else { if (H5F_block_write(f, H5FD_MEM_OHDR, addr, H5O_SIZEOF_HDR(f), - H5P_DATASET_XFER_DEFAULT, buf) < 0) { + dxpl_id, buf) < 0) { HRETURN_ERROR(H5E_OHDR, H5E_WRITEERROR, FAIL, "unable to write object header hdr to disk"); } @@ -669,7 +677,7 @@ H5O_flush(H5F_t *f, hbool_t destroy, haddr_t addr, H5O_t *oh) /* Write the combined prefix/chunk out */ if (H5F_block_write(f, H5FD_MEM_OHDR, addr, (H5O_SIZEOF_HDR(f)+oh->chunk[u].size), - H5P_DATASET_XFER_DEFAULT, p) < 0) { + dxpl_id, p) < 0) { HRETURN_ERROR(H5E_OHDR, H5E_WRITEERROR, FAIL, "unable to write object header data to disk"); } /* end if */ @@ -680,7 +688,7 @@ H5O_flush(H5F_t *f, hbool_t destroy, haddr_t addr, H5O_t *oh) else { if (H5F_block_write(f, H5FD_MEM_OHDR, oh->chunk[u].addr, (oh->chunk[u].size), - H5P_DATASET_XFER_DEFAULT, oh->chunk[u].image) < 0) { + dxpl_id, oh->chunk[u].image) < 0) { HRETURN_ERROR(H5E_OHDR, H5E_WRITEERROR, FAIL, "unable to write object header data to disk"); } /* end if */ diff --git a/src/H5private.h b/src/H5private.h index 65b37c3..765eb16 100644 --- a/src/H5private.h +++ b/src/H5private.h @@ -1181,6 +1181,7 @@ __DLL__ void H5_term_library(void); /* Functions to terminate interfaces */ __DLL__ int H5A_term_interface(void); +__DLL__ int H5AC_term_interface(void); __DLL__ int H5D_term_interface(void); __DLL__ int H5F_term_interface(void); __DLL__ int H5G_term_interface(void); |