author:    MuQun Yang <ymuqun@hdfgroup.org>  2005-08-11 18:48:09 (GMT)
committer: MuQun Yang <ymuqun@hdfgroup.org>  2005-08-11 18:48:09 (GMT)
commit:    870c5b2f66c158446b385cf67f507f1641aca1e2 (patch)
tree:      73d52b3101ef2b1a054e035710670bd5a694b48c /src/H5Dio.c
parent:    0e1b41d0fd1521784128e8637b5afa8371d2779d (diff)
[svn-r11231] Purpose:
Bug fix for collective chunk IO, phase 1. Optimization hasn't been done yet, but the collective chunk IO bug should be fixed.

Description:
In chunked storage, memory space and file space are remapped per chunk. So, to check whether file space and memory space are regular (in order to use an optimized MPI derived datatype for the collective call), one has to check per chunk instead of per hyperslab. Even a regular memory space is stored in a span tree and becomes irregular before chunk IO.

Solution:
1. Check file space and memory space per chunk instead of per hyperslab.
2. In collective IO mode, the number of chunks covered by the hyperslab may differ between processes. Since we are handling one chunk per IO, a collective call on the extra chunks of some (but not all) processes would cause the program to hang, so independent IO has to be used for those extra chunk IOs.
3. On some platforms, complex MPI derived datatypes do not work, so we have to use independent IO in collective mode when the selection is irregular. However, when the selection is regular, we do want to use collective IO, since that improves performance. Special care has to be taken for this case.

Platforms tested:
copper (AIX 5.1), Linux (heping, MPICH 1.2.6), TeraGrid machine, Cobalt (Altix), modi4

Misc. update:
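In outline, the per-chunk decision described in items 2 and 3 of the Solution works as follows. This is a minimal sketch with a hypothetical helper name and parameters; the real logic is inlined in H5D_chunk_read/H5D_chunk_write in the diff below.

    #ifdef H5_HAVE_PARALLEL
    /* Illustrative sketch (hypothetical helper): decide, for one chunk,
     * whether a process may join the collective IO call or must fall back
     * to independent IO.  count_chunk is this chunk's 1-based position in
     * the process's chunk list, min_num_chunk the minimum chunk count over
     * all processes, and all_regular the agreed result of the regularity
     * consensus. */
    static int use_collective_io(int count_chunk, int min_num_chunk,
                                 int chunk_is_regular, int all_regular)
    {
        if (count_chunk > min_num_chunk)
            return 0;   /* extra chunk on some processes only: collective would hang */
    #ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
        if (!chunk_is_regular || !all_regular)
            return 0;   /* complex derived datatypes broken: irregular selections go independent */
    #endif
        return 1;       /* regular on every process (or complex datatypes work): collective */
    }
    #endif /* H5_HAVE_PARALLEL */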
Diffstat (limited to 'src/H5Dio.c')
-rw-r--r--  src/H5Dio.c  530
1 file changed, 492 insertions(+), 38 deletions(-)
diff --git a/src/H5Dio.c b/src/H5Dio.c
index f1d12f8..fa3f49e 100644
--- a/src/H5Dio.c
+++ b/src/H5Dio.c
@@ -108,13 +108,24 @@ static htri_t
H5D_get_collective_io_consensus(const H5F_t *file,
const htri_t local_opinion,
const unsigned flags);
+
+static herr_t H5D_mpio_get_mini_chunk(const H5D_t *dset,
+ const H5S_t *mem_space,
+ const H5S_t *file_space,
+ int *min_chunkf);
+static herr_t
+H5D_obtain_duplicate_pid(hid_t dxpl_id,
+ hid_t* dp_id,
+ H5D_dxpl_cache_t **cache);
+
#endif /* H5_HAVE_PARALLEL */
/* I/O info operations */
static herr_t
H5D_ioinfo_init(H5D_t *dset, const H5D_dxpl_cache_t *dxpl_cache, hid_t dxpl_id,
- const H5S_t *mem_space, const H5S_t *file_space,
- unsigned flags, hbool_t *use_par_opt_io, H5D_io_info_t *io_info);
+ hid_t dp_dxpl_id, H5D_dxpl_cache_t *dp_dxpl_cache,
+ const H5S_t *mem_space, const H5S_t *file_space,
+ unsigned flags, hbool_t *use_par_opt_io, H5D_io_info_t *io_info);
/* Chunk operations */
static herr_t H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type,
@@ -648,8 +659,13 @@ H5D_read(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space,
const H5T_t *mem_type = NULL; /* Memory datatype */
H5D_io_info_t io_info; /* Dataset I/O info */
hbool_t use_par_opt_io=FALSE; /* Whether the 'optimized' I/O routines will be parallel */
+ H5D_dxpl_cache_t _dp_dxpl_cache; /* Data transfer property cache buffer */
+ H5D_dxpl_cache_t *dp_dxpl_cache=&_dp_dxpl_cache; /* Data transfer property cache */
+    hid_t dp_id = (-1);                 /* Duplicated data transfer property list ID */
#ifdef H5_HAVE_PARALLEL
- hbool_t xfer_mode_changed=FALSE; /* Whether the transfer mode was changed */
+ hbool_t xfer_mode_changed=FALSE; /* Whether the transfer mode was changed */
+ H5FD_mpio_xfer_t xfer_mode;
+
#ifdef H5_HAVE_INSTRUMENTED_LIBRARY
int prop_value,new_value;
htri_t check_prop;
@@ -660,6 +676,8 @@ H5D_read(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space,
unsigned sconv_flags=0; /* Flags for the space conversion */
herr_t ret_value = SUCCEED; /* Return value */
+
+
FUNC_ENTER_NOAPI_NOINIT(H5D_read)
/* check args */
@@ -682,6 +700,7 @@ H5D_read(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space,
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't fill dxpl cache")
#ifdef H5_HAVE_PARALLEL
+
/* Collective access is not permissible without a MPI based VFD */
if (dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE && !IS_H5FD_MPI(dataset->ent.file))
HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "collective access for MPI-based drivers only")
@@ -765,8 +784,15 @@ H5D_read(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space,
assert(0 && "Unhandled layout type!");
} /* end switch */
+#ifdef H5_HAVE_PARALLEL
+ /* Obtain duplicate property list id. This is used to handle
+ collective chunk IO. */
+
+ if(H5D_obtain_duplicate_pid(dxpl_id,&dp_id,&dp_dxpl_cache)<0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't obtain duplicated property id")
+#endif
/* Set up I/O operation */
- if(H5D_ioinfo_init(dataset,dxpl_cache,dxpl_id,mem_space,file_space,sconv_flags,&use_par_opt_io,&io_info)<0)
+ if(H5D_ioinfo_init(dataset,dxpl_cache,dxpl_id,dp_id,dp_dxpl_cache,mem_space,file_space,sconv_flags,&use_par_opt_io,&io_info)<0)
HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "unable to set up I/O operation")
#ifdef H5_HAVE_PARALLEL
@@ -877,8 +903,13 @@ H5D_write(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space,
const H5T_t *mem_type = NULL; /* Memory datatype */
H5D_io_info_t io_info; /* Dataset I/O info */
hbool_t use_par_opt_io=FALSE; /* Whether the 'optimized' I/O routines will be parallel */
+    hid_t dp_id = (-1);                 /* Duplicated data transfer property list ID */
+ H5D_dxpl_cache_t _dp_dxpl_cache; /* Data transfer property cache buffer */
+ H5D_dxpl_cache_t *dp_dxpl_cache=&_dp_dxpl_cache; /* Data transfer property cache */
#ifdef H5_HAVE_PARALLEL
- hbool_t xfer_mode_changed=FALSE; /* Whether the transfer mode was changed */
+ hbool_t xfer_mode_changed=FALSE; /* Whether the transfer mode was changed */
+ H5FD_mpio_xfer_t xfer_mode;
+ int mpi_rank;
#ifdef H5_HAVE_INSTRUMENTED_LIBRARY
int prop_value,new_value;
htri_t check_prop;
@@ -1013,9 +1044,12 @@ H5D_write(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space,
default:
assert(0 && "Unhandled layout type!");
} /* end switch */
-
+#ifdef H5_HAVE_PARALLEL
+ if(H5D_obtain_duplicate_pid(dxpl_id,&dp_id,&dp_dxpl_cache)<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't obtain duplicated property id")
+#endif
/* Set up I/O operation */
- if(H5D_ioinfo_init(dataset,dxpl_cache,dxpl_id,mem_space,file_space,sconv_flags,&use_par_opt_io,&io_info)<0)
+ if(H5D_ioinfo_init(dataset,dxpl_cache,dxpl_id,dp_id,dp_dxpl_cache,mem_space,file_space,sconv_flags,&use_par_opt_io,&io_info)<0)
HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "unable to set up I/O operation")
#ifdef H5_HAVE_PARALLEL
@@ -1654,6 +1688,12 @@ H5D_chunk_read(H5D_io_info_t *io_info, hsize_t nelmts,
H5D_storage_t store; /*union of EFL and chunk pointer in file space */
herr_t ret_value = SUCCEED; /*return value */
+#ifdef H5_HAVE_PARALLEL
+    int count_chunk,mpi_rank, mpi_code,min_num_chunk,is_regular,all_regular = 0;
+ hid_t temp_id;
+ MPI_Comm comm;
+#endif
+
FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_read)
/* Map elements between file and memory for each chunk*/
@@ -1679,7 +1719,13 @@ H5D_chunk_read(H5D_io_info_t *io_info, hsize_t nelmts,
/* Get first node in chunk skip list */
chunk_node=H5SL_first(fm.fsel);
-
+#ifdef H5_HAVE_PARALLEL
+ if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) {
+ if(H5D_mpio_get_mini_chunk(dataset,mem_space,file_space,&min_num_chunk)<0)
+          HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunks")
+ }
+ count_chunk = 0;
+#endif
/* Iterate through chunks to be operated on */
while(chunk_node) {
H5D_chunk_info_t *chunk_info; /* chunk information */
@@ -1691,15 +1737,148 @@ H5D_chunk_read(H5D_io_info_t *io_info, hsize_t nelmts,
store.chunk.offset = chunk_info->coords;
store.chunk.index = chunk_info->index;
- /* Perform the actual read operation */
- status = (io_info->ops.read)(io_info,
+#ifdef H5_HAVE_PARALLEL
+
+ count_chunk++;
+ if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) {
+          /* Chunks beyond the minimum number covered by every process
+             must be read with independent IO to avoid hanging */
+
+ if(count_chunk <= min_num_chunk) {
+#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+ if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE &&
+ H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE)
+ is_regular = 1;
+ else is_regular = 0;
+
+ /* Getting MPI communicator and rank */
+ if((comm = H5F_mpi_get_comm(dataset->ent.file))==MPI_COMM_NULL)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "can't retrieve MPI communicator")
+ if((mpi_rank = H5F_mpi_get_rank(dataset->ent.file))<0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "can't retrieve MPI rank")
+
+             if (MPI_SUCCESS != (mpi_code= MPI_Reduce(&is_regular,&all_regular,1,MPI_INT,MPI_MIN,0,comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Reduce failed", mpi_code)
+ if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&all_regular,1,MPI_INT,0,comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
+#endif
+ }
+
+ if(count_chunk > min_num_chunk) {
+ temp_id = io_info->dxpl_id;
+ io_info->dxpl_id = io_info->dp_dxpl_id;
+ status = (io_info->ops_sca.read)(io_info,
+ chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+ /* Check return value from optimized read */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+ io_info->dxpl_id = temp_id;
+ }
+
+
+ else if((H5S_SELECT_IS_REGULAR(chunk_info->fspace) == FALSE)||
+ (H5S_SELECT_IS_REGULAR(chunk_info->mspace) == FALSE)){
+
+#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+ /* Perform the independent read operation */
+ temp_id = io_info->dxpl_id;
+ io_info->dxpl_id = io_info->dp_dxpl_id;
+ status = (io_info->ops_sca.read)(io_info,
chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
chunk_info->fspace, chunk_info->mspace,
buf);
-
- /* Check return value from optimized read */
- if (status<0)
- HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+ /* Check return value from optimized read */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+ io_info->dxpl_id = temp_id;
+#else
+
+ /* Perform the actual collective read operation */
+ status = (io_info->ops.read)(io_info,
+ chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+ /* Check return value from optimized read */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+#endif
+ }
+
+ else {
+              /* Regular selection on this process: when
+                 H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS is not defined, an
+                 independent read must still be performed unless the
+                 selections on all processes are regular. */
+
+#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+ if(!all_regular) {
+
+ /* Perform the independent read operation */
+ temp_id = io_info->dxpl_id;
+ io_info->dxpl_id = io_info->dp_dxpl_id;
+ status = (io_info->ops_sca.read)(io_info,
+ chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+ /* Check return value from optimized read */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+ io_info->dxpl_id = temp_id;
+ }
+
+ else {
+ /* For regular collective read in parallel*/
+ /* Perform the read operation */
+ status = (io_info->ops.read)(io_info,
+ chunk_info->chunk_points,
+ H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+ /* Check return value from optimized read */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+ }
+#else
+
+ /* For regular collective read in parallel*/
+ /* Perform the read operation */
+ status = (io_info->ops.read)(io_info,
+ chunk_info->chunk_points,
+ H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+ /* Check return value from optimized read */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+#endif
+ }
+
+ }
+ else {
+           /* Independent IO in parallel: perform the read operation */
+ status = (io_info->ops.read)(io_info,
+ chunk_info->chunk_points,
+ H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+ /* Check return value from optimized read */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+ }
+
+#else
+        /* Perform the actual read operation for the sequential build */
+ status = (io_info->ops.read)(io_info,
+ chunk_info->chunk_points,
+ H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+ /* Check return value from optimized read */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+#endif
/* Get the next chunk node in the skip list */
chunk_node=H5SL_next(chunk_node);
@@ -1937,6 +2116,8 @@ done:
* Hacked on it a lot. :-)
* Leon Arber: 4/20/04
* Added support for data transforms.
+ * Kent Yang: 8/10/04
+ * Added support for collective chunk IO.
*
*-------------------------------------------------------------------------
*/
@@ -1973,6 +2154,13 @@ H5D_chunk_write(H5D_io_info_t *io_info, hsize_t nelmts,
H5D_storage_t store; /*union of EFL and chunk pointer in file space */
herr_t ret_value = SUCCEED; /*return value */
+#ifdef H5_HAVE_PARALLEL
+ hid_t temp_id;
+ int count_chunk,mpi_rank,mpi_code,min_num_chunk,is_regular,all_regular = 0;
+ MPI_Comm comm;
+
+#endif
+
FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_write)
/* Map elements between file and memory for each chunk*/
@@ -1990,6 +2178,14 @@ H5D_chunk_write(H5D_io_info_t *io_info, hsize_t nelmts,
#ifdef H5S_DEBUG
H5_timer_begin(&timer);
#endif
+
+#ifdef H5_HAVE_PARALLEL
+ if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) {
+ if(H5D_mpio_get_mini_chunk(dataset,mem_space,file_space,&min_num_chunk)<0)
+          HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunks")
+ }
+ count_chunk = 0;
+#endif
/* Get first node in chunk skip list */
chunk_node=H5SL_first(fm.fsel);
@@ -2004,15 +2200,136 @@ H5D_chunk_write(H5D_io_info_t *io_info, hsize_t nelmts,
store.chunk.offset = chunk_info->coords;
store.chunk.index = chunk_info->index;
- /* Perform the actual write operation */
- status = (io_info->ops.write)(io_info,
+#ifdef H5_HAVE_PARALLEL
+
+ count_chunk++;
+ if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) {
+          /* Chunks beyond the minimum number covered by every process
+             must be written with independent IO to avoid hanging */
+
+ if(count_chunk <= min_num_chunk) {
+#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+ if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE &&
+ H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE)
+ is_regular = 1;
+ else is_regular = 0;
+ /* Getting MPI communicator and rank */
+ if((comm = H5F_mpi_get_comm(dataset->ent.file))==MPI_COMM_NULL)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "can't retrieve MPI communicator")
+ if((mpi_rank = H5F_mpi_get_rank(dataset->ent.file))<0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "can't retrieve MPI rank")
+             if (MPI_SUCCESS != (mpi_code= MPI_Reduce(&is_regular,&all_regular,1,MPI_INT,MPI_MIN,0,comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Reduce failed", mpi_code)
+ if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&all_regular,1,MPI_INT,0,comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
+#endif
+ }
+ if(count_chunk > min_num_chunk) {
+ temp_id = io_info->dxpl_id;
+ io_info->dxpl_id = io_info->dp_dxpl_id;
+ status = (io_info->ops_sca.write)(io_info,
+ chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+ /* Check return value from optimized write */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+ io_info->dxpl_id = temp_id;
+ }
+
+
+ else if((H5S_SELECT_IS_REGULAR(chunk_info->fspace) == FALSE)||
+ (H5S_SELECT_IS_REGULAR(chunk_info->mspace) == FALSE)){
+
+#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+ /* Perform the independent write operation */
+
+ temp_id = io_info->dxpl_id;
+ io_info->dxpl_id = io_info->dp_dxpl_id;
+ status = (io_info->ops_sca.write)(io_info,
chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
chunk_info->fspace, chunk_info->mspace,
buf);
-
- /* Check return value from optimized write */
- if (status<0)
- HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+
+ /* Check return value from optimized write */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+ io_info->dxpl_id = temp_id;
+#else
+
+ /* Perform the actual collective write operation */
+ status = (io_info->ops.write)(io_info,
+ chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+ /* Check return value from optimized write */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+#endif
+ }
+
+ else {
+#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+ if(!all_regular) {
+
+ /* Perform the independent write operation */
+ temp_id = io_info->dxpl_id;
+ io_info->dxpl_id = io_info->dp_dxpl_id;
+ status = (io_info->ops_sca.write)(io_info,
+ chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+               /* Check return value from optimized write */
+               if (status<0)
+                    HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+ io_info->dxpl_id = temp_id;
+ }
+ else {
+ /* For regular selection, perform the collective write operation */
+ status = (io_info->ops.write)(io_info,
+ chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+ /* Check return value from optimized write */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+ }
+#else
+
+ status = (io_info->ops.write)(io_info,
+ chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+ /* Check return value from optimized write */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+
+#endif
+ }
+ }
+ else {
+ /* For independent parallel write*/
+ /* Perform the write operation */
+ status = (io_info->ops.write)(io_info,
+ chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+ /* Check return value from optimized write */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+ }
+
+#else
+     /* Perform the actual write operation for the sequential build */
+ status = (io_info->ops.write)(io_info,
+ chunk_info->chunk_points, H5T_get_size(dataset->shared->type),
+ chunk_info->fspace, chunk_info->mspace,
+ buf);
+ /* Check return value from optimized write */
+ if (status<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+#endif
/* Get the next chunk node in the skip list */
chunk_node=H5SL_next(chunk_node);
@@ -3270,7 +3587,16 @@ done:
*/
static herr_t
H5D_ioinfo_init(H5D_t *dset, const H5D_dxpl_cache_t *dxpl_cache, hid_t dxpl_id,
- const H5S_t
+ hid_t
+#if !(defined H5_HAVE_PARALLEL || defined H5S_DEBUG)
+ UNUSED
+#endif /* H5_HAVE_PARALLEL */
+ dp_dxpl_id, H5D_dxpl_cache_t
+
+#if !(defined H5_HAVE_PARALLEL || defined H5S_DEBUG)
+ UNUSED
+#endif /* H5_HAVE_PARALLEL */
+*dp_dxpl_cache,const H5S_t
#if !(defined H5_HAVE_PARALLEL || defined H5S_DEBUG)
UNUSED
#endif /* H5_HAVE_PARALLEL */
@@ -3320,6 +3646,9 @@ H5D_ioinfo_init(H5D_t *dset, const H5D_dxpl_cache_t *dxpl_cache, hid_t dxpl_id,
/*
* Check if we can set direct MPI-IO read/write functions
*/
+ io_info->dp_dxpl_id = dp_dxpl_id;
+ io_info->dp_dxpl_cache = dp_dxpl_cache;
+
opt=H5D_mpio_opt_possible(dset,mem_space,file_space,flags);
if(opt==FAIL)
HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE, FAIL, "invalid check for direct IO dataspace ");
@@ -3333,28 +3662,20 @@ H5D_ioinfo_init(H5D_t *dset, const H5D_dxpl_cache_t *dxpl_cache, hid_t dxpl_id,
/* Check if we can use the optimized parallel I/O routines */
if(opt==TRUE) {
/* Set the pointers to the MPI-specific routines */
- if((H5S_SELECT_IS_REGULAR(file_space) == TRUE) &&
- (H5S_SELECT_IS_REGULAR(mem_space) == TRUE)){
- io_info->ops.read = H5D_mpio_spaces_read;
- io_info->ops.write = H5D_mpio_spaces_write;
- }
-
- #ifdef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
- else {
- io_info->ops.read = H5D_mpio_spaces_span_read;
- io_info->ops.write = H5D_mpio_spaces_span_write;
- }
- #endif
- /* Indicate that the I/O will be parallel */
- *use_par_opt_io=TRUE;
- } /* end if */
+ io_info->ops.read = H5D_mpio_select_read;
+ io_info->ops.write = H5D_mpio_select_write;
+ io_info->ops_sca.read = H5D_select_read;
+ io_info->ops_sca.write = H5D_select_write;
+      /* Indicate that the I/O will be collective */
+      *use_par_opt_io=TRUE;
+    } /* end if */
else {
- /* Indicate that the I/O will _NOT_ be parallel */
+ /* Indicate that the I/O will _NOT_ be parallel, use independent IO */
*use_par_opt_io=FALSE;
io_info->ops.read = H5D_select_read;
io_info->ops.write = H5D_select_write;
-
} /* end else */
#else
io_info->ops.read = H5D_select_read;
@@ -3372,3 +3693,136 @@ done:
#endif /* H5_HAVE_PARALLEL || H5S_DEBUG */
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D_ioinfo_init() */
+
+
+#ifdef H5_HAVE_PARALLEL
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_mpio_get_mini_chunk
+ *
+ * Purpose:     Routine for obtaining the minimum number of chunks covering
+ *              the hyperslab selections of all processes.
+ *
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+
+static herr_t
+H5D_mpio_get_mini_chunk(const H5D_t *dset, const H5S_t *mem_space,
+                        const H5S_t *file_space, int *min_chunkf)
+{
+
+
+ hsize_t chunk_dim[H5O_LAYOUT_NDIMS]; /* Chunk dimensions */
+ hsize_t startf[H5S_MAX_RANK], /* Selection start bounds */
+ endf[H5S_MAX_RANK]; /* Selection end bounds */
+ unsigned dim_rankf; /* Number of dimensions of file dataspace */
+    int tnum_chunkf;                    /* Number of chunks the selection overlaps on this process */
+ unsigned dim_chunks; /* Temporary number of chunks in a dimension */
+ MPI_Comm comm; /* MPI communicator for file */
+ int mpi_rank; /* Rank in MPI communicator */
+ int mpi_code; /* MPI return code */
+ unsigned u; /* Local index variable */
+    herr_t ret_value=SUCCEED;
+
+    FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_mini_chunk)
+ /* Getting MPI communicator and rank */
+ if((comm = H5F_mpi_get_comm(dset->ent.file))==MPI_COMM_NULL)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "can't retrieve MPI communicator")
+ if((mpi_rank = H5F_mpi_get_rank(dset->ent.file))<0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "can't retrieve MPI rank")
+
+
+ dim_rankf = H5S_GET_EXTENT_NDIMS(file_space);
+
+ if(H5S_SELECT_BOUNDS(file_space,startf,endf)==FAIL)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE,FAIL, "invalid check for single selection blocks");
+
+ for(u=0; u < dset->shared->layout.u.chunk.ndims; u++)
+ chunk_dim[u] = dset->shared->layout.u.chunk.dim[u];
+
+
+ /* Compute the number of chunks covered by the selection on this process */
+ tnum_chunkf = 1;
+ for (u=0; u<dim_rankf; u++) {
+ dim_chunks = (endf[u]/chunk_dim[u]-startf[u]/chunk_dim[u])+1;
+ tnum_chunkf = dim_chunks*tnum_chunkf;
+ }
+
+    /* Determine the minimum # of chunks covered by any process */
+
+ if (MPI_SUCCESS != (mpi_code= MPI_Reduce(&tnum_chunkf,min_chunkf,1,MPI_INT,MPI_MIN,0,comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Reduce failed", mpi_code)
+
+
+    /* Broadcast the minimum number of chunks so all processes agree on it */
+ if (MPI_SUCCESS != (mpi_code= MPI_Bcast(min_chunkf,1,MPI_INT,0,comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_mpio_get_mini_chunk() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_obtain_duplicate_pid
+ *
+ * Purpose:     Routine for obtaining a duplicated data transfer property
+ *              list ID, with the transfer mode set to independent.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+
+static herr_t H5D_obtain_duplicate_pid(hid_t dxpl_id,
+ hid_t* dp_id,
+ H5D_dxpl_cache_t **cache)
+{
+
+ H5FD_mpio_xfer_t xfer_mode;
+    H5P_genplist_t *dp_dx_plist;        /* Data transfer property list */
+ herr_t ret_value=SUCCEED;
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_obtain_duplicate_pid)
+
+    if((*dp_id = H5Pcopy(dxpl_id)) < 0)
+        HGOTO_ERROR(H5E_PLIST, H5E_CANTCOPY, FAIL, "can't copy data transfer property list")
+
+ /* Get the dataset transfer property list */
+ if (NULL == (dp_dx_plist = H5I_object(*dp_id)))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list")
+
+ xfer_mode = H5FD_MPIO_INDEPENDENT;
+ if(H5P_set (dp_dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
+
+ /* Fill the DXPL cache values for later use */
+ if (H5D_get_dxpl_cache(*dp_id,cache)<0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't fill dxpl cache")
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_obtain_duplicate_pid() */
+#endif /*H5_HAVE_PARALLEL*/
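
A closing note on the communication pattern: both H5D_mpio_get_mini_chunk and the per-chunk regularity check use the same reduce-then-broadcast consensus. Below is a minimal standalone sketch of that pattern, with a hypothetical function name and plain MPI calls instead of the HDF5 error macros.

    #include <mpi.h>

    /* Minimal sketch: agree on the smallest local value across all ranks.
     * Rank 0 computes the minimum, then broadcasts it so that every
     * process takes the same branch afterwards. */
    static int consensus_min(int local_value, MPI_Comm comm)
    {
        int global_min = 0;

        MPI_Reduce(&local_value, &global_min, 1, MPI_INT, MPI_MIN, 0, comm);
        MPI_Bcast(&global_min, 1, MPI_INT, 0, comm);
        return global_min;
    }

A single MPI_Allreduce(&local_value, &global_min, 1, MPI_INT, MPI_MIN, comm) would achieve the same result in one call; the patch keeps the two-step reduce/broadcast form.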