[svn-r12553] This check-in includes the following parts of the parallel optimization code:

1. Provide another option for users to do independent IO with MPI file setview (collectively). 2. When users request collective IO, use independent IO with MPI file setview if we find that collective IO is not good for the application in the IO-per-chunk (multi-chunk IO) case. Previously we used pure independent IO, which actually performed small IOs (one IO per row) in this case. A recent performance study suggested that independent IO with file setview can achieve significantly better performance than collective IO when not many processes participate in the IO. 3. For applications that explicitly choose to do collective IO per chunk, the library won't do any optimization (gather/broadcast) operations; it simply passes the collective IO request to MPI-IO. Tested at copper, kagiso, heping, mir and tungsten (cmpi and mpich). Kagiso is using LAM; even the t_mpi test was broken there. The cchunk10 test failed at heping and mir; I suspect it is an MPICH problem and will investigate later. Everything passed at copper. At tungsten, the old cmpi bug (failure at esetw) is still there; other tests passed. Some sequential fheap tests failed at kagiso.
author: MuQun Yang <ymuqun@hdfgroup.org> 2006-08-09 03:00:11 (GMT)
committer: MuQun Yang <ymuqun@hdfgroup.org> 2006-08-09 03:00:11 (GMT)
commit: 6916816a563532fddc3699a6d5e4adb57212968d (patch)
tree: 70121257e539ec369455ebd43119873fd96c7489 /src
parent: d17d42acd0fbba4b3433937f448c99930553b038 (diff)
download: hdf5-6916816a563532fddc3699a6d5e4adb57212968d.zip
hdf5-6916816a563532fddc3699a6d5e4adb57212968d.tar.gz
hdf5-6916816a563532fddc3699a6d5e4adb57212968d.tar.bz2
8 files changed, 544 insertions, 226 deletions
diff --git a/src/H5D.c b/src/H5D.c
index 073b74d..850c4ea 100644
--- a/src/H5D.c
+++ b/src/H5D.c
@@ -196,8 +196,10 @@ H5D_init_interface(void)
     void            *def_vfl_info            = H5D_XFER_VFL_INFO_DEF;
     size_t          def_hyp_vec_size         = H5D_XFER_HYPER_VECTOR_SIZE_DEF;
 #ifdef H5_HAVE_PARALLEL
-    H5FD_mpio_xfer_t def_io_xfer_mode             = H5D_XFER_IO_XFER_MODE_DEF;
+    H5FD_mpio_xfer_t def_io_xfer_mode          = H5D_XFER_IO_XFER_MODE_DEF;
+    H5FD_mpio_collective_opt_t def_io_xfer_opt_mode  = H5D_XFER_IO_XFER_OPT_MODE_DEF;
     H5FD_mpio_chunk_opt_t def_mpio_chunk_opt_mode = H5D_XFER_MPIO_CHUNK_OPT_HARD_DEF;
+    H5FD_mpio_collective_opt_t def_mpio_collective_opt_mode = H5D_XFER_MPIO_COLLECTIVE_OPT_DEF;
     unsigned def_mpio_chunk_opt_num               = H5D_XFER_MPIO_CHUNK_OPT_NUM_DEF;
     unsigned def_mpio_chunk_opt_ratio             = H5D_XFER_MPIO_CHUNK_OPT_RATIO_DEF;
 #endif /* H5_HAVE_PARALLEL */
@@ -303,6 +305,10 @@ H5D_init_interface(void)
         /* Register the I/O transfer mode property */
         if(H5P_register(xfer_pclass,H5D_XFER_IO_XFER_MODE_NAME,H5D_XFER_IO_XFER_MODE_SIZE,&def_io_xfer_mode,NULL,NULL,NULL,NULL,NULL,NULL,NULL)<0)
             HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
+        if(H5P_register(xfer_pclass,H5D_XFER_IO_XFER_OPT_MODE_NAME,H5D_XFER_IO_XFER_OPT_MODE_SIZE,&def_io_xfer_opt_mode,NULL,NULL,NULL,NULL,NULL,NULL,NULL)<0)
+            HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
+        if(H5P_register(xfer_pclass,H5D_XFER_MPIO_COLLECTIVE_OPT_NAME,H5D_XFER_MPIO_COLLECTIVE_OPT_SIZE,&def_mpio_collective_opt_mode,NULL,NULL,NULL,NULL,NULL,NULL,NULL)<0)
+            HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
 	if(H5P_register(xfer_pclass,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME,H5D_XFER_MPIO_CHUNK_OPT_HARD_SIZE,&def_mpio_chunk_opt_mode,NULL,NULL,NULL,NULL,NULL,NULL,NULL)<0)
             HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
         if(H5P_register(xfer_pclass,H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME,H5D_XFER_MPIO_CHUNK_OPT_NUM_SIZE,&def_mpio_chunk_opt_num,NULL,NULL,NULL,NULL,NULL,NULL,NULL)<0)
diff --git a/src/H5Dio.c b/src/H5Dio.c
index 6675bc5..28bc617 100644
--- a/src/H5Dio.c
+++ b/src/H5Dio.c
@@ -358,8 +358,10 @@ H5D_get_dxpl_cache_real(hid_t dxpl_id, H5D_dxpl_cache_t *cache)
 
 #ifdef H5_HAVE_PARALLEL
     /* Collect Parallel I/O information for possible later use */
-    if(H5P_get(dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &cache->xfer_mode)<0)
+     if(H5P_get(dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &cache->xfer_mode)<0)
         HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve parallel transfer method")
+     if(H5P_get(dx_plist, H5D_XFER_IO_XFER_OPT_MODE_NAME, &cache->xfer_opt_mode)<0)
+       HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve parallel transfer method")
 #endif /*H5_HAVE_PARALLEL*/
 
     /* Get error detection properties */
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index ead1777..4db79ec 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -27,6 +27,8 @@
 
 #define H5D_PACKAGE		/*suppress error about including H5Dpkg	  */
 /*#define KENT */
+/*#define CC_PERF*/
+
 
 
 /***********/
@@ -64,13 +66,10 @@
 
 /* Macros to represent options on how to obtain chunk address for one linked-chunk IO case */
 #define H5D_OBTAIN_ONE_CHUNK_ADDR_IND 0
-#define H5D_OBTAIN_ALL_CHUNK_ADDR_IND 1
 #define H5D_OBTAIN_ALL_CHUNK_ADDR_COL 2
 
 /* Macros to define the default ratio of obtaining all chunk addresses for one linked-chunk IO case */
-#define H5D_ALL_CHUNK_ADDR_THRES_IND  10
-#define H5D_ALL_CHUNK_ADDR_THRES_IND_NUM 4
-#define H5D_ALL_CHUNK_ADDR_THRES_COL  20
+#define H5D_ALL_CHUNK_ADDR_THRES_COL  30
 #define H5D_ALL_CHUNK_ADDR_THRES_COL_NUM 10000
 
 /***** Macros for multi-chunk collective IO case. *****/
@@ -104,29 +103,32 @@ typedef struct H5D_common_coll_info_t {
   size_t  mpi_buf_count;
   haddr_t chunk_addr;
 } H5D_common_coll_info_t;
-
+  
 
 /********************/
 /* Local Prototypes */
 /********************/
 
-static herr_t
-H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
+static herr_t 
+H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, 
 			      hbool_t do_write);
+static herr_t
+H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
+                              hbool_t do_write);
 
 static herr_t
-H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
+H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, 
 			     hbool_t do_write,int sum_chunk);
 
-static herr_t
+static herr_t 
 H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space,
-			const H5S_t *mem_space,haddr_t addr,
+			const H5S_t *mem_space,haddr_t addr, 
 		        const void *buf, hbool_t do_write );
 
-static herr_t
+static herr_t 
 H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,
 			 MPI_Datatype *mpi_buf_type,
-			 H5D_common_coll_info_t* coll_info,
+			 H5D_common_coll_info_t* coll_info, 
 			 const void *buf, hbool_t do_write);
 #ifdef OLD_WAY
 static herr_t
@@ -134,24 +136,25 @@ H5D_pre_sort_chunk(H5D_io_info_t *io_info,int total_chunks,
 		   haddr_t total_chunk_addr_array[]);
 #endif
 
-static herr_t
+static herr_t 
 H5D_sort_chunk(H5D_io_info_t * io_info,
 	       fm_map *fm,
 	       H5D_chunk_addr_info_t chunk_addr_info_array[],
 	       int many_chunk_opt);
 
-static herr_t
-H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
+static herr_t 
+H5D_obtain_mpio_mode(H5D_io_info_t* io_info, 
 		     fm_map *fm,
 		     uint8_t assign_io_mode[],
 		     haddr_t chunk_addr[]);
 
 static herr_t H5D_ioinfo_make_ind(H5D_io_info_t *io_info);
+static herr_t H5D_ioinfo_make_coll_opt(H5D_io_info_t *io_info);
 static herr_t H5D_ioinfo_make_coll(H5D_io_info_t *io_info);
 static herr_t H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info,
     const fm_map *fm, int *min_chunkf);
 static int H5D_cmp_chunk_addr(const void *addr1, const void *addr2);
-static herr_t
+static herr_t 
 H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info,
 		       const fm_map *fm, int *sum_chunkf);
 
@@ -284,10 +287,10 @@ done:
  * Decription:  If H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS is not defined,
                    collective IO with no contribution from one or more
                    processes are not assured. We will check the minimum
-                   number of chunks the process is used. If the number is
+                   number of chunks the process is used. If the number is 
                    zero, we will use independent IO mode instead.
                 This is necessary with Linked chunk IO.
- * Purpose:	Checks if it is possible to do collective IO
+ * Purpose:	Checks if it is possible to do collective IO 
  *
  * Return:	Success:        Non-negative: TRUE or FALSE
  *		Failure:	Negative
@@ -307,8 +310,8 @@ H5D_mpio_chunk_adjust_iomode(H5D_io_info_t *io_info, const fm_map *fm) {
 
 #ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS
     if(H5D_mpio_get_min_chunk(io_info,fm,&min_chunk)<0)
-         HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes");
-    if(min_chunk == 0) {
+         HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes"); 
+    if(min_chunk == 0) {                
        H5P_genplist_t *dx_plist;           /* Data transer property list */
 
        /* Get the dataset transfer property list */
@@ -347,11 +350,11 @@ done:
  */
 herr_t
 H5D_mpio_select_read(H5D_io_info_t *io_info,
-                     size_t mpi_buf_count,
+                     size_t mpi_buf_count, 
                      const size_t UNUSED elmt_size,
-		     const H5S_t UNUSED *file_space,
+		     const H5S_t UNUSED *file_space, 
 		     const H5S_t UNUSED *mem_space,
-		     haddr_t addr,
+		     haddr_t addr,		     
 		     void *buf/*out*/)
 {
     herr_t ret_value = SUCCEED;
@@ -378,9 +381,9 @@ done:
  */
 herr_t
 H5D_mpio_select_write(H5D_io_info_t *io_info,
-		      size_t mpi_buf_count,
+		      size_t mpi_buf_count, 
 		      const size_t UNUSED elmt_size,
-		      const H5S_t UNUSED *file_space,
+		      const H5S_t UNUSED *file_space, 
 		      const H5S_t UNUSED *mem_space,
 		      haddr_t addr,
 		      const void *buf)
@@ -444,6 +447,51 @@ H5D_ioinfo_make_ind(H5D_io_info_t *io_info)
 done:
     FUNC_LEAVE_NOAPI(ret_value)
 } /* end H5D_ioinfo_make_ind() */
+
+/*-------------------------------------------------------------------------
+ * Function:	H5D_ioinfo_make_coll_opt
+ *
+ * Purpose:	Switch to MPI independent I/O with file set view by setting
+ *		the xfer_opt_mode property to H5FD_MPIO_INDIVIDUAL_IO.
+ * Return:	Non-negative on success/Negative on failure
+ *
+ * Programmer:	Quincey Koziol
+ *		Friday, August 12, 2005
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D_ioinfo_make_coll_opt(H5D_io_info_t *io_info)
+{
+    H5P_genplist_t *dx_plist;           /* Data transfer property list */
+    herr_t	ret_value = SUCCEED;	/* Return value */
+
+    FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_coll_opt)
+
+    /* Get the dataset transfer property list */
+    if (NULL == (dx_plist = H5I_object(io_info->dxpl_id)))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list")
+
+    /* Set xfer_opt_mode (not xfer_mode) to individual I/O, both in the
+     * cached DXPL values and in the property list itself.
+     */
+    io_info->dxpl_cache->xfer_opt_mode = H5FD_MPIO_INDIVIDUAL_IO;
+    if(H5P_set (dx_plist, H5D_XFER_IO_XFER_OPT_MODE_NAME, &io_info->dxpl_cache->xfer_opt_mode) < 0)
+        HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
+
+    /* Set the pointers to the MPI-specific select routines */
+    io_info->ops.read = H5D_mpio_select_read;
+    io_info->ops.write = H5D_mpio_select_write;
+
+    /* Record that xfer_opt_mode was changed here; presumably the caller
+     * uses this flag to restore it before returning to user - confirm.
+     */
+    io_info->xfer_opt_mode_changed = TRUE;
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_ioinfo_make_coll_opt() */
+
 
 
 /*-------------------------------------------------------------------------
@@ -477,6 +525,11 @@ H5D_ioinfo_make_coll(H5D_io_info_t *io_info)
     if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0)
         HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
 
+    io_info->dxpl_cache->xfer_opt_mode = H5FD_MPIO_COLLECTIVE_IO;
+    if(H5P_set (dx_plist, H5D_XFER_IO_XFER_OPT_MODE_NAME, &io_info->dxpl_cache->xfer_opt_mode) < 0)
+        HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
+
+
     /* Set the pointers to the MPI-specific routines */
     io_info->ops.read = H5D_mpio_select_read;
     io_info->ops.write = H5D_mpio_select_write;
@@ -485,6 +538,7 @@ H5D_ioinfo_make_coll(H5D_io_info_t *io_info)
      * to user.
      */
     io_info->xfer_mode_changed=FALSE;
+    io_info->xfer_opt_mode_changed=FALSE;
 
 done:
     FUNC_LEAVE_NOAPI(ret_value)
@@ -556,7 +610,7 @@ H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info,
     printf("num_chunkf = %d\n",num_chunkf);
 #endif
 
-    /* Determine the minimum # of chunks for all processes */
+    /* Determine the summation of number of chunks for all processes */
     if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, sum_chunkf, 1, MPI_INT, MPI_SUM, io_info->comm)))
         HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
 
@@ -569,8 +623,8 @@ done:
  * Function:	H5D_contig_collective_io
  *
  * Purpose:	Wrapper Routine for H5D_inter_collective_io
-                The starting address of contiguous storage is passed
- *
+                The starting address of contiguous storage is passed 
+ *               
  *
  * Return:	Non-negative on success/Negative on failure
  *
@@ -581,11 +635,11 @@ done:
  *-------------------------------------------------------------------------
  */
 herr_t
-H5D_contig_collective_io(H5D_io_info_t *io_info,
+H5D_contig_collective_io(H5D_io_info_t *io_info, 
 			 const H5S_t *file_space,
 			 const H5S_t *mem_space,
 			 const void *buf,
-			 hbool_t do_write)
+			 hbool_t do_write) 
 {
 
 
@@ -607,8 +661,8 @@ H5D_contig_collective_io(H5D_io_info_t *io_info,
 #endif
     if(H5D_inter_collective_io(io_info,file_space,mem_space,addr,buf,do_write)<0)
 	HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
-
- done:
+      
+ done: 
 
     FUNC_LEAVE_NOAPI(ret_value)
 } /* end H5D_contig_collective_io */
@@ -616,10 +670,10 @@ H5D_contig_collective_io(H5D_io_info_t *io_info,
 /*-------------------------------------------------------------------------
  * Function:	H5D_chunk_collective_io
  *
- * Purpose:	Routine for
-                1) choose an IO option:
+ * Purpose:	Routine for 
+                1) choose an IO option: 
 		      a) One collective IO defined by one MPI derived datatype to link through all chunks
-		or    b) multiple chunk IOs,to do MPI-IO for each chunk, the IO mode may be adjusted
+		or    b) multiple chunk IOs,to do MPI-IO for each chunk, the IO mode may be adjusted 
                          due to the selection pattern for each chunk.
  *              For option a)
 			1. Sort the chunk address, obtain chunk info according to the sorted chunk address
@@ -633,7 +687,7 @@ H5D_contig_collective_io(H5D_io_info_t *io_info,
                         2. Depending on whether the IO mode is collective or independent or none,
                            Create either MPI derived datatype for each chunk to do collective IO or just do independent IO
                         3. Set up collective IO property list for collective mode
-                        4. DO IO
+                        4. DO IO               
  *
  * Return:	Non-negative on success/Negative on failure
  *
@@ -643,44 +697,46 @@ H5D_contig_collective_io(H5D_io_info_t *io_info,
  *
  *-------------------------------------------------------------------------
  */
-herr_t
-H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool_t do_write)
+herr_t 
+H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool_t do_write) 
 {
 
     int               io_option = H5D_MULTI_CHUNK_IO_MORE_OPT;
     int               sum_chunk = 0,mpi_size;
     unsigned          one_link_chunk_io_threshold;
-    H5P_genplist_t    *plist;
+    H5P_genplist_t    *plist; 
     H5FD_mpio_chunk_opt_t chunk_opt_mode;
 #ifdef H5_HAVE_INSTRUMENTED_LIBRARY
     htri_t            check_prop,temp_not_link_io = FALSE;
     int               prop_value,new_value;
 #endif
-    herr_t            ret_value = SUCCEED;
+    herr_t            ret_value = SUCCEED;    
 
 
     FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_collective_io)
 
     assert (IS_H5FD_MPIO(io_info->dset->oloc.file));
-
+    
     /* Obtain the data transfer properties */
     if(NULL == (plist = H5I_object(io_info->dxpl_id)))
         HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
-
+    
     /* Check the optional property list on what to do with collective chunk IO. */
     chunk_opt_mode=(H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME);
 #ifdef KENT
     printf("chunk_opt_mode = %d\n",chunk_opt_mode);
 #endif
-
-    if(chunk_opt_mode == H5FD_MPIO_CHUNK_ONE_IO) io_option = H5D_ONE_LINK_CHUNK_IO;/*no opt*/
+    
+    if(chunk_opt_mode == H5FD_MPIO_CHUNK_ONE_IO) {
+        io_option = H5D_ONE_LINK_CHUNK_IO;/*no opt*/
+    }
     else if(chunk_opt_mode == H5FD_MPIO_CHUNK_MULTI_IO) io_option = H5D_MULTI_CHUNK_IO;/*no opt */
     else {
-       if(H5D_mpio_get_sum_chunk(io_info,fm,&sum_chunk)<0)
-  	       HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes");
+       if(H5D_mpio_get_sum_chunk(io_info,fm,&sum_chunk)<0)   
+  	       HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes"); 
        if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
 	 HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
-
+    
        if(NULL == (plist = H5I_object(io_info->dxpl_id)))
          HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
 
@@ -732,8 +788,8 @@ H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool
 	          HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to get property value");
               }
         }
-
-
+       
+              
 #endif
 #ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
     if(io_option == H5D_ONE_LINK_CHUNK_IO ) io_option = H5D_MULTI_CHUNK_IO ;/* We can not do this with one chunk IO. */
@@ -745,9 +801,15 @@ H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool
       if(H5D_link_chunk_collective_io(io_info,fm,buf,do_write,sum_chunk)<0)
 	HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish linked chunk MPI-IO");
     }
+      
+    else if(io_option == H5D_MULTI_CHUNK_IO) {
+      if(H5D_multi_chunk_collective_io_no_opt(io_info,fm,buf,do_write)<0)
+        HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish multiple chunk MPI-IO");
+    }
 
+      
     else { /*multiple chunk IOs without opt */
-
+    
       if(H5D_multi_chunk_collective_io(io_info,fm,buf,do_write)<0)
         HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish multiple chunk MPI-IO");
 
@@ -764,7 +826,7 @@ done:
 			1. Sort the chunk address and chunk info
                         2. Build up MPI derived datatype for each chunk
                         3. Build up the final MPI derived datatype
-			4. Use common collective IO routine to do MPI-IO
+			4. Use common collective IO routine to do MPI-IO 
 
  *
  * Return:	Non-negative on success/Negative on failure
@@ -788,9 +850,9 @@ H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
       size_t           mpi_buf_count;
       size_t           mpi_file_count;
       hbool_t	       mbt_is_derived=0,      /* Whether the buffer (memory) type is derived and needs to be free'd */
-		       mft_is_derived=0;      /* Whether the file type is derived and needs to be free'd */
-
-      int              mpi_size,mpi_code;              /* MPI return code */
+		       mft_is_derived=0;      /* Whether the file type is derived and needs to be free'd */  
+     
+      int              mpi_size,mpi_code;              /* MPI return code */ 
 
       int               i,num_chunk=0,total_chunks;
       size_t            ori_num_chunk;
@@ -814,12 +876,8 @@ H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
       H5D_common_coll_info_t coll_info;
       H5D_chunk_addr_info_t*  chunk_addr_info_array=NULL;
 
-#ifdef CC_PERF
-      char *bc_percent = NULL;
-      char *bcc_percent = NULL;
-#endif
-      herr_t            ret_value = SUCCEED;
-
+      herr_t            ret_value = SUCCEED;    
+      
       FUNC_ENTER_NOAPI_NOINIT(H5D_link_chunk_collective_io)
       ori_total_chunks = fm->total_chunks;
       H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int);
@@ -829,7 +887,7 @@ H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
         H5SL_node_t *chunk_node;
         H5D_chunk_info_t *chunk_info;
         H5D_storage_t  store;
-
+ 
         chunk_node = H5SL_first(fm->fsel);
 	if(chunk_node == NULL) {
 	  if(H5D_istore_chunkmap(io_info,total_chunks,&chunk_base_addr,fm->down_chunks)<0)
@@ -846,7 +904,7 @@ H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
 
 	  if(HADDR_UNDEF==(chunk_base_addr = H5D_istore_get_addr(io_info,NULL)))
 	    HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
-
+	
 #ifdef KENT
 printf("before inter_collective_io for total chunk = 1 \n");
 #endif
@@ -864,7 +922,7 @@ printf("before inter_collective_io for total chunk = 1 \n");
 printf("total_chunks = %d\n",(int)total_chunks);
 #endif
 
-
+         
       if(num_chunk == 0) total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks);
       else
       {
@@ -882,29 +940,14 @@ printf("total_chunks = %d\n",(int)total_chunks);
 	HGOTO_ERROR(H5E_DATATYPE, H5E_BADSIZE, FAIL, "datatype size invalid");
       dst_type_size = src_type_size;
 
-
-#ifdef CC_PERF
-      /* "bcc" means 'b-tree iterately obtain all chunk addresses collectively',
-	 "bc" means 'b-tree iterately obtain all chunk addresses individually',
-          the default one means 'obtaining the chunk address individually',
-      */
-
-      if(bcc_percent=getenv("BCC_PERCENT")){
-         bsearch_coll_chunk_threshold  = atoi(bcc_percent);
-         assert((bsearch_coll_chunk_threshold >=0) &&(bsearch_coll_chunk_threshold <=100));
-      }
-      else
-         bsearch_coll_chunk_threshold  = H5D_ALL_CHUNK_ADDR_THRES_COL;
-#else
-      bsearch_coll_chunk_threshold  = H5D_ALL_CHUNK_ADDR_THRES_COL; /*This number may be changed according to the performance study */
-#endif
+      bsearch_coll_chunk_threshold  = H5D_ALL_CHUNK_ADDR_THRES_COL;
 
       if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
 	 HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
 
-      /* Calculate the actual threshold to obtain all chunk addresses collectively
+      /* Calculate the actual threshold to obtain all chunk addresses collectively 
          The bigger this number is, the more possible the use of obtaining chunk address collectively. */
-      /* For non-optimization one-link IO,
+      /* For non-optimization one-link IO, 
          actual bsearch threshold is always 0,
          we would always want to obtain the chunk addresses individually
          for each process. */
@@ -914,53 +957,17 @@ printf("total_chunks = %d\n",(int)total_chunks);
          &&(sum_chunk/mpi_size >= H5D_ALL_CHUNK_ADDR_THRES_COL_NUM))
 	many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_COL;
 
-      else {
-
-#ifdef CC_PERF
-	if(bc_percent=getenv("BC_PERCENT")){
-	  bsearch_chunk_ratio  = atoi(bc_percent);
-	  assert((bsearch_chunk_ratio<=100)&&(bsearch_chunk_ratio>=0));
-	}
-	else
-	  bsearch_chunk_ratio  = H5D_ALL_CHUNK_ADDR_THRES_IND;
-#else
-         bsearch_chunk_ratio = H5D_ALL_CHUNK_ADDR_THRES_IND; /*This number may be changed according to the performance study */
-#endif
-
-	 /* This threshold is to check whether we can use iterator to obtain all chunk addresses.
-	    The unit of the threshold is the number of chunks. The value should be at least 1.
-            It can be calculated as follows:
-
-	    if(total_chunks*bsearch_chunk_ratio/100 <=1)
-	      bsearch_chunk_threahold = 1;
-            else
-	      bsearch_chunk_threshold = total_chunks*bsearch_chunk_ratio/100;
-	    In order to make the caluculation more efficient,
-	    we use the following approximate formula to calculate the threshold.
-
-	    bsearch_chunk_threshold = 1+ (total_chunks*bsearch_chunk_ratio-99)/100;
-
-	    The only difference is when total_chunks* besearch_chunk_ratio == 100n+99;
-            the approximate formula will give value (n+1) instead of n for threshold.
-	    That shouldn't matter much from our persective.
-	 */
-
-        bsearch_chunk_threshold = 1 +(total_chunks*bsearch_chunk_ratio-99)/100;
-	if(num_chunk > bsearch_chunk_threshold) many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_IND;
-        if((sum_chunk == 0) && (total_chunks >= H5D_ALL_CHUNK_ADDR_THRES_IND_NUM))
-          many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_IND;
-      }
 #ifdef KENT
 printf("before sorting the chunk address \n");
 #endif
-      /* Sort the chunk address
+      /* Sort the chunk address 
          when chunk optimization selection is either H5D_OBTAIN_*/
       if(num_chunk == 0){ /* special case: this process doesn't select anything */
          if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array,fm->down_chunks)<0)
              HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
          chunk_base_addr = total_chunk_addr_array[0];
       }
-
+ 
       else {
          if(H5D_sort_chunk(io_info,fm,chunk_addr_info_array,many_chunk_opt)<0)
         	 HGOTO_ERROR (H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to sort chunk address");
@@ -969,8 +976,8 @@ printf("before sorting the chunk address \n");
 #ifdef KENT
 printf("after sorting the chunk address \n");
 #endif
-
-      /* Obtain MPI derived datatype from all individual chunks */
+      
+      /* Obtain MPI derived datatype from all individual chunks */ 
       for ( i = 0; i < num_chunk; i++) {
 	  /* Disk MPI derived datatype */
           if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.fspace,src_type_size,&chunk_ftype[i],
@@ -981,7 +988,7 @@ printf("after sorting the chunk address \n");
           if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.mspace,dst_type_size,&chunk_mtype[i],
                                        &mpi_buf_count,&mpi_buf_extra_offset,&mbt_is_derived)<0)
 	       	HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buf type");
-
+           
           /* Chunk address relative to the first chunk */
 	  chunk_addr_info_array[i].chunk_addr -= chunk_base_addr;
           H5_ASSIGN_OVERFLOW(chunk_disp_array[i],chunk_addr_info_array[i].chunk_addr,haddr_t,MPI_Aint);
@@ -989,7 +996,7 @@ printf("after sorting the chunk address \n");
 
       blocklen_value = 1;
       if(num_chunk){
-
+	
 	/* initialize the buffer with the constant value 1 */
 	H5V_array_fill(blocklen,&blocklen_value,sizeof(int),(size_t)num_chunk);
 
@@ -1022,7 +1029,7 @@ printf("after sorting the chunk address \n");
       else {/* no selection at all for this process */
 	chunk_final_ftype = MPI_BYTE;
 	chunk_final_mtype = MPI_BYTE;
-
+	
 	/* buffer, file derived datatypes should be true */
 	coll_info.mbt_is_derived = 0;
 	coll_info.mft_is_derived = 0;
@@ -1032,7 +1039,7 @@ printf("after sorting the chunk address \n");
 #ifdef KENT
 printf("before coming to final collective IO\n");
 #endif
-
+      
       if(H5D_final_collective_io(io_info,&chunk_final_ftype,&chunk_final_mtype,&coll_info,buf,do_write)<0)
 	HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish MPI-IO");
 
@@ -1042,7 +1049,7 @@ printf("before freeing memory inside  H5D_link_collective_io ret_value = %d\n",r
 #endif
 
 
-
+      
      if (fm->total_chunks != 1) {
        if(num_chunk == 0) HDfree(total_chunk_addr_array);
        else {
@@ -1072,7 +1079,7 @@ printf("before leaving H5D_link_collective_io ret_value = %d\n",ret_value);
                 1. Use MPI_gather and MPI_Bcast to obtain IO mode in each chunk(collective/independent/none)
                 2. Depending on whether the IO mode is collective or independent or none,
                    Create either MPI derived datatype for each chunk or just do independent IO
-                3. Use common collective IO routine to do MPI-IO
+                3. Use common collective IO routine to do MPI-IO               
  *
  * Return:	Non-negative on success/Negative on failure
  *
@@ -1082,8 +1089,8 @@ printf("before leaving H5D_link_collective_io ret_value = %d\n",ret_value);
  *
  *-------------------------------------------------------------------------
  */
-static herr_t
-H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool_t do_write)
+static herr_t 
+H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool_t do_write) 
 {
 
       int               i,total_chunk;
@@ -1096,7 +1103,7 @@ H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
       H5D_storage_t     store;                /* union of EFL and chunk pointer in file space */
       hbool_t           select_chunk;
       hbool_t 	        last_io_mode_coll = TRUE;
-      herr_t            ret_value = SUCCEED;
+      herr_t            ret_value = SUCCEED;    
 #ifdef KENT
       int mpi_rank;
 #endif
@@ -1117,7 +1124,7 @@ H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
 #endif
 
       /* obtain IO option for each chunk */
-      if(H5D_obtain_mpio_mode(io_info,fm,chunk_io_option,chunk_addr)<0)
+      if(H5D_obtain_mpio_mode(io_info,fm,chunk_io_option,chunk_addr)<0) 
 	HGOTO_ERROR (H5E_DATASET, H5E_CANTRECV, FAIL, "unable to obtain MPIO mode");
 
       for( i = 0; i<total_chunk;i++){
@@ -1140,7 +1147,7 @@ printf("mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
           if(NULL ==(chunk_node = H5SL_first(fm->fsel)))
 	    HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list");
 #else
-
+	  
 	   if(NULL ==(chunk_node = H5SL_first(fm->fsel)))
 	    HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list");
 	    while(chunk_node){
@@ -1161,30 +1168,30 @@ printf("mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
 #endif
 	}
 
-        if(chunk_io_option[i] == 1){ /*collective IO for this chunk,
+        if(chunk_io_option[i] == 1){ /*collective IO for this chunk, 
 				       note: even there is no selection for this process,
                                              the process still needs to contribute MPI NONE TYPE.*/
 #ifdef KENT
 printf("inside collective chunk IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
 #endif
-
+	
 	  if(!last_io_mode_coll)
 	  /* Switch back to collective I/O */
               if(H5D_ioinfo_make_coll(io_info) < 0)
                  HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
-
+          
 	    if(select_chunk){
 	      if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,
 			             chunk_addr[i],buf,do_write )<0)
 	        HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
-
+	       
 	    }
 	    else{
 	     if(H5D_inter_collective_io(io_info,NULL,NULL,
 			             chunk_addr[i],buf,do_write )<0)
 	        HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
-
-	    }
+	       
+	    } 
               last_io_mode_coll = TRUE;
 
 	}
@@ -1192,8 +1199,9 @@ printf("inside collective chunk IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i
 #ifdef KENT
 printf("inside independent IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
 #endif
-
+	
 	  HDassert(chunk_io_option[i] == 0);
+#if 0
 	  if(!select_chunk) continue; /* this process has nothing to do with this chunk, continue! */
 	  if(last_io_mode_coll)
 	  /* Switch to independent I/O */
@@ -1206,16 +1214,16 @@ printf("inside independent IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
 			 chunk_info->fspace,chunk_info->mspace,0,
 			 buf);
 	      /* Check return value of the write */
-	    if (ret_value<0)
+	    if (ret_value<0) 
 	      HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
 	  }
 	  else {
 	     ret_value = (io_info->ops.read)(io_info,
 			  chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type),
 			  chunk_info->fspace,chunk_info->mspace,0,
-	        	  buf);
+	        	  buf);			   
 	      /* Check return value from optimized write */
-	      if (ret_value<0)
+	      if (ret_value<0) 
 		HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
 	  }
 
@@ -1227,7 +1235,47 @@ printf("inside independent IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
 	  /* Switch back to collective I/O */
               if(H5D_ioinfo_make_coll(io_info) < 0)
                  HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
+#endif
+
+      {
+#ifdef KENT
+printf("coming into independent IO with file set view\n");
+           /* if(H5Pset_dxpl_mpio_collective_opt(io_info->dxpl_id,H5FD_MPIO_INDIVIDUAL_IO)<0)
+               HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL,"couldn't set individual MPI-IO with the file setview");
+printf("after setting the property list\n");
+*/
+#endif
+	  if(!last_io_mode_coll)
+	  /* using independent I/O with file setview.*/
+            if(H5D_ioinfo_make_coll_opt(io_info) < 0)
+                 HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
+            if(select_chunk){
+              if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,
+                                     chunk_addr[i],buf,do_write )<0)
+                HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
+            }
+            else{
+             if(H5D_inter_collective_io(io_info,NULL,NULL,
+                                     chunk_addr[i],buf,do_write )<0)
+                HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
 
+            }
+#ifdef KENT
+printf("after inter collective IO\n");
+ /*           if(H5Pset_dxpl_mpio_collective_opt(io_info->dxpl_id,H5FD_MPIO_COLLECTIVE_IO)<0)
+               HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL,"couldn't set collective MPI-IO ");
+*/
+#endif
+              last_io_mode_coll = FALSE;
+
+
+          }
+        }
+      }
+      if(!last_io_mode_coll)
+	  /* Switch back to collective I/O */
+              if(H5D_ioinfo_make_coll(io_info) < 0)
+                 HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
   done:
     HDfree(chunk_io_option);
     HDfree(chunk_addr);
@@ -1235,13 +1283,155 @@ printf("inside independent IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
     FUNC_LEAVE_NOAPI(ret_value)
 } /* end H5D_multi_chunk_collective_io */
 
+/*-------------------------------------------------------------------------
+ * Function:	H5D_multi_chunk_collective_io_no_opt
+ *
+ * Purpose:	Perform collective IO one chunk at a time, without any
+ *              per-chunk optimization by the library.
+ *              The internal independent IO inside HDF5 cannot handle
+ *              non-contiguous storage (or storage with holes) efficiently:
+ *              a single independent IO call may turn into many small disk IOs.
+ *              Independent IO with an MPI derived datatype can replace plain
+ *              independent IO for a chunk that is a poor fit for collective IO.
+ *              However, according to our performance study, that approach may
+ *              not overcome the overhead caused by gather/scatter, so the
+ *              original collective-IO-per-chunk approach is kept as an option
+ *              for users. This function is called when the user selects
+ *              H5Pset_dxpl_mpio_chunk_opt(dxpl_id,H5FD_MPIO_CHUNK_MULTI_IO).
+ *              The HDF5 library does no management of its own and leaves it
+ *              to MPI-IO to figure out.
+ *
+ * Return:	Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool_t do_write)
+{
+/* Walk every selected chunk in fm->fsel and transfer it through io_info with
+ * independent or collective I/O; buf is the I/O buffer, do_write selects write vs. read. */
+      int               count_chunk,min_num_chunk; /* chunks visited so far; minimum chunk count from H5D_mpio_get_min_chunk */
+      haddr_t           chunk_addr;           /* File address of the current chunk (collective path only) */
+
+      H5SL_node_t      *chunk_node;           /* Current node in chunk skip list */
+      H5D_storage_t     store;                /* union of EFL and chunk pointer in file space */
+      herr_t            ret_value = SUCCEED;
+#ifdef KENT
+      int mpi_rank;
+#endif
+
+
+      FUNC_ENTER_NOAPI_NOINIT(H5D_multi_chunk_collective_io_no_opt)
+#ifdef KENT
+      mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file);
+      printf("coming to multi_chunk_collective_io_no_opt\n");
+#endif
+
+      if(H5D_mpio_get_min_chunk(io_info,fm,&min_num_chunk)<0)
+         HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunk");
+      count_chunk = 0;
+
+      /* Get first node in chunk skip list */
+      chunk_node=H5SL_first(fm->fsel);
+
+      /* Iterate through chunks to be operated on */
+      while(chunk_node) {
+           H5D_chunk_info_t *chunk_info;   /* chunk information */
+           hbool_t make_ind, make_coll;    /* Flags to indicate that the MPI mode should change */
+
+           /* Get the actual chunk information from the skip list node */
+           chunk_info=H5SL_item(chunk_node);
+
+           /* Set dataset storage for I/O info */
+           io_info->store=&store;
+
+           /* Pass in chunk's coordinates in a union. */
+           store.chunk.offset = chunk_info->coords;
+           store.chunk.index = chunk_info->index;
+
+           /* Reset flags for changing parallel I/O mode */
+           make_ind = make_coll = FALSE;
+
+           count_chunk++;
+           /* If the number of chunks visited is greater than the minimum number
+              of chunks, do independent I/O for this chunk */
+           if(count_chunk > min_num_chunk) {
+              /* Switch to independent I/O (permanently) */
+               make_ind = TRUE;
+            }
+
+#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+/* This case needs to be improved to check if the selected space
+   is regular. If all selections are regular, collective IO can still be done.
+   However, since we find an MPI-IO bug at a DOE machine(mcr) that cannot
+   handle collective I/O selection for this case correctly,
+   we turn off this optimization but leave the following code
+   for future optimization. Otherwise, the following else {} doesn't make sense.
+   KY 2006/8/4/ */
+            else {
+                 /* Switch to independent I/O (temporarily) */
+                   make_ind = TRUE;
+                   make_coll = TRUE;
+             } /* end else */
+#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */
+
+            /* Switch to independent I/O */
+            if(make_ind)
+                if(H5D_ioinfo_make_ind(io_info) < 0)
+                    HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
+
+            if(make_ind) {/*independent I/O */
+
+              if(do_write) {
+                ret_value = (io_info->ops.write)(io_info,
+                         chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type),
+                         chunk_info->fspace,chunk_info->mspace,0,
+                         buf);
+                /* Check return value of the write */
+                if (ret_value<0)
+                    HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+              }
+              else {
+                ret_value = (io_info->ops.read)(io_info,
+                          chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type),
+                          chunk_info->fspace,chunk_info->mspace,0,
+                          buf);
+                /* Check return value of the read */
+                if (ret_value<0)
+                  HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+              }
+            }
+            else { /*collective I/O */
+              if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL)))
+                  HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
+              if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,
+                                     chunk_addr,buf,do_write )<0)
+                HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
+            }
+
+            /* Restore collective mode when the switch to independent I/O was only temporary */
+            if(make_coll)
+              if(H5D_ioinfo_make_coll(io_info) < 0)
+                    HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
+            /* Get the next chunk node in the skip list */
+            chunk_node=H5SL_next(chunk_node);
+       } /* end while */
+
+  done:
+
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_multi_chunk_collective_io_no_opt */
 
 /*-------------------------------------------------------------------------
  * Function:	H5D_inter_collective_io
  *
  * Purpose:	Routine for the shared part of collective IO between multiple chunk
                 collective IO and contiguous collective IO
-
+		
  *
  * Return:	Non-negative on success/Negative on failure
  *
@@ -1251,16 +1441,16 @@ printf("inside independent IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
  *
  *-------------------------------------------------------------------------
  */
-static herr_t
+static herr_t 
 H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space,const H5S_t *mem_space,
-			 haddr_t addr, const void *buf, hbool_t do_write )
+			 haddr_t addr, const void *buf, hbool_t do_write ) 
 {
 
       size_t	        mpi_buf_count, mpi_file_count;     /* Number of "objects" to transfer */
       MPI_Datatype      mpi_file_type,mpi_buf_type;
       hsize_t	        mpi_buf_offset, mpi_file_offset;   /* Offset within dataset where selection (ie. MPI type) begins */
       hbool_t	        mbt_is_derived=0,      /* Whether the buffer (memory) type is derived and needs to be free'd */
-		        mft_is_derived=0;      /* Whether the file type is derived and needs to be free'd */
+		        mft_is_derived=0;      /* Whether the file type is derived and needs to be free'd */  
       H5D_common_coll_info_t coll_info;
       herr_t       ret_value = SUCCEED;  /* return value */
 
@@ -1274,11 +1464,11 @@ H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space,const H5S
 	if(H5S_mpio_space_type(mem_space,H5T_get_size(io_info->dset->shared->type),
 			       &mpi_buf_type,&mpi_buf_count,&mpi_buf_offset,&mbt_is_derived)<0)
 	       HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buffer type");
-
+	    
       }
       else {
 	    /* For non-selection, participate with a none MPI derived datatype, the count is 0.  */
-	    mpi_buf_type   = MPI_BYTE;
+	    mpi_buf_type   = MPI_BYTE;   
 	    mpi_file_type  = MPI_BYTE;
 	    mpi_file_count = 0;
 	    mpi_buf_count  = 0;
@@ -1305,7 +1495,7 @@ printf("before leaving inter_collective_io ret_value = %d\n",ret_value);
  * Function:	H5D_final_collective_io
  *
  * Purpose:	Routine for the common part of collective IO with different storages.
-
+		
  *
  * Return:	Non-negative on success/Negative on failure
  *
@@ -1315,13 +1505,13 @@ printf("before leaving inter_collective_io ret_value = %d\n",ret_value);
  *
  *-------------------------------------------------------------------------
  */
-static herr_t
+static herr_t 
 H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Datatype *mpi_buf_type,
-			 H5D_common_coll_info_t* coll_info, const void *buf, hbool_t do_write)
+			 H5D_common_coll_info_t* coll_info, const void *buf, hbool_t do_write) 
 {
 
 
-    int               mpi_code;              /* MPI return code */
+    int               mpi_code;              /* MPI return code */ 
     hbool_t	      plist_is_setup=0;      /* Whether the dxpl has been customized */
     herr_t            ret_value = SUCCEED;
 
@@ -1338,7 +1528,7 @@ H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Da
      plist_is_setup=1;
 #ifdef KENT
      HDfprintf(stdout,"chunk addr %Hu\n",coll_info->chunk_addr);
-     printf("mpi_buf_count %d\n",coll_info->mpi_buf_count);
+     printf("mpi_buf_count %d\n",coll_info->mpi_buf_count);	
 #endif
      if(do_write) {
 	ret_value = (io_info->ops.write)(io_info,
@@ -1348,7 +1538,7 @@ H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Da
 #ifdef KENT
         printf("ret_value after final collective IO= %d\n",ret_value);
 #endif
-	if (ret_value<0)
+	if (ret_value<0) 
 	    HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
 	      }
      else {
@@ -1356,11 +1546,11 @@ H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Da
 	        coll_info->mpi_buf_count,0,NULL,NULL,coll_info->chunk_addr,
                 buf);
 	   /* Check return value from optimized write */
-	 if (ret_value<0)
+	 if (ret_value<0) 
 	    HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
 	      }
  done:
-
+   
      /* Reset the dxpl settings */
       if(plist_is_setup) {
         if(H5FD_mpi_teardown_collective(io_info->dxpl_id)<0)
@@ -1371,7 +1561,7 @@ H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Da
       if (coll_info->mbt_is_derived) {
 	if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_buf_type )))
             HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
-
+	
       }
       if (coll_info->mft_is_derived) {
 	if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_file_type )))
@@ -1392,7 +1582,7 @@ H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Da
 
    Description:
                 root will collective all chunk addresses and broadcast towards other processes.
-
+   
    Parameters:
 
                 Input: H5D_io_info_t* io_info,
@@ -1430,7 +1620,7 @@ H5D_pre_sort_chunk(H5D_io_info_t *io_info,int total_chunks,haddr_t total_chunk_a
   if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_addrtype)))
     HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
 
-
+  
   mpi_type_cleanup = 1;
 
   if(mpi_rank == root) {
@@ -1441,7 +1631,7 @@ H5D_pre_sort_chunk(H5D_io_info_t *io_info,int total_chunks,haddr_t total_chunk_a
   /* Broadcasting the MPI_IO option info. and chunk address info. */
    if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(total_chunk_addr_array,1,chunk_addrtype,root,comm)))
      HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code);
-
+  
 done:
 
    if(mpi_type_cleanup){
@@ -1462,12 +1652,12 @@ done:
                 For most cases, the chunk address has already been sorted in increasing order.
 		The special sorting flag is used to optimize this common case.
                 quick sort is used for necessary sorting.
-
+   
    Parameters:
                 Input: H5D_io_info_t* io_info,
 		       fm_map *fm(global chunk map struct)
-		Input/Output:  H5D_chunk_addr_info_t chunk_addr_info_array[]   : array to store chunk address and information
-                       many_chunk_opt                         : flag to optimize the way to obtain chunk addresses
+		Input/Output:  H5D_chunk_addr_info_t chunk_addr_info_array[]   : array to store chunk address and information 
+                       many_chunk_opt                         : flag to optimize the way to obtain chunk addresses 
                                                                 for many chunks
  *
  * Return:	Non-negative on success/Negative on failure
@@ -1479,7 +1669,7 @@ done:
  *-------------------------------------------------------------------------
  */
 
-static herr_t
+static herr_t 
 H5D_sort_chunk(H5D_io_info_t * io_info,
 	       fm_map *fm,
 	       H5D_chunk_addr_info_t chunk_addr_info_array[],
@@ -1500,7 +1690,7 @@ H5D_sort_chunk(H5D_io_info_t * io_info,
     H5D_storage_t     store;              /*union of EFL and chunk pointer in file space */
     hbool_t           do_sort = FALSE;
     herr_t	      ret_value = SUCCEED;	/*return value		*/
-
+  
     FUNC_ENTER_NOAPI_NOINIT(H5D_sort_chunk)
 
     num_chunks =  H5SL_count(fm->fsel);
@@ -1511,14 +1701,11 @@ printf("many_chunk_opt= %d\n",many_chunk_opt);
     /* If we need to optimize the way to obtain the chunk address */
     if(many_chunk_opt != H5D_OBTAIN_ONE_CHUNK_ADDR_IND){
 
+      int mpi_rank, root;
       total_chunks = (int)fm->total_chunks;
       total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks);
       tchunk_addr_cleanup = 1;
 
-      if(many_chunk_opt == H5D_OBTAIN_ALL_CHUNK_ADDR_COL) {/* We will broadcast the array from the root process */
-
-	int mpi_rank, root;
-
 #ifdef KENT
 printf("Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n");
 #endif
@@ -1541,12 +1728,7 @@ printf("Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n");
 	/* Broadcasting the MPI_IO option info. and chunk address info. */
 	if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(total_chunk_addr_array,1,chunk_addrtype,root,io_info->comm)))
 	   HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code);
-      }
 
-      else { /* Obtain all chunk addresses independently */
-	if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array,fm->down_chunks)<0)
-	     HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
-      }
     }
 
     /* Get first node in chunk skip list */
@@ -1562,8 +1744,11 @@ printf("Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n");
     if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){
       if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL)))
 	  HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
+#ifdef KENT
+   printf("coming to obtain each chunk address individually \n");
+#endif
     }
-    else
+    else 
        chunk_addr = total_chunk_addr_array[chunk_info->index];
     chunk_addr_info_array[i].chunk_addr  = chunk_addr;
     chunk_addr_info_array[i].chunk_info  = *chunk_info;
@@ -1574,12 +1759,12 @@ printf("Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n");
             chunk_info         = H5SL_item(chunk_node);
             store.chunk.offset = chunk_info->coords;
             store.chunk.index  = chunk_info->index;
-
+	    
 	    if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){
 	      if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL)))
 		HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
 	    }
-	    else
+	    else 
 	      chunk_addr = total_chunk_addr_array[chunk_info->index];
 
 	    if(chunk_addr < chunk_addr_info_array[i].chunk_addr) do_sort = TRUE;
@@ -1605,7 +1790,7 @@ done:
     }
     FUNC_LEAVE_NOAPI(ret_value)
 } /* end H5D_sort_chunk() */
-
+    
 
 /*-------------------------------------------------------------------------
  * Function:	H5D_obtain_mpio_mode
@@ -1616,11 +1801,11 @@ done:
    Description:
 
                 1) Each process provides two piece of information for all chunks with selection
-		   a) chunk index
+		   a) chunk index 
                    b) wheather this chunk is regular(for MPI derived datatype not working case)
 
                 2) Gather all the information to the root process
-
+		
 		3) Root process will do the following:
 		   a) Obtain chunk address for all chunks in this data space
 		   b) With the consideration of the user option, calculate IO mode for each chunk
@@ -1628,7 +1813,7 @@ done:
 		      in order to do MPI Bcast only once
                    d) MPI Bcast the IO mode and chunk address information for each chunk.
 		4) Each process then retrieves IO mode and chunk address information to assign_io_mode and chunk_addr.
-
+ 
    Parameters:
 
                 Input: H5D_io_info_t* io_info,
@@ -1645,8 +1830,8 @@ done:
  *-------------------------------------------------------------------------
  */
 
-static herr_t
-H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
+static herr_t 
+H5D_obtain_mpio_mode(H5D_io_info_t* io_info, 
 		     fm_map *fm,
 		     uint8_t assign_io_mode[],
 		     haddr_t chunk_addr[])
@@ -1661,7 +1846,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
   uint8_t*          mergebuf=NULL;
   uint8_t*          tempbuf;
 
-  H5SL_node_t*      chunk_node;
+  H5SL_node_t*      chunk_node;  
   H5D_chunk_info_t* chunk_info;
 
   MPI_Datatype      bastype[2];
@@ -1688,7 +1873,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
   FUNC_ENTER_NOAPI_NOINIT(H5D_obtain_mpio_mode)
 
   /* Assign the rank 0 to the root */
-  root              = 0;
+  root              = 0; 
   comm              = io_info->comm;
 
   /* Obtain the number of process and the current rank of the process */
@@ -1696,7 +1881,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
 	 HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank");
   if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
 	 HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
-
+  
    /* Allocate memory */
   ori_total_chunks      = fm->total_chunks;
   H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int);
@@ -1704,30 +1889,30 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
  /* Obtain the data transfer properties */
   if(NULL == (plist = H5I_object(io_info->dxpl_id)))
        HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
-
+  
   percent_nproc_per_chunk=H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME);
 #if defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) && defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
-
+   
   chunk_opt_mode=(H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME);
 
   if((chunk_opt_mode == H5FD_MPIO_CHUNK_MULTI_IO) || (percent_nproc_per_chunk == 0)){
     if(H5D_istore_chunkmap(io_info,total_chunks,chunk_addr,fm->down_chunks)<0)
-       HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
+       HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");    
     for(ic = 0; ic<total_chunks;ic++)
        assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL;
        goto done;
   }
-#endif
+#endif    
   threshold_nproc_per_chunk = mpi_size * percent_nproc_per_chunk/100;
 
 
   io_mode_info      = (uint8_t *)H5MM_calloc(total_chunks*sizeof(MPI_BYTE));
   mergebuf          = H5MM_malloc((sizeof(haddr_t)+sizeof(MPI_BYTE))*total_chunks);
   tempbuf           = mergebuf + sizeof(MPI_BYTE)*total_chunks;
-  if(mpi_rank == root)
+  if(mpi_rank == root) 
      recv_io_mode_info = (uint8_t *)H5MM_malloc(total_chunks*sizeof(MPI_BYTE)*mpi_size);
 
-
+  
   mem_cleanup       = 1;
 
   chunk_node        = H5SL_first(fm->fsel);
@@ -1750,7 +1935,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
 
       chunk_node = H5SL_next(chunk_node);
   }
-
+  
   /*Create sent MPI derived datatype */
   if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(total_chunks,MPI_BYTE,&stype)))
     HMPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
@@ -1764,7 +1949,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
   basdisp[0]    = 0;
   basdisp[1]    = (MPI_Aint)(sizeof(MPI_BYTE)*total_chunks);/* may need to check overflow */
   bastype[0]    = MPI_BYTE;
-
+ 
   if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(sizeof(haddr_t),MPI_BYTE,&chunk_addrtype)))
     HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code);
   if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_addrtype)))
@@ -1792,7 +1977,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
     int*              ind_this_chunk;
 #endif
 
-    /* pre-computing: calculate number of processes and
+    /* pre-computing: calculate number of processes and 
         regularity of the selection occupied in each chunk */
     nproc_per_chunk = (int*)H5MM_calloc(total_chunks*sizeof(int));
 #if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
@@ -1832,7 +2017,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
 
     /* Calculating MPIO mode for each chunk (collective, independent, none) */
     for(ic = 0; ic < total_chunks; ic++){
-	  if(nproc_per_chunk[ic]>=MAX(2,threshold_nproc_per_chunk)){
+	  if(nproc_per_chunk[ic]>MAX(1,threshold_nproc_per_chunk)){
 #if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
 	    if(!ind_this_chunk[ic]) assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL;
 #else
@@ -1866,7 +2051,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
       new_value = 0;
       if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME,&new_value)<0)
               HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value");
-#else
+#else 
       for(ic = 0; ic < total_chunks; ic++){
         if(assign_io_mode[ic] == H5D_CHUNK_IO_MODE_COL) {
            new_value = 0;
@@ -1893,7 +2078,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
       }
    }
 #endif
-
+ 
 done:
 
    if(mpi_type_cleanup) {
@@ -1910,7 +2095,7 @@ done:
    if(mem_cleanup){
      HDfree(io_mode_info);
      HDfree(mergebuf);
-     if(mpi_rank == root)
+     if(mpi_rank == root) 
        HDfree(recv_io_mode_info);
    }
 
@@ -1923,7 +2108,7 @@ H5D_cmp_chunk_addr(const void *chunk_addr_info1, const void *chunk_addr_info2)
    haddr_t addr1, addr2;
 
    FUNC_ENTER_NOAPI_NOINIT(H5D_cmp_chunk_addr)
-
+   
    addr1 = ((const H5D_chunk_addr_info_t *)chunk_addr_info1)->chunk_addr;
    addr2 = ((const H5D_chunk_addr_info_t *)chunk_addr_info2)->chunk_addr;
 
diff --git a/src/H5Dpkg.h b/src/H5Dpkg.h
index f6946f2..da44370 100644
--- a/src/H5Dpkg.h
+++ b/src/H5Dpkg.h
@@ -107,6 +107,7 @@ typedef struct H5D_io_info_t {
 #ifdef H5_HAVE_PARALLEL
     MPI_Comm comm;              /* MPI communicator for file */
     hbool_t xfer_mode_changed;  /* Whether the transfer mode was changed */
+    hbool_t xfer_opt_mode_changed;
 #endif /* H5_HAVE_PARALLEL */
     const H5D_storage_t *store; /* Dataset storage info */
     H5D_io_ops_t ops;           /* I/O operation function pointers */
diff --git a/src/H5Dprivate.h b/src/H5Dprivate.h
index a026162..537aa76 100644
--- a/src/H5Dprivate.h
+++ b/src/H5Dprivate.h
@@ -141,7 +141,16 @@
 #define H5D_XFER_IO_XFER_MODE_NAME       "io_xfer_mode"
 #define H5D_XFER_IO_XFER_MODE_SIZE       sizeof(H5FD_mpio_xfer_t)
 #define H5D_XFER_IO_XFER_MODE_DEF        H5FD_MPIO_INDEPENDENT
+
+/* Definitions for I/O optimization transfer mode property(using MPI-IO independent IO with file set view) */
+#define H5D_XFER_IO_XFER_OPT_MODE_NAME    "io_xfer_opt_mode"
+#define H5D_XFER_IO_XFER_OPT_MODE_SIZE    sizeof(H5FD_mpio_collective_opt_t)
+#define H5D_XFER_IO_XFER_OPT_MODE_DEF         H5FD_MPIO_COLLECTIVE_IO
 /* Definitions for optimization of MPI-IO transfer mode property */
+#define H5D_XFER_MPIO_COLLECTIVE_OPT_NAME      "mpio_collective_opt"
+#define H5D_XFER_MPIO_COLLECTIVE_OPT_SIZE       sizeof(H5FD_mpio_collective_opt_t)
+#define H5D_XFER_MPIO_COLLECTIVE_OPT_DEF        H5FD_MPIO_COLLECTIVE_IO
+
 #define H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME      "mpio_chunk_opt_hard"
 #define H5D_XFER_MPIO_CHUNK_OPT_HARD_SIZE       sizeof(H5FD_mpio_chunk_opt_t)
 #define H5D_XFER_MPIO_CHUNK_OPT_HARD_DEF        H5FD_MPIO_CHUNK_DEFAULT
@@ -220,6 +229,7 @@ typedef struct H5D_dxpl_cache_t {
     size_t vec_size;            /* Size of hyperslab vector (H5D_XFER_HYPER_VECTOR_SIZE_NAME) */
 #ifdef H5_HAVE_PARALLEL
     H5FD_mpio_xfer_t xfer_mode; /* Parallel transfer for this request (H5D_XFER_IO_XFER_MODE_NAME) */
+    H5FD_mpio_collective_opt_t xfer_opt_mode; /* Parallel transfer with independent IO or collective IO with this mode */
 #endif /*H5_HAVE_PARALLEL*/
     H5Z_cb_t filter_cb;         /* Filter callback function (H5D_XFER_FILTER_CB_NAME) */
     H5Z_data_xform_t *data_xform_prop; /* Data transform prop (H5D_XFER_XFORM_NAME) */
diff --git a/src/H5FDmpi.h b/src/H5FDmpi.h
index 6c2a2c5..87eba64 100644
--- a/src/H5FDmpi.h
+++ b/src/H5FDmpi.h
@@ -21,8 +21,8 @@
 #ifndef H5FDmpi_H
 #define H5FDmpi_H
 
-/***** Macros for One linked collective IO case. *****/
-/* The default value to do one linked collective IO for all chunks.
+/***** Macros for One linked collective IO case. *****/ 
+/* The default value to do one linked collective IO for all chunks. 
    If the average number of chunks per process is greater than this value,
       the library will create an MPI derived datatype to link all chunks to do collective IO.
       The user can set this value through an API. */
@@ -30,11 +30,11 @@
 #define H5D_ONE_LINK_CHUNK_IO_THRESHOLD 0
 /***** Macros for multi-chunk collective IO case. *****/
 /* The default value of the threshold to do collective IO for this chunk.
-   If the average number of processes per chunk is greater than the default value,
+   If the average percentage of processes per chunk is greater than the default value,
    collective IO is done for this chunk.
 */
 
-#define H5D_MULTI_CHUNK_IO_COL_THRESHOLD 50
+#define H5D_MULTI_CHUNK_IO_COL_THRESHOLD 60
 /* Type of I/O for data transfer properties */
 typedef enum H5FD_mpio_xfer_t {
     H5FD_MPIO_INDEPENDENT = 0, 		/*zero is the default*/
@@ -48,6 +48,12 @@ typedef enum H5FD_mpio_chunk_opt_t {
     H5FD_MPIO_CHUNK_MULTI_IO
 } H5FD_mpio_chunk_opt_t;
 
+/* How a collective transfer request is carried out: true collective IO, or individual (independent) IO with file set view */
+typedef enum H5FD_mpio_collective_opt_t {
+    H5FD_MPIO_COLLECTIVE_IO = 0,	/*zero is the default*/
+    H5FD_MPIO_INDIVIDUAL_IO  		/*independent IO with MPI file set view*/
+} H5FD_mpio_collective_opt_t;
+
 
 #ifdef H5_HAVE_PARALLEL
 
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index 0be55e3..73bc2dc 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -531,7 +531,7 @@ done:
 }
 
 /*-------------------------------------------------------------------------
- * Function:	H5Pset_dxpl_mpio_chunk_opt
+ * Function:	H5Pset_dxpl_mpio_collective_opt
 
 Purpose:
 	To set a flag to choose linked chunk IO or multi-chunk IO without
@@ -543,14 +543,66 @@ Description:
         The library won't behave as it asks for only when we find
         that the low-level MPI-IO package doesn't support this.
 
-Parameters:
+Parameters: 
         hid_t dxpl_id	      		in: Data transfer property list identifier
 	H5FD_mpio_chunk_opt_t   	in: The optimization flag for linked chunk IO
                                             or multi-chunk IO.
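+        (For H5Pset_dxpl_mpio_collective_opt, opt_mode is an H5FD_mpio_collective_opt_t: collective or individual MPI-IO.)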
+
+Returns: 
+Returns a non-negative value if successful. Otherwise returns a negative value. 
+*
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5Pset_dxpl_mpio_collective_opt(hid_t dxpl_id, H5FD_mpio_collective_opt_t opt_mode)
+{   /* Stores opt_mode in the dxpl under H5D_XFER_MPIO_COLLECTIVE_OPT_NAME */
+    H5P_genplist_t *plist;      /* Property list object looked up from dxpl_id */
+    herr_t ret_value;           /* Return value; on the success path it is the status of H5P_set_driver() */
+
+    FUNC_ENTER_API(H5Pset_dxpl_mpio_collective_opt, FAIL)
+/* API tracing is disabled here: the copied call H5TRACE2("e","iDt",dxpl_id,xfer_mode) names xfer_mode, which is not a parameter of this function. */
+
+    if(dxpl_id==H5P_DEFAULT)    /* H5P_DEFAULT is rejected: values cannot be set in it */
+        HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
+
+    /* Check that dxpl_id refers to a dataset transfer property list */
+    if(NULL == (plist = H5P_object_verify(dxpl_id,H5P_DATASET_XFER)))
+        HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
 
+    /* Record the requested collective/individual IO mode in the property list */
+    if (H5P_set(plist,H5D_XFER_MPIO_COLLECTIVE_OPT_NAME,&opt_mode)<0)
+        HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
+
+    /* Initialize driver-specific properties; the status of this call is what the function returns */
+    ret_value= H5P_set_driver(plist, H5FD_MPIO, NULL);
+
+done:
+    FUNC_LEAVE_API(ret_value)
+}
 
-Returns:
-Returns a non-negative value if successful. Otherwise returns a negative value.
+
+/*-------------------------------------------------------------------------
+ * Function:	H5Pset_dxpl_mpio_chunk_opt
+
+Purpose:
+	To set a flag to choose linked chunk IO or multi-chunk IO without
+        involving decision-making inside HDF5
+
+Description:
+        The library will do linked chunk IO or multi-chunk IO without
+        involving communications for decision-making process.
+        The library won't behave as it asks for only when we find
+        that the low-level MPI-IO package doesn't support this.
+
+Parameters: 
+        hid_t dxpl_id	      		in: Data transfer property list identifier
+	H5FD_mpio_chunk_opt_t   	in: The optimization flag for linked chunk IO
+                                            or multi-chunk IO.
+                                                
+
+Returns: 
+Returns a non-negative value if successful. Otherwise returns a negative value. 
 *
  *-------------------------------------------------------------------------
  */
@@ -590,15 +642,15 @@ Purpose:
 	To set a threshold for doing linked chunk IO
 
 Description:
-        If the number is greater than the threshold set by the user,
+        If the number is greater than the threshold set by the user, 
         the library will do linked chunk IO; otherwise, IO will be done for every chunk.
 
-Parameters:
+Parameters: 
         hid_t dxpl_id	      		in: Data transfer property list identifier
-	unsigned num_proc_per_chunk	in: the threshold of the average number of chunks selected by each process
+	unsigned num_proc_per_chunk	in: the threshold of the average number of chunks selected by each process 
 
-Returns:
-Returns a non-negative value if successful. Otherwise returns a negative value.
+Returns: 
+Returns a non-negative value if successful. Otherwise returns a negative value. 
 *
  *-------------------------------------------------------------------------
  */
@@ -637,13 +689,13 @@ Purpose:
 	To set a threshold for doing collective IO for each chunk
 Description:
 	The library will calculate the percentage of the number of process holding selections at each chunk. If that percentage of number of process in the individual chunk is greater than the threshold set by the user, the library will do collective chunk IO for this chunk; otherwise, independent IO will be done for this chunk.
-Parameters:
-	hid_t dxpl_id
+Parameters: 
+	hid_t dxpl_id	         				
 		in: Data transfer property list identifier
-	unsigned percent_num_proc_per_chunk
+	unsigned percent_num_proc_per_chunk	
 		in: the threshold of the percentage of the number of process holding selections per chunk
-Returns:
-Returns a non-negative value if successful. Otherwise returns a negative value.
+Returns: 
+Returns a non-negative value if successful. Otherwise returns a negative value. 
 
 
 *
@@ -1343,6 +1395,7 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t dxpl_id, haddr_t add
     int         		n;
     int                         type_size;      /* MPI datatype used for I/O's size */
     int                         io_size;        /* Actual number of bytes requested */
+    H5P_genplist_t              *plist;      /* Property list pointer */
     unsigned			use_view_this_time=0;
     herr_t              	ret_value=SUCCEED;
 
@@ -1377,7 +1430,6 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t dxpl_id, haddr_t add
 
     /* Only look for MPI views for raw data transfers */
     if(type==H5FD_MEM_DRAW) {
-        H5P_genplist_t              *plist;      /* Property list pointer */
         H5FD_mpio_xfer_t            xfer_mode;   /* I/O tranfer mode */
 
         /* Obtain the data transfer properties */
@@ -1419,13 +1471,38 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t dxpl_id, haddr_t add
 
     /* Read the data. */
     if (use_view_this_time) {
+       H5FD_mpio_collective_opt_t coll_opt_mode;
+       H5FD_mpio_collective_opt_t xfer_opt_mode;
 #ifdef H5FDmpio_DEBUG
 	if (H5FD_mpio_Debug[(int)'t'])
 	    fprintf(stdout, "H5FD_mpio_read: using MPIO collective mode\n");
 #endif
+        /* Peek the collective_opt property to check whether the application wants to do IO individually. */
+        coll_opt_mode=(H5FD_mpio_collective_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_COLLECTIVE_OPT_NAME);
+
+       /* Peek the xfer_opt_mode property as well: MPI_File_read_at_all is used only when both properties are H5FD_MPIO_COLLECTIVE_IO. */
+        xfer_opt_mode=(H5FD_mpio_collective_opt_t)H5P_peek_unsigned(plist,H5D_XFER_IO_XFER_OPT_MODE_NAME);
+     
+        if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO && xfer_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
+#ifdef H5FDmpio_DEBUG
+        if (H5FD_mpio_Debug[(int)'t'])
+            fprintf(stdout, "H5FD_mpio_read: doing MPI collective IO\n");
+#endif
+/* Temporarily change to read_at_all 
+        if (MPI_SUCCESS!= (mpi_code=MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat )))*/
         if (MPI_SUCCESS!= (mpi_code=MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat )))
             HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
+        }
+        else {
+#ifdef H5FDmpio_DEBUG
+        if (H5FD_mpio_Debug[(int)'t'])
+            fprintf(stdout, "H5FD_mpio_read: doing MPI independent IO\n");
+#endif
 
+        if (MPI_SUCCESS!= (mpi_code=MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat )))
+            HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+        }
+ 
         /*
          * Reset the file view when we used MPI derived types
          */
@@ -1701,7 +1778,7 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
 	/* The metadata cache now only writes from process 0, which makes
 	 * this synchronization incorrect.  I'm leaving this code commented
 	 * out instead of deleting it to remind us that we should re-write
-	 * this function so that a metadata write from any other process
+	 * this function so that a metadata write from any other process 
 	 * should flag an error.
 	 *                                  -- JRM 9/1/05
 	 */
@@ -1725,15 +1802,45 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
 
     /* Write the data. */
     if (use_view_this_time) {
+       H5FD_mpio_collective_opt_t coll_opt_mode;
+       H5FD_mpio_collective_opt_t  xfer_opt_mode;
 #ifdef H5FDmpio_DEBUG
         if (H5FD_mpio_Debug[(int)'t'])
             fprintf(stdout, "H5FD_mpio_write: using MPIO collective mode\n");
 #endif
+        /* Peek the collective_opt property to check whether the application wants to do IO individually. */
+        coll_opt_mode=(H5FD_mpio_collective_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_COLLECTIVE_OPT_NAME);
+
+         /* Peek the xfer_opt_mode property as well: MPI_File_write_at_all is used only when both properties are H5FD_MPIO_COLLECTIVE_IO. */
+        xfer_opt_mode=(H5FD_mpio_collective_opt_t)H5P_peek_unsigned(plist,H5D_XFER_IO_XFER_OPT_MODE_NAME);
+
+        
+
         /*OKAY: CAST DISCARDS CONST QUALIFIER*/
+
+        if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO && xfer_opt_mode == H5FD_MPIO_COLLECTIVE_IO ) {
+#ifdef H5FDmpio_DEBUG
+        if (H5FD_mpio_Debug[(int)'t'])
+            fprintf(stdout, "H5FD_mpio_write: doing MPI collective IO\n");
+#endif
+        /* Temporarily change to _at 
+if (MPI_SUCCESS != (mpi_code=MPI_File_write_at_all(file->f, mpi_off, (void*)buf, size_i, buf_type, &mpi_stat)))
+*/
         if (MPI_SUCCESS != (mpi_code=MPI_File_write_at_all(file->f, mpi_off, (void*)buf, size_i, buf_type, &mpi_stat)))
             HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code)
+        }
+        else {
+#ifdef H5FDmpio_DEBUG
+        if (H5FD_mpio_Debug[(int)'t'])
+            fprintf(stdout, "H5FD_mpio_write: doing MPI independent IO\n");
+#endif
+ 
+          if (MPI_SUCCESS != (mpi_code=MPI_File_write_at(file->f, mpi_off, (void*)buf, size_i, buf_type, &mpi_stat)))
+            HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
+        }
 
-        /*
+
+       /*
          * Reset the file view when we used MPI derived types
          */
         /*OKAY: CAST DISCARDS CONST QUALIFIER*/
@@ -1772,13 +1879,13 @@ done:
 
 #if 0 /* JRM */
     /* Since metadata writes are now done by process 0 only, this broadcast
-     * is no longer needed.  I leave it in and commented out to remind us
+     * is no longer needed.  I leave it in and commented out to remind us 
      * that we need to re-work this function to reflect this reallity.
      *
      *                                          -- JRM 9/1/05
      */
-    /* if only one process writes, need to broadcast the ret_value to
-     * other processes
+    /* if only one process writes, need to broadcast the ret_value to 
+     * other processes 
      */
     if (type!=H5FD_MEM_DRAW) {
 	if (MPI_SUCCESS != (mpi_code=MPI_Bcast(&ret_value, sizeof(ret_value), MPI_BYTE, H5_PAR_META_WRITE, file->comm)))
diff --git a/src/H5FDmpio.h b/src/H5FDmpio.h
index d2ddd0e..e06042a 100644
--- a/src/H5FDmpio.h
+++ b/src/H5FDmpio.h
@@ -51,6 +51,7 @@ H5_DLL herr_t H5Pget_fapl_mpio(hid_t fapl_id, MPI_Comm *comm/*out*/,
 			MPI_Info *info/*out*/);
 H5_DLL herr_t H5Pset_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t xfer_mode);
 H5_DLL herr_t H5Pget_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t *xfer_mode/*out*/);
+H5_DLL herr_t H5Pset_dxpl_mpio_collective_opt(hid_t dxpl_id, H5FD_mpio_collective_opt_t opt_mode);
 H5_DLL herr_t H5Pset_dxpl_mpio_chunk_opt(hid_t dxpl_id, H5FD_mpio_chunk_opt_t opt_mode);
 H5_DLL herr_t H5Pset_dxpl_mpio_chunk_opt_num(hid_t dxpl_id, unsigned num_chunk_per_proc);
 H5_DLL herr_t H5Pset_dxpl_mpio_chunk_opt_ratio(hid_t dxpl_id, unsigned percent_num_proc_per_chunk);
author	MuQun Yang <ymuqun@hdfgroup.org>	2006-08-09 03:00:11 (GMT)
committer	MuQun Yang <ymuqun@hdfgroup.org>	2006-08-09 03:00:11 (GMT)
commit	6916816a563532fddc3699a6d5e4adb57212968d (patch)
tree	70121257e539ec369455ebd43119873fd96c7489 /src
parent	d17d42acd0fbba4b3433937f448c99930553b038 (diff)
download	hdf5-6916816a563532fddc3699a6d5e4adb57212968d.zip hdf5-6916816a563532fddc3699a6d5e4adb57212968d.tar.gz hdf5-6916816a563532fddc3699a6d5e4adb57212968d.tar.bz2