[svn-r12553] This check-in includes the following part of parallel optimization codes:

1. Provide another option for users to do independent IO with MPI file setview (set collectively). 2. When users request collective IO, use independent IO with MPI file setview if we find collective IO is not good for the application in the IO-per-chunk (multi-chunk IO) case. Previously we used pure independent IO, which actually performed small IOs (one IO per row) for this case. The recent performance study suggested that independent IO with file setview can achieve significantly better performance than collective IO when not many processes participate in the IO. 3. For applications that explicitly choose to do collective IO per chunk, the library won't do any optimization (gather/broadcast) operations. The library simply passes the collective IO request to MPI-IO. Tested at copper, kagiso, heping, mir and tungsten (cmpi and mpich). Kagiso is using LAM, where even the t_mpi test was broken. The cchunk10 test failed at heping and mir. I suspected it was an MPICH problem. Will investigate later. Everything passed at copper. At tungsten: the old cmpi bug (failed at esetw) is still there. Other tests passed. Some sequential fheap tests failed at kagiso.
author: MuQun Yang <ymuqun@hdfgroup.org> 2006-08-09 03:00:11 (GMT)
committer: MuQun Yang <ymuqun@hdfgroup.org> 2006-08-09 03:00:11 (GMT)
commit: 6916816a563532fddc3699a6d5e4adb57212968d (patch)
tree: 70121257e539ec369455ebd43119873fd96c7489 /src/H5Dmpio.c
parent: d17d42acd0fbba4b3433937f448c99930553b038 (diff)
download: hdf5-6916816a563532fddc3699a6d5e4adb57212968d.zip
hdf5-6916816a563532fddc3699a6d5e4adb57212968d.tar.gz
hdf5-6916816a563532fddc3699a6d5e4adb57212968d.tar.bz2
1 files changed, 385 insertions, 200 deletions
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index ead1777..4db79ec 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -27,6 +27,8 @@
 
 #define H5D_PACKAGE		/*suppress error about including H5Dpkg	  */
 /*#define KENT */
+/*#define CC_PERF*/
+
 
 
 /***********/
@@ -64,13 +66,10 @@
 
 /* Macros to represent options on how to obtain chunk address for one linked-chunk IO case */
 #define H5D_OBTAIN_ONE_CHUNK_ADDR_IND 0
-#define H5D_OBTAIN_ALL_CHUNK_ADDR_IND 1
 #define H5D_OBTAIN_ALL_CHUNK_ADDR_COL 2
 
 /* Macros to define the default ratio of obtaining all chunk addresses for one linked-chunk IO case */
-#define H5D_ALL_CHUNK_ADDR_THRES_IND  10
-#define H5D_ALL_CHUNK_ADDR_THRES_IND_NUM 4
-#define H5D_ALL_CHUNK_ADDR_THRES_COL  20
+#define H5D_ALL_CHUNK_ADDR_THRES_COL  30
 #define H5D_ALL_CHUNK_ADDR_THRES_COL_NUM 10000
 
 /***** Macros for multi-chunk collective IO case. *****/
@@ -104,29 +103,32 @@ typedef struct H5D_common_coll_info_t {
   size_t  mpi_buf_count;
   haddr_t chunk_addr;
 } H5D_common_coll_info_t;
-
+  
 
 /********************/
 /* Local Prototypes */
 /********************/
 
-static herr_t
-H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
+static herr_t 
+H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, 
 			      hbool_t do_write);
+static herr_t
+H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
+                              hbool_t do_write);
 
 static herr_t
-H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
+H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, 
 			     hbool_t do_write,int sum_chunk);
 
-static herr_t
+static herr_t 
 H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space,
-			const H5S_t *mem_space,haddr_t addr,
+			const H5S_t *mem_space,haddr_t addr, 
 		        const void *buf, hbool_t do_write );
 
-static herr_t
+static herr_t 
 H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,
 			 MPI_Datatype *mpi_buf_type,
-			 H5D_common_coll_info_t* coll_info,
+			 H5D_common_coll_info_t* coll_info, 
 			 const void *buf, hbool_t do_write);
 #ifdef OLD_WAY
 static herr_t
@@ -134,24 +136,25 @@ H5D_pre_sort_chunk(H5D_io_info_t *io_info,int total_chunks,
 		   haddr_t total_chunk_addr_array[]);
 #endif
 
-static herr_t
+static herr_t 
 H5D_sort_chunk(H5D_io_info_t * io_info,
 	       fm_map *fm,
 	       H5D_chunk_addr_info_t chunk_addr_info_array[],
 	       int many_chunk_opt);
 
-static herr_t
-H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
+static herr_t 
+H5D_obtain_mpio_mode(H5D_io_info_t* io_info, 
 		     fm_map *fm,
 		     uint8_t assign_io_mode[],
 		     haddr_t chunk_addr[]);
 
 static herr_t H5D_ioinfo_make_ind(H5D_io_info_t *io_info);
+static herr_t H5D_ioinfo_make_coll_opt(H5D_io_info_t *io_info);
 static herr_t H5D_ioinfo_make_coll(H5D_io_info_t *io_info);
 static herr_t H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info,
     const fm_map *fm, int *min_chunkf);
 static int H5D_cmp_chunk_addr(const void *addr1, const void *addr2);
-static herr_t
+static herr_t 
 H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info,
 		       const fm_map *fm, int *sum_chunkf);
 
@@ -284,10 +287,10 @@ done:
  * Decription:  If H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS is not defined,
                    collective IO with no contribution from one or more
                    processes are not assured. We will check the minimum
-                   number of chunks the process is used. If the number is
+                   number of chunks the process is used. If the number is 
                    zero, we will use independent IO mode instead.
                 This is necessary with Linked chunk IO.
- * Purpose:	Checks if it is possible to do collective IO
+ * Purpose:	Checks if it is possible to do collective IO 
  *
  * Return:	Success:        Non-negative: TRUE or FALSE
  *		Failure:	Negative
@@ -307,8 +310,8 @@ H5D_mpio_chunk_adjust_iomode(H5D_io_info_t *io_info, const fm_map *fm) {
 
 #ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS
     if(H5D_mpio_get_min_chunk(io_info,fm,&min_chunk)<0)
-         HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes");
-    if(min_chunk == 0) {
+         HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes"); 
+    if(min_chunk == 0) {                
        H5P_genplist_t *dx_plist;           /* Data transer property list */
 
        /* Get the dataset transfer property list */
@@ -347,11 +350,11 @@ done:
  */
 herr_t
 H5D_mpio_select_read(H5D_io_info_t *io_info,
-                     size_t mpi_buf_count,
+                     size_t mpi_buf_count, 
                      const size_t UNUSED elmt_size,
-		     const H5S_t UNUSED *file_space,
+		     const H5S_t UNUSED *file_space, 
 		     const H5S_t UNUSED *mem_space,
-		     haddr_t addr,
+		     haddr_t addr,		     
 		     void *buf/*out*/)
 {
     herr_t ret_value = SUCCEED;
@@ -378,9 +381,9 @@ done:
  */
 herr_t
 H5D_mpio_select_write(H5D_io_info_t *io_info,
-		      size_t mpi_buf_count,
+		      size_t mpi_buf_count, 
 		      const size_t UNUSED elmt_size,
-		      const H5S_t UNUSED *file_space,
+		      const H5S_t UNUSED *file_space, 
 		      const H5S_t UNUSED *mem_space,
 		      haddr_t addr,
 		      const void *buf)
@@ -444,6 +447,51 @@ H5D_ioinfo_make_ind(H5D_io_info_t *io_info)
 done:
     FUNC_LEAVE_NOAPI(ret_value)
 } /* end H5D_ioinfo_make_ind() */
+
+/*-------------------------------------------------------------------------
+ * Function:	H5D_ioinfo_make_coll_opt
+ *
+ * Purpose:	Switch to MPI independent I/O with file set view by setting xfer_opt_mode to H5FD_MPIO_INDIVIDUAL_IO
+ *
+ * Return:	Non-negative on success/Negative on failure
+ *
+ * Programmer:	Quincey Koziol
+ *		Friday, August 12, 2005
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D_ioinfo_make_coll_opt(H5D_io_info_t *io_info)
+{
+    H5P_genplist_t *dx_plist;           /* Data transfer property list */
+    herr_t	ret_value = SUCCEED;	/* Return value */
+
+    FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_coll_opt)
+
+    /* Get the dataset transfer property list */
+    if (NULL == (dx_plist = H5I_object(io_info->dxpl_id)))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list")
+
+    /* Set the optimization mode (xfer_opt_mode, not xfer_mode) to individual
+     * I/O, in both the cached copy and the transfer property list.
+     */
+    io_info->dxpl_cache->xfer_opt_mode = H5FD_MPIO_INDIVIDUAL_IO;
+    if(H5P_set (dx_plist, H5D_XFER_IO_XFER_OPT_MODE_NAME, &io_info->dxpl_cache->xfer_opt_mode) < 0)
+        HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
+
+    /* Point the read/write operations at the MPI-IO select routines */
+    io_info->ops.read = H5D_mpio_select_read;
+    io_info->ops.write = H5D_mpio_select_write;
+
+    /* Record that xfer_opt_mode was changed here (presumably so that it can
+     * be restored before returning to the user -- TODO confirm in caller).
+     */
+    io_info->xfer_opt_mode_changed = TRUE;
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_ioinfo_make_coll_opt() */
+
 
 
 /*-------------------------------------------------------------------------
@@ -477,6 +525,11 @@ H5D_ioinfo_make_coll(H5D_io_info_t *io_info)
     if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0)
         HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
 
+    io_info->dxpl_cache->xfer_opt_mode = H5FD_MPIO_COLLECTIVE_IO;
+    if(H5P_set (dx_plist, H5D_XFER_IO_XFER_OPT_MODE_NAME, &io_info->dxpl_cache->xfer_opt_mode) < 0)
+        HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
+
+
     /* Set the pointers to the MPI-specific routines */
     io_info->ops.read = H5D_mpio_select_read;
     io_info->ops.write = H5D_mpio_select_write;
@@ -485,6 +538,7 @@ H5D_ioinfo_make_coll(H5D_io_info_t *io_info)
      * to user.
      */
     io_info->xfer_mode_changed=FALSE;
+    io_info->xfer_opt_mode_changed=FALSE;
 
 done:
     FUNC_LEAVE_NOAPI(ret_value)
@@ -556,7 +610,7 @@ H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info,
     printf("num_chunkf = %d\n",num_chunkf);
 #endif
 
-    /* Determine the minimum # of chunks for all processes */
+    /* Determine the total number of chunks summed over all processes */
     if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, sum_chunkf, 1, MPI_INT, MPI_SUM, io_info->comm)))
         HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
 
@@ -569,8 +623,8 @@ done:
  * Function:	H5D_contig_collective_io
  *
  * Purpose:	Wrapper Routine for H5D_inter_collective_io
-                The starting address of contiguous storage is passed
- *
+                The starting address of contiguous storage is passed 
+ *               
  *
  * Return:	Non-negative on success/Negative on failure
  *
@@ -581,11 +635,11 @@ done:
  *-------------------------------------------------------------------------
  */
 herr_t
-H5D_contig_collective_io(H5D_io_info_t *io_info,
+H5D_contig_collective_io(H5D_io_info_t *io_info, 
 			 const H5S_t *file_space,
 			 const H5S_t *mem_space,
 			 const void *buf,
-			 hbool_t do_write)
+			 hbool_t do_write) 
 {
 
 
@@ -607,8 +661,8 @@ H5D_contig_collective_io(H5D_io_info_t *io_info,
 #endif
     if(H5D_inter_collective_io(io_info,file_space,mem_space,addr,buf,do_write)<0)
 	HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
-
- done:
+      
+ done: 
 
     FUNC_LEAVE_NOAPI(ret_value)
 } /* end H5D_contig_collective_io */
@@ -616,10 +670,10 @@ H5D_contig_collective_io(H5D_io_info_t *io_info,
 /*-------------------------------------------------------------------------
  * Function:	H5D_chunk_collective_io
  *
- * Purpose:	Routine for
-                1) choose an IO option:
+ * Purpose:	Routine for 
+                1) choose an IO option: 
 		      a) One collective IO defined by one MPI derived datatype to link through all chunks
-		or    b) multiple chunk IOs,to do MPI-IO for each chunk, the IO mode may be adjusted
+		or    b) multiple chunk IOs,to do MPI-IO for each chunk, the IO mode may be adjusted 
                          due to the selection pattern for each chunk.
  *              For option a)
 			1. Sort the chunk address, obtain chunk info according to the sorted chunk address
@@ -633,7 +687,7 @@ H5D_contig_collective_io(H5D_io_info_t *io_info,
                         2. Depending on whether the IO mode is collective or independent or none,
                            Create either MPI derived datatype for each chunk to do collective IO or just do independent IO
                         3. Set up collective IO property list for collective mode
-                        4. DO IO
+                        4. DO IO               
  *
  * Return:	Non-negative on success/Negative on failure
  *
@@ -643,44 +697,46 @@ H5D_contig_collective_io(H5D_io_info_t *io_info,
  *
  *-------------------------------------------------------------------------
  */
-herr_t
-H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool_t do_write)
+herr_t 
+H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool_t do_write) 
 {
 
     int               io_option = H5D_MULTI_CHUNK_IO_MORE_OPT;
     int               sum_chunk = 0,mpi_size;
     unsigned          one_link_chunk_io_threshold;
-    H5P_genplist_t    *plist;
+    H5P_genplist_t    *plist; 
     H5FD_mpio_chunk_opt_t chunk_opt_mode;
 #ifdef H5_HAVE_INSTRUMENTED_LIBRARY
     htri_t            check_prop,temp_not_link_io = FALSE;
     int               prop_value,new_value;
 #endif
-    herr_t            ret_value = SUCCEED;
+    herr_t            ret_value = SUCCEED;    
 
 
     FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_collective_io)
 
     assert (IS_H5FD_MPIO(io_info->dset->oloc.file));
-
+    
     /* Obtain the data transfer properties */
     if(NULL == (plist = H5I_object(io_info->dxpl_id)))
         HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
-
+    
     /* Check the optional property list on what to do with collective chunk IO. */
     chunk_opt_mode=(H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME);
 #ifdef KENT
     printf("chunk_opt_mode = %d\n",chunk_opt_mode);
 #endif
-
-    if(chunk_opt_mode == H5FD_MPIO_CHUNK_ONE_IO) io_option = H5D_ONE_LINK_CHUNK_IO;/*no opt*/
+    
+    if(chunk_opt_mode == H5FD_MPIO_CHUNK_ONE_IO) {
+        io_option = H5D_ONE_LINK_CHUNK_IO;/*no opt*/
+    }
     else if(chunk_opt_mode == H5FD_MPIO_CHUNK_MULTI_IO) io_option = H5D_MULTI_CHUNK_IO;/*no opt */
     else {
-       if(H5D_mpio_get_sum_chunk(io_info,fm,&sum_chunk)<0)
-  	       HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes");
+       if(H5D_mpio_get_sum_chunk(io_info,fm,&sum_chunk)<0)   
+  	       HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes"); 
        if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
 	 HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
-
+    
        if(NULL == (plist = H5I_object(io_info->dxpl_id)))
          HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
 
@@ -732,8 +788,8 @@ H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool
 	          HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to get property value");
               }
         }
-
-
+       
+              
 #endif
 #ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
     if(io_option == H5D_ONE_LINK_CHUNK_IO ) io_option = H5D_MULTI_CHUNK_IO ;/* We can not do this with one chunk IO. */
@@ -745,9 +801,15 @@ H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool
       if(H5D_link_chunk_collective_io(io_info,fm,buf,do_write,sum_chunk)<0)
 	HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish linked chunk MPI-IO");
     }
+      
+    else if(io_option == H5D_MULTI_CHUNK_IO) {
+      if(H5D_multi_chunk_collective_io_no_opt(io_info,fm,buf,do_write)<0)
+        HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish multiple chunk MPI-IO");
+    }
 
+      
     else { /*multiple chunk IOs without opt */
-
+    
       if(H5D_multi_chunk_collective_io(io_info,fm,buf,do_write)<0)
         HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish multiple chunk MPI-IO");
 
@@ -764,7 +826,7 @@ done:
 			1. Sort the chunk address and chunk info
                         2. Build up MPI derived datatype for each chunk
                         3. Build up the final MPI derived datatype
-			4. Use common collective IO routine to do MPI-IO
+			4. Use common collective IO routine to do MPI-IO 
 
  *
  * Return:	Non-negative on success/Negative on failure
@@ -788,9 +850,9 @@ H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
       size_t           mpi_buf_count;
       size_t           mpi_file_count;
       hbool_t	       mbt_is_derived=0,      /* Whether the buffer (memory) type is derived and needs to be free'd */
-		       mft_is_derived=0;      /* Whether the file type is derived and needs to be free'd */
-
-      int              mpi_size,mpi_code;              /* MPI return code */
+		       mft_is_derived=0;      /* Whether the file type is derived and needs to be free'd */  
+     
+      int              mpi_size,mpi_code;              /* MPI return code */ 
 
       int               i,num_chunk=0,total_chunks;
       size_t            ori_num_chunk;
@@ -814,12 +876,8 @@ H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
       H5D_common_coll_info_t coll_info;
       H5D_chunk_addr_info_t*  chunk_addr_info_array=NULL;
 
-#ifdef CC_PERF
-      char *bc_percent = NULL;
-      char *bcc_percent = NULL;
-#endif
-      herr_t            ret_value = SUCCEED;
-
+      herr_t            ret_value = SUCCEED;    
+      
       FUNC_ENTER_NOAPI_NOINIT(H5D_link_chunk_collective_io)
       ori_total_chunks = fm->total_chunks;
       H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int);
@@ -829,7 +887,7 @@ H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
         H5SL_node_t *chunk_node;
         H5D_chunk_info_t *chunk_info;
         H5D_storage_t  store;
-
+ 
         chunk_node = H5SL_first(fm->fsel);
 	if(chunk_node == NULL) {
 	  if(H5D_istore_chunkmap(io_info,total_chunks,&chunk_base_addr,fm->down_chunks)<0)
@@ -846,7 +904,7 @@ H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
 
 	  if(HADDR_UNDEF==(chunk_base_addr = H5D_istore_get_addr(io_info,NULL)))
 	    HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
-
+	
 #ifdef KENT
 printf("before inter_collective_io for total chunk = 1 \n");
 #endif
@@ -864,7 +922,7 @@ printf("before inter_collective_io for total chunk = 1 \n");
 printf("total_chunks = %d\n",(int)total_chunks);
 #endif
 
-
+         
       if(num_chunk == 0) total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks);
       else
       {
@@ -882,29 +940,14 @@ printf("total_chunks = %d\n",(int)total_chunks);
 	HGOTO_ERROR(H5E_DATATYPE, H5E_BADSIZE, FAIL, "datatype size invalid");
       dst_type_size = src_type_size;
 
-
-#ifdef CC_PERF
-      /* "bcc" means 'b-tree iterately obtain all chunk addresses collectively',
-	 "bc" means 'b-tree iterately obtain all chunk addresses individually',
-          the default one means 'obtaining the chunk address individually',
-      */
-
-      if(bcc_percent=getenv("BCC_PERCENT")){
-         bsearch_coll_chunk_threshold  = atoi(bcc_percent);
-         assert((bsearch_coll_chunk_threshold >=0) &&(bsearch_coll_chunk_threshold <=100));
-      }
-      else
-         bsearch_coll_chunk_threshold  = H5D_ALL_CHUNK_ADDR_THRES_COL;
-#else
-      bsearch_coll_chunk_threshold  = H5D_ALL_CHUNK_ADDR_THRES_COL; /*This number may be changed according to the performance study */
-#endif
+      bsearch_coll_chunk_threshold  = H5D_ALL_CHUNK_ADDR_THRES_COL;
 
       if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
 	 HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
 
-      /* Calculate the actual threshold to obtain all chunk addresses collectively
+      /* Calculate the actual threshold to obtain all chunk addresses collectively 
          The bigger this number is, the more possible the use of obtaining chunk address collectively. */
-      /* For non-optimization one-link IO,
+      /* For non-optimization one-link IO, 
          actual bsearch threshold is always 0,
          we would always want to obtain the chunk addresses individually
          for each process. */
@@ -914,53 +957,17 @@ printf("total_chunks = %d\n",(int)total_chunks);
          &&(sum_chunk/mpi_size >= H5D_ALL_CHUNK_ADDR_THRES_COL_NUM))
 	many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_COL;
 
-      else {
-
-#ifdef CC_PERF
-	if(bc_percent=getenv("BC_PERCENT")){
-	  bsearch_chunk_ratio  = atoi(bc_percent);
-	  assert((bsearch_chunk_ratio<=100)&&(bsearch_chunk_ratio>=0));
-	}
-	else
-	  bsearch_chunk_ratio  = H5D_ALL_CHUNK_ADDR_THRES_IND;
-#else
-         bsearch_chunk_ratio = H5D_ALL_CHUNK_ADDR_THRES_IND; /*This number may be changed according to the performance study */
-#endif
-
-	 /* This threshold is to check whether we can use iterator to obtain all chunk addresses.
-	    The unit of the threshold is the number of chunks. The value should be at least 1.
-            It can be calculated as follows:
-
-	    if(total_chunks*bsearch_chunk_ratio/100 <=1)
-	      bsearch_chunk_threahold = 1;
-            else
-	      bsearch_chunk_threshold = total_chunks*bsearch_chunk_ratio/100;
-	    In order to make the caluculation more efficient,
-	    we use the following approximate formula to calculate the threshold.
-
-	    bsearch_chunk_threshold = 1+ (total_chunks*bsearch_chunk_ratio-99)/100;
-
-	    The only difference is when total_chunks* besearch_chunk_ratio == 100n+99;
-            the approximate formula will give value (n+1) instead of n for threshold.
-	    That shouldn't matter much from our persective.
-	 */
-
-        bsearch_chunk_threshold = 1 +(total_chunks*bsearch_chunk_ratio-99)/100;
-	if(num_chunk > bsearch_chunk_threshold) many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_IND;
-        if((sum_chunk == 0) && (total_chunks >= H5D_ALL_CHUNK_ADDR_THRES_IND_NUM))
-          many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_IND;
-      }
 #ifdef KENT
 printf("before sorting the chunk address \n");
 #endif
-      /* Sort the chunk address
+      /* Sort the chunk address 
          when chunk optimization selection is either H5D_OBTAIN_*/
       if(num_chunk == 0){ /* special case: this process doesn't select anything */
          if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array,fm->down_chunks)<0)
              HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
          chunk_base_addr = total_chunk_addr_array[0];
       }
-
+ 
       else {
          if(H5D_sort_chunk(io_info,fm,chunk_addr_info_array,many_chunk_opt)<0)
         	 HGOTO_ERROR (H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to sort chunk address");
@@ -969,8 +976,8 @@ printf("before sorting the chunk address \n");
 #ifdef KENT
 printf("after sorting the chunk address \n");
 #endif
-
-      /* Obtain MPI derived datatype from all individual chunks */
+      
+      /* Obtain MPI derived datatype from all individual chunks */ 
       for ( i = 0; i < num_chunk; i++) {
 	  /* Disk MPI derived datatype */
           if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.fspace,src_type_size,&chunk_ftype[i],
@@ -981,7 +988,7 @@ printf("after sorting the chunk address \n");
           if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.mspace,dst_type_size,&chunk_mtype[i],
                                        &mpi_buf_count,&mpi_buf_extra_offset,&mbt_is_derived)<0)
 	       	HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buf type");
-
+           
           /* Chunk address relative to the first chunk */
 	  chunk_addr_info_array[i].chunk_addr -= chunk_base_addr;
           H5_ASSIGN_OVERFLOW(chunk_disp_array[i],chunk_addr_info_array[i].chunk_addr,haddr_t,MPI_Aint);
@@ -989,7 +996,7 @@ printf("after sorting the chunk address \n");
 
       blocklen_value = 1;
       if(num_chunk){
-
+	
 	/* initialize the buffer with the constant value 1 */
 	H5V_array_fill(blocklen,&blocklen_value,sizeof(int),(size_t)num_chunk);
 
@@ -1022,7 +1029,7 @@ printf("after sorting the chunk address \n");
       else {/* no selection at all for this process */
 	chunk_final_ftype = MPI_BYTE;
 	chunk_final_mtype = MPI_BYTE;
-
+	
 	/* buffer, file derived datatypes should be true */
 	coll_info.mbt_is_derived = 0;
 	coll_info.mft_is_derived = 0;
@@ -1032,7 +1039,7 @@ printf("after sorting the chunk address \n");
 #ifdef KENT
 printf("before coming to final collective IO\n");
 #endif
-
+      
       if(H5D_final_collective_io(io_info,&chunk_final_ftype,&chunk_final_mtype,&coll_info,buf,do_write)<0)
 	HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish MPI-IO");
 
@@ -1042,7 +1049,7 @@ printf("before freeing memory inside  H5D_link_collective_io ret_value = %d\n",r
 #endif
 
 
-
+      
      if (fm->total_chunks != 1) {
        if(num_chunk == 0) HDfree(total_chunk_addr_array);
        else {
@@ -1072,7 +1079,7 @@ printf("before leaving H5D_link_collective_io ret_value = %d\n",ret_value);
                 1. Use MPI_gather and MPI_Bcast to obtain IO mode in each chunk(collective/independent/none)
                 2. Depending on whether the IO mode is collective or independent or none,
                    Create either MPI derived datatype for each chunk or just do independent IO
-                3. Use common collective IO routine to do MPI-IO
+                3. Use common collective IO routine to do MPI-IO               
  *
  * Return:	Non-negative on success/Negative on failure
  *
@@ -1082,8 +1089,8 @@ printf("before leaving H5D_link_collective_io ret_value = %d\n",ret_value);
  *
  *-------------------------------------------------------------------------
  */
-static herr_t
-H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool_t do_write)
+static herr_t 
+H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool_t do_write) 
 {
 
       int               i,total_chunk;
@@ -1096,7 +1103,7 @@ H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
       H5D_storage_t     store;                /* union of EFL and chunk pointer in file space */
       hbool_t           select_chunk;
       hbool_t 	        last_io_mode_coll = TRUE;
-      herr_t            ret_value = SUCCEED;
+      herr_t            ret_value = SUCCEED;    
 #ifdef KENT
       int mpi_rank;
 #endif
@@ -1117,7 +1124,7 @@ H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
 #endif
 
       /* obtain IO option for each chunk */
-      if(H5D_obtain_mpio_mode(io_info,fm,chunk_io_option,chunk_addr)<0)
+      if(H5D_obtain_mpio_mode(io_info,fm,chunk_io_option,chunk_addr)<0) 
 	HGOTO_ERROR (H5E_DATASET, H5E_CANTRECV, FAIL, "unable to obtain MPIO mode");
 
       for( i = 0; i<total_chunk;i++){
@@ -1140,7 +1147,7 @@ printf("mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
           if(NULL ==(chunk_node = H5SL_first(fm->fsel)))
 	    HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list");
 #else
-
+	  
 	   if(NULL ==(chunk_node = H5SL_first(fm->fsel)))
 	    HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list");
 	    while(chunk_node){
@@ -1161,30 +1168,30 @@ printf("mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
 #endif
 	}
 
-        if(chunk_io_option[i] == 1){ /*collective IO for this chunk,
+        if(chunk_io_option[i] == 1){ /*collective IO for this chunk, 
 				       note: even there is no selection for this process,
                                              the process still needs to contribute MPI NONE TYPE.*/
 #ifdef KENT
 printf("inside collective chunk IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
 #endif
-
+	
 	  if(!last_io_mode_coll)
 	  /* Switch back to collective I/O */
               if(H5D_ioinfo_make_coll(io_info) < 0)
                  HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
-
+          
 	    if(select_chunk){
 	      if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,
 			             chunk_addr[i],buf,do_write )<0)
 	        HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
-
+	       
 	    }
 	    else{
 	     if(H5D_inter_collective_io(io_info,NULL,NULL,
 			             chunk_addr[i],buf,do_write )<0)
 	        HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
-
-	    }
+	       
+	    } 
               last_io_mode_coll = TRUE;
 
 	}
@@ -1192,8 +1199,9 @@ printf("inside collective chunk IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i
 #ifdef KENT
 printf("inside independent IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
 #endif
-
+	
 	  HDassert(chunk_io_option[i] == 0);
+#if 0
 	  if(!select_chunk) continue; /* this process has nothing to do with this chunk, continue! */
 	  if(last_io_mode_coll)
 	  /* Switch to independent I/O */
@@ -1206,16 +1214,16 @@ printf("inside independent IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
 			 chunk_info->fspace,chunk_info->mspace,0,
 			 buf);
 	      /* Check return value of the write */
-	    if (ret_value<0)
+	    if (ret_value<0) 
 	      HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
 	  }
 	  else {
 	     ret_value = (io_info->ops.read)(io_info,
 			  chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type),
 			  chunk_info->fspace,chunk_info->mspace,0,
-	        	  buf);
+	        	  buf);			   
 	      /* Check return value from optimized write */
-	      if (ret_value<0)
+	      if (ret_value<0) 
 		HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
 	  }
 
@@ -1227,7 +1235,47 @@ printf("inside independent IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
 	  /* Switch back to collective I/O */
               if(H5D_ioinfo_make_coll(io_info) < 0)
                  HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
+#endif
+
+      {
+#ifdef KENT
+printf("coming into independent IO with file set view\n");
+           /* if(H5Pset_dxpl_mpio_collective_opt(io_info->dxpl_id,H5FD_MPIO_INDIVIDUAL_IO)<0)
+               HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL,"couldn't set individual MPI-IO with the file setview");
+printf("after setting the property list\n");
+*/
+#endif
+	  if(!last_io_mode_coll)
+	  /* using independent I/O with file setview.*/
+            if(H5D_ioinfo_make_coll_opt(io_info) < 0)
+                 HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
+            if(select_chunk){
+              if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,
+                                     chunk_addr[i],buf,do_write )<0)
+                HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
+            }
+            else{
+             if(H5D_inter_collective_io(io_info,NULL,NULL,
+                                     chunk_addr[i],buf,do_write )<0)
+                HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
 
+            }
+#ifdef KENT
+printf("after inter collective IO\n");
+ /*           if(H5Pset_dxpl_mpio_collective_opt(io_info->dxpl_id,H5FD_MPIO_COLLECTIVE_IO)<0)
+               HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL,"couldn't set collective MPI-IO ");
+*/
+#endif
+              last_io_mode_coll = FALSE;
+
+
+          }
+        }
+      }
+      if(!last_io_mode_coll)
+	  /* Switch back to collective I/O */
+              if(H5D_ioinfo_make_coll(io_info) < 0)
+                 HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
   done:
     HDfree(chunk_io_option);
     HDfree(chunk_addr);
@@ -1235,13 +1283,155 @@ printf("inside independent IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
     FUNC_LEAVE_NOAPI(ret_value)
 } /* end H5D_multi_chunk_collective_io */
 
+/*-------------------------------------------------------------------------
+ * Function:	H5D_multi_chunk_collective_io_no_opt
+ *
+ * Purpose:	Perform chunk-by-chunk I/O without any HDF5-level optimization
+ *              (no gather/broadcast of per-chunk I/O modes).  Each chunk in
+ *              fm->fsel is visited in skip-list order.
+ *
+ *              HDF5's internal independent I/O cannot handle non-contiguous
+ *              storage (storage with holes) efficiently: one independent call
+ *              may turn into many small disk I/Os.  Independent I/O with an MPI
+ *              derived datatype can replace it for chunks that are poor
+ *              candidates for collective I/O, but according to the authors'
+ *              performance study that may not pay for the gather/scatter
+ *              overhead.  The original collective-I/O-per-chunk approach is
+ *              therefore kept as a user option, selected (per the original
+ *              note) with
+ *              H5Pset_dxpl_mpio_chunk_opt(dxpl_id,H5FD_MPIO_CHUNK_MULTI_IO);
+ *              HDF5 then leaves the management to MPI-IO.
+ *
+ *              The first min_num_chunk chunks (the value reported by
+ *              H5D_mpio_get_min_chunk) go through H5D_inter_collective_io;
+ *              any further chunks are transferred independently.  When
+ *              H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS is not defined, every
+ *              chunk is transferred independently instead.
+ *
+ * Return:	Non-negative on success/Negative on failure
+ *
+ * Programmer:
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool_t do_write)
+{
+      int               count_chunk,min_num_chunk; /* Chunks visited so far / collective chunk limit */
+      haddr_t           chunk_addr;           /* File address of the current chunk */
+      H5SL_node_t      *chunk_node;           /* Current node in chunk skip list */
+      H5D_storage_t     store;                /* union of EFL and chunk pointer in file space */
+      herr_t            ret_value = SUCCEED;
+#ifdef KENT
+      int mpi_rank;
+#endif
+
+      FUNC_ENTER_NOAPI_NOINIT(H5D_multi_chunk_collective_io_no_opt)
+#ifdef KENT
+      mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file);
+      printf("coming to multi_chunk_collective_io_no_opt\n");
+#endif
+
+      /* Chunks beyond this count are transferred independently (see below) */
+      if(H5D_mpio_get_min_chunk(io_info,fm,&min_num_chunk)<0)
+         HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunk");
+      count_chunk = 0;
+
+      /* Get first node in chunk skip list */
+      chunk_node=H5SL_first(fm->fsel);
+      /* Iterate through chunks to be operated on */
+      while(chunk_node) {
+           H5D_chunk_info_t *chunk_info;   /* chunk information */
+           hbool_t make_ind, make_coll;    /* Flags to indicate that the MPI mode should change */
+
+           /* Get the actual chunk information from the skip list node */
+           chunk_info=H5SL_item(chunk_node);
+
+           /* Set dataset storage for I/O info */
+           io_info->store=&store;
+
+           /* Pass in chunk's coordinates in a union. */
+           store.chunk.offset = chunk_info->coords;
+           store.chunk.index = chunk_info->index;
+
+           /* Reset flags for changing parallel I/O mode */
+           make_ind = make_coll = FALSE;
+
+           count_chunk++;
+           /* Chunks beyond the minimum number of chunks are done with independent I/O */
+           if(count_chunk > min_num_chunk) {
+               /* Switch to independent I/O (permanently) */
+               make_ind = TRUE;
+           }
+
+#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+/* This case needs to be improved to check if the selected space
+   is regular. If all selections are regular, collective IO can still be done.
+   However, since we find an MPI-IO bug at a DOE machine(mcr) that cannot
+   handle collective I/O selection for this case correctly,
+   we turn off this optimization but leave the following code
+   for future optimization. Otherwise, the following else {} doesn't make sense.
+   KY 2006/8/4/ */
+           else {
+               /* Switch to independent I/O (temporarily) */
+               make_ind = TRUE;
+               make_coll = TRUE;
+           } /* end else */
+#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */
+
+           /* Switch to independent I/O */
+           if(make_ind)
+               if(H5D_ioinfo_make_ind(io_info) < 0)
+                   HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
+
+           if(make_ind) { /* independent I/O */
+               if(do_write) {
+                   ret_value = (io_info->ops.write)(io_info,
+                            chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type),
+                            chunk_info->fspace,chunk_info->mspace,0,
+                            buf);
+                   /* Check return value of the write */
+                   if (ret_value<0)
+                       HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+               }
+               else {
+                   ret_value = (io_info->ops.read)(io_info,
+                             chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type),
+                             chunk_info->fspace,chunk_info->mspace,0,
+                             buf);
+                   /* Check return value of the read */
+                   if (ret_value<0)
+                       HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+               }
+           }
+           else { /* collective I/O */
+               if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL)))
+                   HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
+               if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,
+                                      chunk_addr,buf,do_write )<0)
+                   HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
+           }
+
+           /* Undo a temporary switch so the next chunk starts in collective mode */
+           if(make_coll)
+               if(H5D_ioinfo_make_coll(io_info) < 0)
+                   HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
+           /* Get the next chunk node in the skip list */
+           chunk_node=H5SL_next(chunk_node);
+      } /* end while */
+
+  done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_multi_chunk_collective_io_no_opt */
 
 /*-------------------------------------------------------------------------
  * Function:	H5D_inter_collective_io
  *
  * Purpose:	Routine for the shared part of collective IO between multiple chunk
                 collective IO and contiguous collective IO
-
+		
  *
  * Return:	Non-negative on success/Negative on failure
  *
@@ -1251,16 +1441,16 @@ printf("inside independent IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i);
  *
  *-------------------------------------------------------------------------
  */
-static herr_t
+static herr_t 
 H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space,const H5S_t *mem_space,
-			 haddr_t addr, const void *buf, hbool_t do_write )
+			 haddr_t addr, const void *buf, hbool_t do_write ) 
 {
 
       size_t	        mpi_buf_count, mpi_file_count;     /* Number of "objects" to transfer */
       MPI_Datatype      mpi_file_type,mpi_buf_type;
       hsize_t	        mpi_buf_offset, mpi_file_offset;   /* Offset within dataset where selection (ie. MPI type) begins */
       hbool_t	        mbt_is_derived=0,      /* Whether the buffer (memory) type is derived and needs to be free'd */
-		        mft_is_derived=0;      /* Whether the file type is derived and needs to be free'd */
+		        mft_is_derived=0;      /* Whether the file type is derived and needs to be free'd */  
       H5D_common_coll_info_t coll_info;
       herr_t       ret_value = SUCCEED;  /* return value */
 
@@ -1274,11 +1464,11 @@ H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space,const H5S
 	if(H5S_mpio_space_type(mem_space,H5T_get_size(io_info->dset->shared->type),
 			       &mpi_buf_type,&mpi_buf_count,&mpi_buf_offset,&mbt_is_derived)<0)
 	       HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buffer type");
-
+	    
       }
       else {
 	    /* For non-selection, participate with a none MPI derived datatype, the count is 0.  */
-	    mpi_buf_type   = MPI_BYTE;
+	    mpi_buf_type   = MPI_BYTE;   
 	    mpi_file_type  = MPI_BYTE;
 	    mpi_file_count = 0;
 	    mpi_buf_count  = 0;
@@ -1305,7 +1495,7 @@ printf("before leaving inter_collective_io ret_value = %d\n",ret_value);
  * Function:	H5D_final_collective_io
  *
  * Purpose:	Routine for the common part of collective IO with different storages.
-
+		
  *
  * Return:	Non-negative on success/Negative on failure
  *
@@ -1315,13 +1505,13 @@ printf("before leaving inter_collective_io ret_value = %d\n",ret_value);
  *
  *-------------------------------------------------------------------------
  */
-static herr_t
+static herr_t 
 H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Datatype *mpi_buf_type,
-			 H5D_common_coll_info_t* coll_info, const void *buf, hbool_t do_write)
+			 H5D_common_coll_info_t* coll_info, const void *buf, hbool_t do_write) 
 {
 
 
-    int               mpi_code;              /* MPI return code */
+    int               mpi_code;              /* MPI return code */ 
     hbool_t	      plist_is_setup=0;      /* Whether the dxpl has been customized */
     herr_t            ret_value = SUCCEED;
 
@@ -1338,7 +1528,7 @@ H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Da
      plist_is_setup=1;
 #ifdef KENT
      HDfprintf(stdout,"chunk addr %Hu\n",coll_info->chunk_addr);
-     printf("mpi_buf_count %d\n",coll_info->mpi_buf_count);
+     printf("mpi_buf_count %d\n",coll_info->mpi_buf_count);	
 #endif
      if(do_write) {
 	ret_value = (io_info->ops.write)(io_info,
@@ -1348,7 +1538,7 @@ H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Da
 #ifdef KENT
         printf("ret_value after final collective IO= %d\n",ret_value);
 #endif
-	if (ret_value<0)
+	if (ret_value<0) 
 	    HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
 	      }
      else {
@@ -1356,11 +1546,11 @@ H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Da
 	        coll_info->mpi_buf_count,0,NULL,NULL,coll_info->chunk_addr,
                 buf);
 	   /* Check return value from optimized write */
-	 if (ret_value<0)
+	 if (ret_value<0) 
 	    HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
 	      }
  done:
-
+   
      /* Reset the dxpl settings */
       if(plist_is_setup) {
         if(H5FD_mpi_teardown_collective(io_info->dxpl_id)<0)
@@ -1371,7 +1561,7 @@ H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Da
       if (coll_info->mbt_is_derived) {
 	if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_buf_type )))
             HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
-
+	
       }
       if (coll_info->mft_is_derived) {
 	if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_file_type )))
@@ -1392,7 +1582,7 @@ H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Da
 
    Description:
                 root will collective all chunk addresses and broadcast towards other processes.
-
+   
    Parameters:
 
                 Input: H5D_io_info_t* io_info,
@@ -1430,7 +1620,7 @@ H5D_pre_sort_chunk(H5D_io_info_t *io_info,int total_chunks,haddr_t total_chunk_a
   if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_addrtype)))
     HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
 
-
+  
   mpi_type_cleanup = 1;
 
   if(mpi_rank == root) {
@@ -1441,7 +1631,7 @@ H5D_pre_sort_chunk(H5D_io_info_t *io_info,int total_chunks,haddr_t total_chunk_a
   /* Broadcasting the MPI_IO option info. and chunk address info. */
    if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(total_chunk_addr_array,1,chunk_addrtype,root,comm)))
      HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code);
-
+  
 done:
 
    if(mpi_type_cleanup){
@@ -1462,12 +1652,12 @@ done:
                 For most cases, the chunk address has already been sorted in increasing order.
 		The special sorting flag is used to optimize this common case.
                 quick sort is used for necessary sorting.
-
+   
    Parameters:
                 Input: H5D_io_info_t* io_info,
 		       fm_map *fm(global chunk map struct)
-		Input/Output:  H5D_chunk_addr_info_t chunk_addr_info_array[]   : array to store chunk address and information
-                       many_chunk_opt                         : flag to optimize the way to obtain chunk addresses
+		Input/Output:  H5D_chunk_addr_info_t chunk_addr_info_array[]   : array to store chunk address and information 
+                       many_chunk_opt                         : flag to optimize the way to obtain chunk addresses 
                                                                 for many chunks
  *
  * Return:	Non-negative on success/Negative on failure
@@ -1479,7 +1669,7 @@ done:
  *-------------------------------------------------------------------------
  */
 
-static herr_t
+static herr_t 
 H5D_sort_chunk(H5D_io_info_t * io_info,
 	       fm_map *fm,
 	       H5D_chunk_addr_info_t chunk_addr_info_array[],
@@ -1500,7 +1690,7 @@ H5D_sort_chunk(H5D_io_info_t * io_info,
     H5D_storage_t     store;              /*union of EFL and chunk pointer in file space */
     hbool_t           do_sort = FALSE;
     herr_t	      ret_value = SUCCEED;	/*return value		*/
-
+  
     FUNC_ENTER_NOAPI_NOINIT(H5D_sort_chunk)
 
     num_chunks =  H5SL_count(fm->fsel);
@@ -1511,14 +1701,11 @@ printf("many_chunk_opt= %d\n",many_chunk_opt);
     /* If we need to optimize the way to obtain the chunk address */
     if(many_chunk_opt != H5D_OBTAIN_ONE_CHUNK_ADDR_IND){
 
+      int mpi_rank, root;
       total_chunks = (int)fm->total_chunks;
       total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks);
       tchunk_addr_cleanup = 1;
 
-      if(many_chunk_opt == H5D_OBTAIN_ALL_CHUNK_ADDR_COL) {/* We will broadcast the array from the root process */
-
-	int mpi_rank, root;
-
 #ifdef KENT
 printf("Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n");
 #endif
@@ -1541,12 +1728,7 @@ printf("Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n");
 	/* Broadcasting the MPI_IO option info. and chunk address info. */
 	if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(total_chunk_addr_array,1,chunk_addrtype,root,io_info->comm)))
 	   HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code);
-      }
 
-      else { /* Obtain all chunk addresses independently */
-	if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array,fm->down_chunks)<0)
-	     HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
-      }
     }
 
     /* Get first node in chunk skip list */
@@ -1562,8 +1744,11 @@ printf("Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n");
     if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){
       if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL)))
 	  HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
+#ifdef KENT
+   printf("coming to obtain each chunk address individually \n");
+#endif
     }
-    else
+    else 
        chunk_addr = total_chunk_addr_array[chunk_info->index];
     chunk_addr_info_array[i].chunk_addr  = chunk_addr;
     chunk_addr_info_array[i].chunk_info  = *chunk_info;
@@ -1574,12 +1759,12 @@ printf("Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n");
             chunk_info         = H5SL_item(chunk_node);
             store.chunk.offset = chunk_info->coords;
             store.chunk.index  = chunk_info->index;
-
+	    
 	    if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){
 	      if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL)))
 		HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
 	    }
-	    else
+	    else 
 	      chunk_addr = total_chunk_addr_array[chunk_info->index];
 
 	    if(chunk_addr < chunk_addr_info_array[i].chunk_addr) do_sort = TRUE;
@@ -1605,7 +1790,7 @@ done:
     }
     FUNC_LEAVE_NOAPI(ret_value)
 } /* end H5D_sort_chunk() */
-
+    
 
 /*-------------------------------------------------------------------------
  * Function:	H5D_obtain_mpio_mode
@@ -1616,11 +1801,11 @@ done:
    Description:
 
                 1) Each process provides two piece of information for all chunks with selection
-		   a) chunk index
+		   a) chunk index 
                    b) wheather this chunk is regular(for MPI derived datatype not working case)
 
                 2) Gather all the information to the root process
-
+		
 		3) Root process will do the following:
 		   a) Obtain chunk address for all chunks in this data space
 		   b) With the consideration of the user option, calculate IO mode for each chunk
@@ -1628,7 +1813,7 @@ done:
 		      in order to do MPI Bcast only once
                    d) MPI Bcast the IO mode and chunk address information for each chunk.
 		4) Each process then retrieves IO mode and chunk address information to assign_io_mode and chunk_addr.
-
+ 
    Parameters:
 
                 Input: H5D_io_info_t* io_info,
@@ -1645,8 +1830,8 @@ done:
  *-------------------------------------------------------------------------
  */
 
-static herr_t
-H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
+static herr_t 
+H5D_obtain_mpio_mode(H5D_io_info_t* io_info, 
 		     fm_map *fm,
 		     uint8_t assign_io_mode[],
 		     haddr_t chunk_addr[])
@@ -1661,7 +1846,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
   uint8_t*          mergebuf=NULL;
   uint8_t*          tempbuf;
 
-  H5SL_node_t*      chunk_node;
+  H5SL_node_t*      chunk_node;  
   H5D_chunk_info_t* chunk_info;
 
   MPI_Datatype      bastype[2];
@@ -1688,7 +1873,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
   FUNC_ENTER_NOAPI_NOINIT(H5D_obtain_mpio_mode)
 
   /* Assign the rank 0 to the root */
-  root              = 0;
+  root              = 0; 
   comm              = io_info->comm;
 
   /* Obtain the number of process and the current rank of the process */
@@ -1696,7 +1881,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
 	 HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank");
   if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
 	 HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
-
+  
    /* Allocate memory */
   ori_total_chunks      = fm->total_chunks;
   H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int);
@@ -1704,30 +1889,30 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
  /* Obtain the data transfer properties */
   if(NULL == (plist = H5I_object(io_info->dxpl_id)))
        HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
-
+  
   percent_nproc_per_chunk=H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME);
 #if defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) && defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
-
+   
   chunk_opt_mode=(H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME);
 
   if((chunk_opt_mode == H5FD_MPIO_CHUNK_MULTI_IO) || (percent_nproc_per_chunk == 0)){
     if(H5D_istore_chunkmap(io_info,total_chunks,chunk_addr,fm->down_chunks)<0)
-       HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
+       HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");    
     for(ic = 0; ic<total_chunks;ic++)
        assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL;
        goto done;
   }
-#endif
+#endif    
   threshold_nproc_per_chunk = mpi_size * percent_nproc_per_chunk/100;
 
 
   io_mode_info      = (uint8_t *)H5MM_calloc(total_chunks*sizeof(MPI_BYTE));
   mergebuf          = H5MM_malloc((sizeof(haddr_t)+sizeof(MPI_BYTE))*total_chunks);
   tempbuf           = mergebuf + sizeof(MPI_BYTE)*total_chunks;
-  if(mpi_rank == root)
+  if(mpi_rank == root) 
      recv_io_mode_info = (uint8_t *)H5MM_malloc(total_chunks*sizeof(MPI_BYTE)*mpi_size);
 
-
+  
   mem_cleanup       = 1;
 
   chunk_node        = H5SL_first(fm->fsel);
@@ -1750,7 +1935,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
 
       chunk_node = H5SL_next(chunk_node);
   }
-
+  
   /*Create sent MPI derived datatype */
   if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(total_chunks,MPI_BYTE,&stype)))
     HMPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
@@ -1764,7 +1949,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
   basdisp[0]    = 0;
   basdisp[1]    = (MPI_Aint)(sizeof(MPI_BYTE)*total_chunks);/* may need to check overflow */
   bastype[0]    = MPI_BYTE;
-
+ 
   if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(sizeof(haddr_t),MPI_BYTE,&chunk_addrtype)))
     HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code);
   if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_addrtype)))
@@ -1792,7 +1977,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
     int*              ind_this_chunk;
 #endif
 
-    /* pre-computing: calculate number of processes and
+    /* pre-computing: calculate number of processes and 
         regularity of the selection occupied in each chunk */
     nproc_per_chunk = (int*)H5MM_calloc(total_chunks*sizeof(int));
 #if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
@@ -1832,7 +2017,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
 
     /* Calculating MPIO mode for each chunk (collective, independent, none) */
     for(ic = 0; ic < total_chunks; ic++){
-	  if(nproc_per_chunk[ic]>=MAX(2,threshold_nproc_per_chunk)){
+	  if(nproc_per_chunk[ic]>MAX(1,threshold_nproc_per_chunk)){
 #if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
 	    if(!ind_this_chunk[ic]) assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL;
 #else
@@ -1866,7 +2051,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
       new_value = 0;
       if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME,&new_value)<0)
               HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value");
-#else
+#else 
       for(ic = 0; ic < total_chunks; ic++){
         if(assign_io_mode[ic] == H5D_CHUNK_IO_MODE_COL) {
            new_value = 0;
@@ -1893,7 +2078,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
       }
    }
 #endif
-
+ 
 done:
 
    if(mpi_type_cleanup) {
@@ -1910,7 +2095,7 @@ done:
    if(mem_cleanup){
      HDfree(io_mode_info);
      HDfree(mergebuf);
-     if(mpi_rank == root)
+     if(mpi_rank == root) 
        HDfree(recv_io_mode_info);
    }
 
@@ -1923,7 +2108,7 @@ H5D_cmp_chunk_addr(const void *chunk_addr_info1, const void *chunk_addr_info2)
    haddr_t addr1, addr2;
 
    FUNC_ENTER_NOAPI_NOINIT(H5D_cmp_chunk_addr)
-
+   
    addr1 = ((const H5D_chunk_addr_info_t *)chunk_addr_info1)->chunk_addr;
    addr2 = ((const H5D_chunk_addr_info_t *)chunk_addr_info2)->chunk_addr;
author	MuQun Yang <ymuqun@hdfgroup.org>	2006-08-09 03:00:11 (GMT)
committer	MuQun Yang <ymuqun@hdfgroup.org>	2006-08-09 03:00:11 (GMT)
commit	6916816a563532fddc3699a6d5e4adb57212968d (patch)
tree	70121257e539ec369455ebd43119873fd96c7489 /src/H5Dmpio.c
parent	d17d42acd0fbba4b3433937f448c99930553b038 (diff)
download	hdf5-6916816a563532fddc3699a6d5e4adb57212968d.zip hdf5-6916816a563532fddc3699a6d5e4adb57212968d.tar.gz hdf5-6916816a563532fddc3699a6d5e4adb57212968d.tar.bz2