summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Quincey Koziol <koziol@hdfgroup.org> 2000-09-27 18:23:40 (GMT)
committer: Quincey Koziol <koziol@hdfgroup.org> 2000-09-27 18:23:40 (GMT)
commit: 9e4665ba8c24c09e4f9442d0a914339a076e341e (patch)
tree: 871a76959631266000565a018c16eef4ce7e0773
parent: d59cdf21d9b53c2e9c70c004d8e3432193b8521a (diff)
download: hdf5-9e4665ba8c24c09e4f9442d0a914339a076e341e.zip
          hdf5-9e4665ba8c24c09e4f9442d0a914339a076e341e.tar.gz
          hdf5-9e4665ba8c24c09e4f9442d0a914339a076e341e.tar.bz2
[svn-r2606]
Purpose:
    Code Optimization
Description:
    The matrix operations are currently the hot-spot in the library code for
    regular hyperslab operations.
Solution:
    Unrolled loops for 3 of the more heavily used functions
    (H5V_stride_optimize2, H5V_hyper_stride & H5V_hyper_copy) for the common
    cases (i.e. up to 3-D datasets). This squeezes some more blood out of the
    stone (turnip? :-) and improves the h5hypers.c benchmark on baldric by
    another 20-25%.
Platforms tested:
    Solaris 2.6 (i.e. baldric)
-rw-r--r--  src/H5V.c  275
1 file changed, 243 insertions, 32 deletions
diff --git a/src/H5V.c b/src/H5V.c
index dc61b22..a91f191 100644
--- a/src/H5V.c
+++ b/src/H5V.c
@@ -100,17 +100,102 @@ H5V_stride_optimize2(intn *np/*in,out*/, hsize_t *elmt_size/*in,out*/,
/*
* Combine adjacent memory accesses
*/
- while (*np &&
- stride1[*np-1] > 0 &&
- (hsize_t)(stride1[*np-1]) == *elmt_size &&
- stride2[*np-1] > 0 &&
- (hsize_t)(stride2[*np-1]) == *elmt_size) {
- *elmt_size *= size[*np-1];
- if (--*np) {
- stride1[*np-1] += size[*np] * stride1[*np];
- stride2[*np-1] += size[*np] * stride2[*np];
- }
- }
+
+ /* Unroll loop for common cases */
+ switch(*np) {
+ case 1: /* For 0-D datasets (dunno if this ever gets used...) */
+ if((hsize_t)(stride1[0]) == *elmt_size &&
+ (hsize_t)(stride2[0]) == *elmt_size) {
+ *elmt_size *= size[0];
+ --*np; /* *np decrements to a value of 0 now */
+ } /* end if */
+ break;
+
+ case 2: /* For 1-D datasets */
+ if((hsize_t)(stride1[1]) == *elmt_size &&
+ (hsize_t)(stride2[1]) == *elmt_size) {
+ *elmt_size *= size[1];
+ --*np; /* *np decrements to a value of 1 now */
+ stride1[0] += size[1] * stride1[1];
+ stride2[0] += size[1] * stride2[1];
+
+ if((hsize_t)(stride1[0]) == *elmt_size &&
+ (hsize_t)(stride2[0]) == *elmt_size) {
+ *elmt_size *= size[0];
+ --*np; /* *np decrements to a value of 0 now */
+ } /* end if */
+ } /* end if */
+ break;
+
+ case 3: /* For 2-D datasets */
+ if((hsize_t)(stride1[2]) == *elmt_size &&
+ (hsize_t)(stride2[2]) == *elmt_size) {
+ *elmt_size *= size[2];
+ --*np; /* *np decrements to a value of 2 now */
+ stride1[1] += size[2] * stride1[2];
+ stride2[1] += size[2] * stride2[2];
+
+ if((hsize_t)(stride1[1]) == *elmt_size &&
+ (hsize_t)(stride2[1]) == *elmt_size) {
+ *elmt_size *= size[1];
+ --*np; /* *np decrements to a value of 1 now */
+ stride1[0] += size[1] * stride1[1];
+ stride2[0] += size[1] * stride2[1];
+
+ if((hsize_t)(stride1[0]) == *elmt_size &&
+ (hsize_t)(stride2[0]) == *elmt_size) {
+ *elmt_size *= size[0];
+ --*np; /* *np decrements to a value of 0 now */
+ } /* end if */
+ } /* end if */
+ } /* end if */
+ break;
+
+ case 4: /* For 3-D datasets */
+ if((hsize_t)(stride1[3]) == *elmt_size &&
+ (hsize_t)(stride2[3]) == *elmt_size) {
+ *elmt_size *= size[3];
+ --*np; /* *np decrements to a value of 3 now */
+ stride1[2] += size[3] * stride1[3];
+ stride2[2] += size[3] * stride2[3];
+
+ if((hsize_t)(stride1[2]) == *elmt_size &&
+ (hsize_t)(stride2[2]) == *elmt_size) {
+ *elmt_size *= size[2];
+ --*np; /* *np decrements to a value of 2 now */
+ stride1[1] += size[2] * stride1[2];
+ stride2[1] += size[2] * stride2[2];
+
+ if((hsize_t)(stride1[1]) == *elmt_size &&
+ (hsize_t)(stride2[1]) == *elmt_size) {
+ *elmt_size *= size[1];
+ --*np; /* *np decrements to a value of 1 now */
+ stride1[0] += size[1] * stride1[1];
+ stride2[0] += size[1] * stride2[1];
+
+ if((hsize_t)(stride1[0]) == *elmt_size &&
+ (hsize_t)(stride2[0]) == *elmt_size) {
+ *elmt_size *= size[0];
+ --*np; /* *np decrements to a value of 0 now */
+ } /* end if */
+ } /* end if */
+ } /* end if */
+ } /* end if */
+ break;
+
+ default:
+ while (*np &&
+ (hsize_t)(stride1[*np-1]) == *elmt_size &&
+ (hsize_t)(stride2[*np-1]) == *elmt_size) {
+ *elmt_size *= size[*np-1];
+ if (--*np) {
+ stride1[*np-1] += size[*np] * stride1[*np];
+ stride2[*np-1] += size[*np] * stride2[*np];
+ }
+ }
+ break;
+ } /* end switch */
+
FUNC_LEAVE(SUCCEED);
}
@@ -151,6 +236,7 @@ H5V_hyper_stride(intn n, const hsize_t *size,
{
hsize_t skip; /*starting point byte offset */
hsize_t acc; /*accumulator */
+ hsize_t tmp;
int i; /*counter */
FUNC_ENTER(H5V_hyper_stride, (HDabort(), 0));
@@ -164,14 +250,60 @@ H5V_hyper_stride(intn n, const hsize_t *size,
stride[n-1] = 1;
skip = offset ? offset[n-1] : 0;
- /* others */
- for (i=n-2, acc=1; i>=0; --i) {
- hsize_t tmp = acc * (total_size[i+1] - size[i+1]);
- assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
- stride[i] = (hssize_t)tmp; /*overflow checked*/
- acc *= total_size[i+1];
- skip += acc * (offset ? offset[i] : 0);
- }
+ switch(n) {
+ case 2: /* 1-D dataset */
+ tmp = total_size[1] - size[1];
+ assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ stride[0] = (hssize_t)tmp; /*overflow checked*/
+ acc = total_size[1];
+ skip += acc * (offset ? offset[0] : 0);
+ break;
+
+ case 3: /* 2-D dataset */
+ tmp = total_size[2] - size[2];
+ assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ stride[1] = (hssize_t)tmp; /*overflow checked*/
+ acc = total_size[2];
+ skip += acc * (offset ? offset[1] : 0);
+
+ tmp = acc * (total_size[1] - size[1]);
+ assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ stride[0] = (hssize_t)tmp; /*overflow checked*/
+ acc *= total_size[1];
+ skip += acc * (offset ? offset[0] : 0);
+ break;
+
+ case 4: /* 3-D dataset */
+ tmp = total_size[3] - size[3];
+ assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ stride[2] = (hssize_t)tmp; /*overflow checked*/
+ acc = total_size[3];
+ skip += acc * (offset ? offset[2] : 0);
+
+ tmp = acc * (total_size[2] - size[2]);
+ assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ stride[1] = (hssize_t)tmp; /*overflow checked*/
+ acc *= total_size[2];
+ skip += acc * (offset ? offset[1] : 0);
+
+ tmp = acc * (total_size[1] - size[1]);
+ assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ stride[0] = (hssize_t)tmp; /*overflow checked*/
+ acc *= total_size[1];
+ skip += acc * (offset ? offset[0] : 0);
+ break;
+
+ default:
+ /* others */
+ for (i=n-2, acc=1; i>=0; --i) {
+ hsize_t tmp = acc * (total_size[i+1] - size[i+1]);
+ assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ stride[i] = (hssize_t)tmp; /*overflow checked*/
+ acc *= total_size[i+1];
+ skip += acc * (offset ? offset[i] : 0);
+ }
+ break;
+ } /* end switch */
FUNC_LEAVE(skip);
}
@@ -381,6 +513,8 @@ H5V_hyper_copy(intn n, const hsize_t *_size,
hssize_t dst_stride[H5V_HYPER_NDIMS]; /*dest stride info */
hsize_t dst_start, src_start; /*offset to start at */
hsize_t elmt_size = 1; /*element size in bytes */
+ hsize_t tmp1;
+ hsize_t tmp2;
herr_t status; /*return status */
#ifndef NDEBUG
intn i;
@@ -423,19 +557,96 @@ H5V_hyper_copy(intn n, const hsize_t *_size,
dst_start = dst_offset ? dst_offset[n-1] : 0;
src_start = src_offset ? src_offset[n-1] : 0;
- /* others */
- for (ii=n-2, dst_acc=1, src_acc=1; ii>=0; --ii) {
- hsize_t tmp1 = dst_acc * (dst_size[ii+1] - size[ii+1]);
- hsize_t tmp2 = src_acc * (src_size[ii+1] - size[ii+1]);
- assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
- assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
- dst_stride[ii] = (hssize_t)tmp1; /*overflow checked*/
- src_stride[ii] = (hssize_t)tmp2; /*overflow checked*/
- dst_acc *= dst_size[ii+1];
- src_acc *= src_size[ii+1];
- dst_start += dst_acc * (dst_offset ? dst_offset[ii] : 0);
- src_start += src_acc * (src_offset ? src_offset[ii] : 0);
- }
+ /* Unroll loop for common cases */
+ switch(n) {
+ case 2:
+ tmp1 = (dst_size[1] - size[1]);
+ tmp2 = (src_size[1] - size[1]);
+ assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ dst_stride[0] = (hssize_t)tmp1; /*overflow checked*/
+ src_stride[0] = (hssize_t)tmp2; /*overflow checked*/
+ dst_acc = dst_size[1];
+ src_acc = src_size[1];
+ dst_start += dst_acc * (dst_offset ? dst_offset[0] : 0);
+ src_start += src_acc * (src_offset ? src_offset[0] : 0);
+ break;
+
+ case 3:
+ tmp1 = (dst_size[2] - size[2]);
+ tmp2 = (src_size[2] - size[2]);
+ assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ dst_stride[1] = (hssize_t)tmp1; /*overflow checked*/
+ src_stride[1] = (hssize_t)tmp2; /*overflow checked*/
+ dst_acc = dst_size[2];
+ src_acc = src_size[2];
+ dst_start += dst_acc * (dst_offset ? dst_offset[1] : 0);
+ src_start += src_acc * (src_offset ? src_offset[1] : 0);
+
+ tmp1 = dst_acc * (dst_size[1] - size[1]);
+ tmp2 = src_acc * (src_size[1] - size[1]);
+ assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ dst_stride[0] = (hssize_t)tmp1; /*overflow checked*/
+ src_stride[0] = (hssize_t)tmp2; /*overflow checked*/
+ dst_acc *= dst_size[1];
+ src_acc *= src_size[1];
+ dst_start += dst_acc * (dst_offset ? dst_offset[0] : 0);
+ src_start += src_acc * (src_offset ? src_offset[0] : 0);
+ break;
+
+ case 4:
+ tmp1 = (dst_size[3] - size[3]);
+ tmp2 = (src_size[3] - size[3]);
+ assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ dst_stride[2] = (hssize_t)tmp1; /*overflow checked*/
+ src_stride[2] = (hssize_t)tmp2; /*overflow checked*/
+ dst_acc = dst_size[3];
+ src_acc = src_size[3];
+ dst_start += dst_acc * (dst_offset ? dst_offset[2] : 0);
+ src_start += src_acc * (src_offset ? src_offset[2] : 0);
+
+ tmp1 = dst_acc * (dst_size[2] - size[2]);
+ tmp2 = src_acc * (src_size[2] - size[2]);
+ assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ dst_stride[1] = (hssize_t)tmp1; /*overflow checked*/
+ src_stride[1] = (hssize_t)tmp2; /*overflow checked*/
+ dst_acc *= dst_size[2];
+ src_acc *= src_size[2];
+ dst_start += dst_acc * (dst_offset ? dst_offset[1] : 0);
+ src_start += src_acc * (src_offset ? src_offset[1] : 0);
+
+ tmp1 = dst_acc * (dst_size[1] - size[1]);
+ tmp2 = src_acc * (src_size[1] - size[1]);
+ assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ dst_stride[0] = (hssize_t)tmp1; /*overflow checked*/
+ src_stride[0] = (hssize_t)tmp2; /*overflow checked*/
+ dst_acc *= dst_size[1];
+ src_acc *= src_size[1];
+ dst_start += dst_acc * (dst_offset ? dst_offset[0] : 0);
+ src_start += src_acc * (src_offset ? src_offset[0] : 0);
+ break;
+
+ default:
+ /* others */
+ for (ii=n-2, dst_acc=1, src_acc=1; ii>=0; --ii) {
+ hsize_t tmp1 = dst_acc * (dst_size[ii+1] - size[ii+1]);
+ hsize_t tmp2 = src_acc * (src_size[ii+1] - size[ii+1]);
+ assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1)));
+ dst_stride[ii] = (hssize_t)tmp1; /*overflow checked*/
+ src_stride[ii] = (hssize_t)tmp2; /*overflow checked*/
+ dst_acc *= dst_size[ii+1];
+ src_acc *= src_size[ii+1];
+ dst_start += dst_acc * (dst_offset ? dst_offset[ii] : 0);
+ src_start += src_acc * (src_offset ? src_offset[ii] : 0);
+ }
+ break;
+ } /* end switch */
}
#endif /* NO_INLINED_CODE */