From 9e4665ba8c24c09e4f9442d0a914339a076e341e Mon Sep 17 00:00:00 2001 From: Quincey Koziol Date: Wed, 27 Sep 2000 13:23:40 -0500 Subject: [svn-r2606] Purpose: Code Optimization Description: The matrix operations are currently the hot-spot in the library code for regular hyperslab operations. Solution: Unrolled loops for 3 of the more heavily used functions (H5V_stride_optimize2, H5V_hyper_stride & H5V_hyper_copy) for the common cases (i.e. up to 3-D datasets). This squeezes some more blood out of the stone (turnip? :-) and improves the h5hypers.c benchmark on baldric by another 20-25%. Platforms tested: Solaris 2.6 (i.e. baldric) --- src/H5V.c | 275 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 243 insertions(+), 32 deletions(-) diff --git a/src/H5V.c b/src/H5V.c index dc61b22..a91f191 100644 --- a/src/H5V.c +++ b/src/H5V.c @@ -100,17 +100,102 @@ H5V_stride_optimize2(intn *np/*in,out*/, hsize_t *elmt_size/*in,out*/, /* * Combine adjacent memory accesses */ - while (*np && - stride1[*np-1] > 0 && - (hsize_t)(stride1[*np-1]) == *elmt_size && - stride2[*np-1] > 0 && - (hsize_t)(stride2[*np-1]) == *elmt_size) { - *elmt_size *= size[*np-1]; - if (--*np) { - stride1[*np-1] += size[*np] * stride1[*np]; - stride2[*np-1] += size[*np] * stride2[*np]; - } - } + + /* Unroll loop for common cases */ + switch(*np) { + case 1: /* For 0-D datasets (dunno if this ever gets used...) */ + if((hsize_t)(stride1[0]) == *elmt_size && + (hsize_t)(stride2[0]) == *elmt_size) { + *elmt_size *= size[0]; + --*np; /* *np decrements to a value of 0 now */ + } /* end if */ + break; + + case 2: /* For 1-D datasets */ + if((hsize_t)(stride1[1]) == *elmt_size && + (hsize_t)(stride2[1]) == *elmt_size) { + *elmt_size *= size[1]; + --*np; /* *np decrements to a value of 1 now */ + stride1[0] += size[1] * stride1[1]; + stride2[0] += size[1] * stride2[1]; + + if((hsize_t)(stride1[0]) == *elmt_size && + (hsize_t)(stride2[0]) == *elmt_size) { + *elmt_size *= size[0]; + --*np; /* *np decrements to a value of 0 now */ + } /* end if */ + } /* end if */ + break; + + case 3: /* For 2-D datasets */ + if((hsize_t)(stride1[2]) == *elmt_size && + (hsize_t)(stride2[2]) == *elmt_size) { + *elmt_size *= size[2]; + --*np; /* *np decrements to a value of 2 now */ + stride1[1] += size[2] * stride1[2]; + stride2[1] += size[2] * stride2[2]; + + if((hsize_t)(stride1[1]) == *elmt_size && + (hsize_t)(stride2[1]) == *elmt_size) { + *elmt_size *= size[1]; + --*np; /* *np decrements to a value of 1 now */ + stride1[0] += size[1] * stride1[1]; + stride2[0] += size[1] * stride2[1]; + + if((hsize_t)(stride1[0]) == *elmt_size && + (hsize_t)(stride2[0]) == *elmt_size) { + *elmt_size *= size[0]; + --*np; /* *np decrements to a value of 0 now */ + } /* end if */ + } /* end if */ + } /* end if */ + break; + + case 4: /* For 3-D datasets */ + if((hsize_t)(stride1[3]) == *elmt_size && + (hsize_t)(stride2[3]) == *elmt_size) { + *elmt_size *= size[3]; + --*np; /* *np decrements to a value of 3 now */ + stride1[2] += size[3] * stride1[3]; + stride2[2] += size[3] * stride2[3]; + + if((hsize_t)(stride1[2]) == *elmt_size && + (hsize_t)(stride2[2]) == *elmt_size) { + *elmt_size *= size[2]; + --*np; /* *np decrements to a value of 2 now */ + stride1[1] += size[2] * stride1[2]; + stride2[1] += size[2] * stride2[2]; + + if((hsize_t)(stride1[1]) == *elmt_size && + (hsize_t)(stride2[1]) == *elmt_size) { + *elmt_size *= size[1]; + --*np; /* *np decrements to a value of 1 now */ + stride1[0] += size[1] * stride1[1]; + stride2[0] += size[1] * stride2[1]; + + if((hsize_t)(stride1[0]) == *elmt_size && + (hsize_t)(stride2[0]) == *elmt_size) { + *elmt_size *= size[0]; + --*np; /* *np decrements to a value of 0 now */ + } /* end if */ + } /* end if */ + } /* end if */ + } /* end if */ + break; + + default: + while (*np && + (hsize_t)(stride1[*np-1]) == *elmt_size && + (hsize_t)(stride2[*np-1]) == *elmt_size) { + *elmt_size *= size[*np-1]; + if (--*np) { + stride1[*np-1] += size[*np] * stride1[*np]; + stride2[*np-1] += size[*np] * stride2[*np]; + } + } + break; + } /* end switch */ + FUNC_LEAVE(SUCCEED); } @@ -151,6 +236,7 @@ H5V_hyper_stride(intn n, const hsize_t *size, { hsize_t skip; /*starting point byte offset */ hsize_t acc; /*accumulator */ + hsize_t tmp; int i; /*counter */ FUNC_ENTER(H5V_hyper_stride, (HDabort(), 0)); @@ -164,14 +250,60 @@ H5V_hyper_stride(intn n, const hsize_t *size, stride[n-1] = 1; skip = offset ? offset[n-1] : 0; - /* others */ - for (i=n-2, acc=1; i>=0; --i) { - hsize_t tmp = acc * (total_size[i+1] - size[i+1]); - assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1))); - stride[i] = (hssize_t)tmp; /*overflow checked*/ - acc *= total_size[i+1]; - skip += acc * (offset ? offset[i] : 0); - } + switch(n) { + case 2: /* 1-D dataset */ + tmp = total_size[1] - size[1]; + assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + stride[0] = (hssize_t)tmp; /*overflow checked*/ + acc = total_size[1]; + skip += acc * (offset ? offset[0] : 0); + break; + + case 3: /* 2-D dataset */ + tmp = total_size[2] - size[2]; + assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + stride[1] = (hssize_t)tmp; /*overflow checked*/ + acc = total_size[2]; + skip += acc * (offset ? offset[1] : 0); + + tmp = acc * (total_size[1] - size[1]); + assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + stride[0] = (hssize_t)tmp; /*overflow checked*/ + acc *= total_size[1]; + skip += acc * (offset ? offset[0] : 0); + break; + + case 4: /* 3-D dataset */ + tmp = total_size[3] - size[3]; + assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + stride[2] = (hssize_t)tmp; /*overflow checked*/ + acc = total_size[3]; + skip += acc * (offset ? offset[2] : 0); + + tmp = acc * (total_size[2] - size[2]); + assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + stride[1] = (hssize_t)tmp; /*overflow checked*/ + acc *= total_size[2]; + skip += acc * (offset ? offset[1] : 0); + + tmp = acc * (total_size[1] - size[1]); + assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + stride[0] = (hssize_t)tmp; /*overflow checked*/ + acc *= total_size[1]; + skip += acc * (offset ? offset[0] : 0); + break; + + default: + /* others */ + for (i=n-2, acc=1; i>=0; --i) { + hsize_t tmp = acc * (total_size[i+1] - size[i+1]); + assert (tmp<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + stride[i] = (hssize_t)tmp; /*overflow checked*/ + acc *= total_size[i+1]; + skip += acc * (offset ? offset[i] : 0); + } + break; + } /* end switch */ FUNC_LEAVE(skip); } @@ -381,6 +513,8 @@ H5V_hyper_copy(intn n, const hsize_t *_size, hssize_t dst_stride[H5V_HYPER_NDIMS]; /*dest stride info */ hsize_t dst_start, src_start; /*offset to start at */ hsize_t elmt_size = 1; /*element size in bytes */ + hsize_t tmp1; + hsize_t tmp2; herr_t status; /*return status */ #ifndef NDEBUG intn i; @@ -423,19 +557,96 @@ H5V_hyper_copy(intn n, const hsize_t *_size, dst_start = dst_offset ? dst_offset[n-1] : 0; src_start = src_offset ? src_offset[n-1] : 0; - /* others */ - for (ii=n-2, dst_acc=1, src_acc=1; ii>=0; --ii) { - hsize_t tmp1 = dst_acc * (dst_size[ii+1] - size[ii+1]); - hsize_t tmp2 = src_acc * (src_size[ii+1] - size[ii+1]); - assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1))); - assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1))); - dst_stride[ii] = (hssize_t)tmp1; /*overflow checked*/ - src_stride[ii] = (hssize_t)tmp2; /*overflow checked*/ - dst_acc *= dst_size[ii+1]; - src_acc *= src_size[ii+1]; - dst_start += dst_acc * (dst_offset ? dst_offset[ii] : 0); - src_start += src_acc * (src_offset ? src_offset[ii] : 0); - } + /* Unroll loop for common cases */ + switch(n) { + case 2: + tmp1 = (dst_size[1] - size[1]); + tmp2 = (src_size[1] - size[1]); + assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + dst_stride[0] = (hssize_t)tmp1; /*overflow checked*/ + src_stride[0] = (hssize_t)tmp2; /*overflow checked*/ + dst_acc = dst_size[1]; + src_acc = src_size[1]; + dst_start += dst_acc * (dst_offset ? dst_offset[0] : 0); + src_start += src_acc * (src_offset ? src_offset[0] : 0); + break; + + case 3: + tmp1 = (dst_size[2] - size[2]); + tmp2 = (src_size[2] - size[2]); + assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + dst_stride[1] = (hssize_t)tmp1; /*overflow checked*/ + src_stride[1] = (hssize_t)tmp2; /*overflow checked*/ + dst_acc = dst_size[2]; + src_acc = src_size[2]; + dst_start += dst_acc * (dst_offset ? dst_offset[1] : 0); + src_start += src_acc * (src_offset ? src_offset[1] : 0); + + tmp1 = dst_acc * (dst_size[1] - size[1]); + tmp2 = src_acc * (src_size[1] - size[1]); + assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + dst_stride[0] = (hssize_t)tmp1; /*overflow checked*/ + src_stride[0] = (hssize_t)tmp2; /*overflow checked*/ + dst_acc *= dst_size[1]; + src_acc *= src_size[1]; + dst_start += dst_acc * (dst_offset ? dst_offset[0] : 0); + src_start += src_acc * (src_offset ? src_offset[0] : 0); + break; + + case 4: + tmp1 = (dst_size[3] - size[3]); + tmp2 = (src_size[3] - size[3]); + assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + dst_stride[2] = (hssize_t)tmp1; /*overflow checked*/ + src_stride[2] = (hssize_t)tmp2; /*overflow checked*/ + dst_acc = dst_size[3]; + src_acc = src_size[3]; + dst_start += dst_acc * (dst_offset ? dst_offset[2] : 0); + src_start += src_acc * (src_offset ? src_offset[2] : 0); + + tmp1 = dst_acc * (dst_size[2] - size[2]); + tmp2 = src_acc * (src_size[2] - size[2]); + assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + dst_stride[1] = (hssize_t)tmp1; /*overflow checked*/ + src_stride[1] = (hssize_t)tmp2; /*overflow checked*/ + dst_acc *= dst_size[2]; + src_acc *= src_size[2]; + dst_start += dst_acc * (dst_offset ? dst_offset[1] : 0); + src_start += src_acc * (src_offset ? src_offset[1] : 0); + + tmp1 = dst_acc * (dst_size[1] - size[1]); + tmp2 = src_acc * (src_size[1] - size[1]); + assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + dst_stride[0] = (hssize_t)tmp1; /*overflow checked*/ + src_stride[0] = (hssize_t)tmp2; /*overflow checked*/ + dst_acc *= dst_size[1]; + src_acc *= src_size[1]; + dst_start += dst_acc * (dst_offset ? dst_offset[0] : 0); + src_start += src_acc * (src_offset ? src_offset[0] : 0); + break; + + default: + /* others */ + for (ii=n-2, dst_acc=1, src_acc=1; ii>=0; --ii) { + hsize_t tmp1 = dst_acc * (dst_size[ii+1] - size[ii+1]); + hsize_t tmp2 = src_acc * (src_size[ii+1] - size[ii+1]); + assert (tmp1<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + assert (tmp2<((hsize_t)1<<(8*sizeof(hssize_t)-1))); + dst_stride[ii] = (hssize_t)tmp1; /*overflow checked*/ + src_stride[ii] = (hssize_t)tmp2; /*overflow checked*/ + dst_acc *= dst_size[ii+1]; + src_acc *= src_size[ii+1]; + dst_start += dst_acc * (dst_offset ? dst_offset[ii] : 0); + src_start += src_acc * (src_offset ? src_offset[ii] : 0); + } + break; + } /* end switch */ } #endif /* NO_INLINED_CODE */ -- cgit v0.12