summaryrefslogtreecommitdiffstats
path: root/generic/tclUtil.c
diff options
context:
space:
mode:
authordgp <dgp@users.sourceforge.net>2025-05-09 18:13:17 (GMT)
committerdgp <dgp@users.sourceforge.net>2025-05-09 18:13:17 (GMT)
commit0e8128247f4e30a8267d342c323765b6bf6cd644 (patch)
tree178ac8dfc9bc556dc9b906db4b10fbec1d068146 /generic/tclUtil.c
parent9c7449fb49496cf68c0eef7452e0ad2bb3fac931 (diff)
downloadtcl-0e8128247f4e30a8267d342c323765b6bf6cd644.zip
tcl-0e8128247f4e30a8267d342c323765b6bf6cd644.tar.gz
tcl-0e8128247f4e30a8267d342c323765b6bf6cd644.tar.bz2
[fd1585e2a1] Adopt efficient internal indexing calculation utility TclMSB().
Diffstat (limited to 'generic/tclUtil.c')
-rw-r--r--generic/tclUtil.c180
1 files changed, 180 insertions, 0 deletions
diff --git a/generic/tclUtil.c b/generic/tclUtil.c
index 4beb25d..7940b66 100644
--- a/generic/tclUtil.c
+++ b/generic/tclUtil.c
@@ -19,6 +19,11 @@
#include "tclTomMath.h"
#include <math.h>
+#if defined(_MSC_VER) && defined(_WIN64)
+# include <intrin.h>
+# pragma intrinsic(_BitScanReverse64)
+#endif
+
/*
* The absolute pathname of the executable in which this Tcl library is
* running.
@@ -4611,6 +4616,181 @@ TclReToGlob(
}
/*
+ *----------------------------------------------------------------------
+ *
+ * TclMSB --
+ *
+ * Given a unsigned long long non-zero value n, return the index of
+ * the most significant bit in n that is set. This is equivalent to
+ * returning trunc(log2(n)). It's also equivalent to the largest
+ * integer k such that 2^k <= n.
+ *
+ * This routine is adapted from Andrej Brodnik, "Computation of the
+ * Least Significant Set Bit", pp 7-10, Proceedings of the 2nd
+ * Electrotechnical and Computer Science Conference, Portoroz,
+ * Slovenia, 1993. The adaptations permit the computation to take
+ * place within unsigned long long values without the need for double
+ * length buffers for calculation. They also fill in a number of
+ * details the paper omits or leaves unclear.
+ *
+ * Results:
+ * The index of the most significant set bit in n, a value between
+ * 0 and 63, inclusive.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclMSB(
+ unsigned long long n)
+{
+ /* assert ( 64 == CHAR_BIT * sizeof(unsigned long long) ); */
+ /* assert ( n != 0 ); */
+
+ /*
+ * Many platforms offer access to this functionality through
+ * compiler specific incantations that exploit processor
+ * instructions. Add more as appropriate.
+ */
+
+#if defined(_MSC_VER) && defined(_WIN64)
+ /*
+ * This candidate implementation for Microsoft compilers is
+ * untested. (Remove this comment when someone tests it and
+ * either finds it working, or fixes any brokenness.)
+ */
+ unsigned long result;
+
+ (void) _BitScanReverse64(&result, (unsigned __int64)n);
+ return (int)result;
+
+#elif defined(__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+
+ /*
+ * The GNU Compiler Collection offers this builtin routine
+ * starting with version 3.4, released 2004.
+ * clzll() = Count of Leading Zeroes in a Long Long
+ * NOTE: we rely on input constraint (n != 0).
+ */
+
+ return 63 - __builtin_clzll(n);
+
+#else
+
+ /*
+ * For a byte, consider two masks, C1 = 10000000 selecting just
+ * the high bit, and C2 = 01111111 selecting all other bits.
+ * Then for any byte value n, the computation
+ * LEAD(n) = C1 & (n | (C2 + (n & C2)))
+ * will leave all bits but the high bit unset, and will have the
+ * high bit set iff n!=0. The whole thing is an 8-bit test
+ * for being non-zero. For an 8-byte n, each byte can have
+ * the test applied all at once, with combined masks.
+ */
+ const unsigned long long C1 = 0x8080808080808080;
+ const unsigned long long C2 = 0x7F7F7F7F7F7F7F7F;
+#define LEAD(n) (C1 & (n | (C2 + (n & C2))))
+
+ /*
+ * To shift a bit to a new place, multiplication by 2^k will do.
+ * To shift the top 7 bits produced by the LEAD test to the high
+ * 7 bits of the entire long long, multiply by the right sum of
+ * powers of 2. In this case
+ * Q = 1 + 2^7 + 2^14 + 2^21 + 2^28 + 2^35 + 2^42
+ * Then shift those 7 bits down to the low 7 bits of the long long.
+ * The key to making this work is that none of the shifted bits
+ * collide with each other in the top 7-bit destination.
+ * Note that we lose the bit that indicates whether the low byte
+ * is non-zero. That doesn't matter because we require the original
+ * value n to be non-zero, so if all other bytes signal to be zero,
+ * we know the low byte is non-zero, and if one of the other bytes
+ * signals non-zero, we just don't care what the low byte is.
+ */
+ const unsigned long long Q = 0x0000040810204081;
+
+ /*
+ * To place a copy of a 7-bit value in each of 7 bytes in
+ * a long long, just multply by the right value. In this case
+ * P = 0x00 01 01 01 01 01 01 01
+ * We don't put a copy in the high byte since analysis of the
+ * remaining steps in the algorithm indicates we do not need it.
+ */
+ const unsigned long long P = 0x0001010101010101;
+
+ /*
+ * With 7 copies of the LEAD value, we can now apply 7 masks
+ * to it in a single step by an & against the right value.
+ * B = 00000000 01111111 01111110 01111100
+ * 01111000 01110000 01100000 01000000
+ * The higher the MSB of the copied value is, the more of the
+ * B-masked bytes stored in t will be non-zero.
+ */
+ const unsigned long long B = 0x007F7E7C78706040;
+ unsigned long long t = B & P * (LEAD(n) * Q >> 57);
+
+ /*
+ * We want to get a count of the non-zero bytes stored in t.
+ * First use LEAD(t) to create a set of high bits signaling
+ * non-zero values as before. Call this value
+ * X = x6*2^55 +x5*2^47 +x4*2^39 +x3*2^31 +x2*2^23 +x1*2^15 +x0*2^7
+ * Then notice what multiplication by
+ * P = 2^48 + 2^40 + 2^32 + 2^24 + 2^16 + 2^8 + 1
+ * produces:
+ * P*X = x0*2^7 + (x0 + x1)*2^15 + ...
+ * ... + (x0 + x1 + x2 + x3 + x4 + x5 + x6) * 2^55 + ...
+ * ... + (x5 + x6)*2^95 + x6*2^103
+ * The high terms of this product are going to overflow the long long
+ * and get lost, but we don't care about them. What we care is that
+ * the 2^55 term is exactly the sum we seek. We shift the product
+ * down by 55 bits and then mask away all but the bottom 3 bits
+ * (Max sum can be 7) we get exactly the count of non-zero B-masked
+ * bytes. By design of the mask, this count is the index of the
+ * MSB of the LEAD value. It indicates which byte of the original
+ * value contains the MSB of the original value.
+ */
+#define SUM(t) (0x7 & (int)(LEAD(t) * P >> 55));
+
+ /*
+ * Multiply by 8 to get the number of bits to shift to place
+ * that MSB-containing byte in the low byte.
+ */
+ int k = 8 * SUM(t);
+
+ /*
+ * Shift the MSB byte to the low byte. Then shift one more bit.
+ * Since we know the MSB byte is non-zero we only need to compute
+ * the MSB of the top 7 bits. If all top 7 bits are zero, we know
+ * the bottom bit is the 1 and the correct index is 0. Compute the
+ * MSB of that value by the same steps we did before.
+ */
+ t = B & P * (n >> k >> 1);
+
+ /*
+ * Add the index of the MSB of the byte to the index of the low
+ * bit of that byte computed before to get the final answer.
+ */
+ return k + SUM(t);
+
+ /* Total operations: 33
+ * 10 bit-ands, 6 multiplies, 4 adds, 5 rightshifts,
+ * 3 assignments, 3 bit-ors, 2 typecasts.
+ *
+ * The whole task is one direct computation.
+ * No branches. No loops.
+ *
+ * 33 operations cannot beat one instruction, so assembly
+ * wins and should be used wherever possible, but this isn't bad.
+ */
+
+#undef SUM
+#undef LEAD
+#endif
+}
+
+/*
* Local Variables:
* mode: c
* c-basic-offset: 4