1 files changed, 180 insertions, 0 deletions
diff --git a/generic/tclUtil.c b/generic/tclUtil.c
index 4beb25d..7940b66 100644
--- a/generic/tclUtil.c
+++ b/generic/tclUtil.c
@@ -19,6 +19,11 @@
 #include "tclTomMath.h"
 #include <math.h>
 
+#if defined(_MSC_VER) && defined(_WIN64)
+#   include <intrin.h>
+#   pragma intrinsic(_BitScanReverse64)
+#endif
+
 /*
  * The absolute pathname of the executable in which this Tcl library is
  * running.
@@ -4611,6 +4616,181 @@ TclReToGlob(
 }
 
 /*
+ *----------------------------------------------------------------------
+ *
+ * TclMSB --
+ *
+ *	Given a unsigned long long non-zero value n, return the index of
+ *	the most significant bit in n that is set.  This is equivalent to
+ *	returning trunc(log2(n)).  It's also equivalent to the largest
+ *	integer k such that 2^k <= n.
+ *
+ *	This routine is adapted from Andrej Brodnik, "Computation of the
+ *	Least Significant Set Bit", pp 7-10, Proceedings of the 2nd
+ *	Electrotechnical and Computer Science Conference, Portoroz,
+ *	Slovenia, 1993.  The adaptations permit the computation to take
+ *	place within unsigned long long values without the need for double
+ *	length	buffers for calculation.  They also fill in a number of
+ *	details the paper omits or leaves unclear.
+ *
+ * Results:
+ *	The index of the most significant set bit in n, a value between
+ *	0 and 63, inclusive.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclMSB(
+    unsigned long long n)
+{
+    /* assert ( 64 == CHAR_BIT * sizeof(unsigned long long) ); */
+    /* assert ( n != 0 ); */
+
+    /*
+     * Many platforms offer access to this functionality through
+     * compiler specific incantations that exploit processor
+     * instructions.  Add more as appropriate.
+     */
+
+#if defined(_MSC_VER) && defined(_WIN64)
+	/*
+	 * This candidate implementation for Microsoft compilers is
+	 * untested.  (Remove this comment when someone tests it and
+	 * either finds it working, or fixes any brokenness.)
+	 */
+	unsigned long result;
+
+	(void) _BitScanReverse64(&result, (unsigned __int64)n);
+	return (int)result;
+
+#elif defined(__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+
+	/*
+	 * The GNU Compiler Collection offers this builtin routine
+	 * starting with version 3.4, released 2004.
+	 * clzll() = Count of Leading Zeroes in a Long Long
+	 * NOTE: we rely on input constraint (n != 0).
+	 */
+	
+	return 63 - __builtin_clzll(n);
+	
+#else
+
+	/*
+	 * For a byte, consider two masks, C1 = 10000000 selecting just
+	 * the high bit, and C2 = 01111111 selecting all other bits.
+	 * Then for any byte value n, the computation
+	 *	LEAD(n) = C1 & (n | (C2 + (n & C2)))
+	 * will leave all bits but the high bit unset, and will have the
+	 * high bit set iff n!=0.  The whole thing is an 8-bit test
+	 * for being non-zero.  For an 8-byte n, each byte can have
+	 * the test applied all at once, with combined masks.
+	 */
+	const unsigned long long C1 = 0x8080808080808080;
+	const unsigned long long C2 = 0x7F7F7F7F7F7F7F7F;
+#define LEAD(n) (C1 & (n | (C2 + (n & C2))))
+
+	/*
+	 * To shift a bit to a new place, multiplication by 2^k will do.
+	 * To shift the top 7 bits produced by the LEAD test to the high
+	 * 7 bits of the entire long long, multiply by the right sum of
+	 * powers of 2.  In this case
+	 * Q = 1 + 2^7 + 2^14 + 2^21 + 2^28 + 2^35 + 2^42
+	 * Then shift those 7 bits down to the low 7 bits of the long long.
+	 * The key to making this work is that none of the shifted bits
+	 * collide with each other in the top 7-bit destination.
+	 * Note that we lose the bit that indicates whether the low byte
+	 * is non-zero.  That doesn't matter because we require the original
+	 * value n to be non-zero, so if all other bytes signal to be zero,
+	 * we know the low byte is non-zero, and if one of the other bytes
+	 * signals non-zero, we just don't care what the low byte is.
+	 */
+	const unsigned long long Q = 0x0000040810204081;
+
+	/*
+	 * To place a copy of a 7-bit value in each of 7 bytes in
+	 * a long long, just multply by the right value.  In this case
+	 *  P = 0x00 01 01 01 01 01 01 01
+	 * We don't put a copy in the high byte since analysis of the
+	 * remaining steps in the algorithm indicates we do not need it.
+	 */
+	const unsigned long long P = 0x0001010101010101;
+
+	/*
+	 * With 7 copies of the LEAD value, we can now apply 7 masks
+	 * to it in a single step by an & against the right value.
+	 * B =	00000000 01111111 01111110 01111100
+	 *	01111000 01110000 01100000 01000000
+	 * The higher the MSB of the copied value is, the more of the
+	 * B-masked bytes stored in t will be non-zero.
+	 */
+	const unsigned long long B = 0x007F7E7C78706040;
+	unsigned long long t = B & P * (LEAD(n) * Q >> 57);
+
+	/*
+	 * We want to get a count of the non-zero bytes stored in t.
+	 * First use LEAD(t) to create a set of high bits signaling
+	 * non-zero values as before.  Call this value
+	 * X = x6*2^55 +x5*2^47 +x4*2^39 +x3*2^31 +x2*2^23 +x1*2^15 +x0*2^7
+	 * Then notice what multiplication by
+	 * P =    2^48 +   2^40 +   2^32 +   2^24 +   2^16 +   2^8 + 1
+	 * produces:
+	 * P*X = x0*2^7 + (x0 + x1)*2^15 + ...
+	 *       ... + (x0 + x1 + x2 + x3 + x4 + x5 + x6) * 2^55 + ...
+	 *	 ... + (x5 + x6)*2^95 + x6*2^103
+	 * The high terms of this product are going to overflow the long long
+	 * and get lost, but we don't care about them.  What we care is that
+	 * the 2^55 term is exactly the sum we seek.  We shift the product
+	 * down by 55 bits and then mask away all but the bottom 3 bits
+	 * (Max sum can be 7) we get exactly the count of non-zero B-masked
+	 * bytes.  By design of the mask, this count is the index of the
+	 * MSB of the LEAD value.  It indicates which byte of the original
+	 * value contains the MSB of the original value.
+	 */
+#define SUM(t) (0x7 & (int)(LEAD(t) * P >> 55));
+
+	/*
+	 * Multiply by 8 to get the number of bits to shift to place
+	 * that MSB-containing byte in the low byte.
+	 */
+	int k = 8 * SUM(t);
+
+	/*
+	 * Shift the MSB byte to the low byte.  Then shift one more bit.
+	 * Since we know the MSB byte is non-zero we only need to compute
+	 * the MSB of the top 7 bits.  If all top 7 bits are zero, we know
+	 * the bottom bit is the 1 and the correct index is 0.  Compute the
+	 * MSB of that value by the same steps we did before.
+	 */
+	t = B & P * (n >> k >> 1);
+
+	/*
+	 * Add the index of the MSB of the byte to the index of the low
+	 * bit of that byte computed before to get the final answer.
+	 */
+	return k + SUM(t);
+
+	/* Total operations: 33
+	 * 10 bit-ands, 6 multiplies, 4 adds, 5 rightshifts,
+	 * 3 assignments, 3 bit-ors, 2 typecasts.
+	 *
+	 * The whole task is one direct computation.
+	 * No branches. No loops.
+	 *
+	 * 33 operations cannot beat one instruction, so assembly
+	 * wins and should be used wherever possible, but this isn't bad.
+	 */
+
+#undef SUM
+#undef LEAD
+#endif
+}
+
+/*
  * Local Variables:
  * mode: c
  * c-basic-offset: 4