/*
 * tclBrodnik.c --
 *
 *	This file contains the implementation of a BrodnikArray.
 *
 * Contributions from Don Porter, NIST, 2013. (not subject to US copyright)
 *
 * See the file "license.terms" for information on usage and redistribution of
 * this file, and for a DISCLAIMER OF ALL WARRANTIES.
 */

#include "tclBrodnik.h"

#if defined(HAVE_INTRIN_H)
#    include <intrin.h>
#ifdef _WIN64
#    pragma intrinsic(_BitScanReverse64)
#else
#    pragma intrinsic(_BitScanReverse)
#endif
#endif

/*
 *----------------------------------------------------------------------
 *
 * TclMSB --
 *
 *	Given a size_t non-zero value n, return the index of the most
 *	significant bit in n that is set.  This is equivalent to returning
 *	trunc(log2(n)).  It's also equivalent to the largest integer k
 *	such that 2^k <= n.
 *
 *	This routine is adapted from Andrej Brodnik, "Computation of the
 *	Least Significant Set Bit", pp 7-10, Proceedings of the 2nd
 *	Electrotechnical and Computer Science Conference, Portoroz,
 *	Slovenia, 1993.  The adaptations permit the computation to take
 *	place within size_t values without the need for double length
 *	buffers for calculation.  They also fill in a number of details
 *	the paper omits or leaves unclear.
 *
 * Results:
 *	The index of the most significant set bit in n, a value between
 *	0 and CHAR_BIT*sizeof(size_t) - 1, inclusive.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
TclMSB(
    size_t n)
{
    const int M = CHAR_BIT * sizeof(size_t);	/* Bits in a size_t */

    /*
     * TODO: This function corresponds to a processor instruction on
     * many platforms.  Add here the various platform and compiler
     * specific incantations to invoke those assembly instructions.
     */
#if defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))
	return n ? M - 1 - __builtin_clzll(n) : 0;
#endif

#if defined(_MSC_VER) && _MSCVER >= 1300
	unsigned long result;

#ifdef _WIN64
	(void) _BitScanReverse64(&result, n)
#else
	(void) _BitScanReverse(&result, n)
#endif

	return result;
#endif

    if (M == 64) {

	/*
	 * For a byte, consider two masks, C1 = 10000000 selecting just
	 * the high bit, and C2 = 01111111 selecting all other bits.
	 * Then for any byte value n, the computation
	 *	LEAD(n) = C1 & (n | (C2 + (n & C2)))
	 * will leave all bits but the high bit unset, and will have the
	 * high bit set iff n!=0.  The whole thing is an 8-bit test
	 * for being non-zero.  For an 8-byte size_t, each byte can have
	 * the test applied all at once, with combined masks.
	 */
	const size_t C1 = 0x8080808080808080;
	const size_t C2 = 0x7F7F7F7F7F7F7F7F;
#define LEAD(n) (C1 & (n | (C2 + (n & C2))))

	/*
	 * To shift a bit to a new place, multiplication by 2^k will do.
	 * To shift the top 7 bits produced by the LEAD test to the high
	 * 7 bits of the entire size_t, multiply by the right sum of
	 * powers of 2.  In this case
	 * Q = 1 + 2^7 + 2^14 + 2^21 + 2^28 + 2^35 + 2^42
	 * Then shift those 7 bits down to the low 7 bits of the size_t.
	 * The key to making this work is that none of the shifted bits
	 * collide with each other in the top 7-bit destination.
	 * Note that we lose the bit that indicates whether the low byte
	 * is non-zero.  That doesn't matter because we require the original
	 * value n to be non-zero, so if all other bytes signal to be zero,
	 * we know the low byte is non-zero, and if one of the other bytes
	 * signals non-zero, we just don't care what the low byte is.
	 */
	const size_t  Q = 0x0000040810204081;

	/*
	 * To place a copy of a 7-bit value in each of 7 bytes in
	 * a size_t, just multply by the right value.  In this case
	 *  P = 0x00 01 01 01 01 01 01 01
	 * We don't put a copy in the high byte since analysis of the
	 * remaining steps in the algorithm indicates we do not need it.
	 */
	const size_t  P = 0x0001010101010101;

	/*
	 * With 7 copies of the LEAD value, we can now apply 7 masks
	 * to it in a single step by an & against the right value.
	 * B =	00000000 01111111 01111110 01111100
	 *	01111000 01110000 01100000 01000000
	 * The higher the MSB of the copied value is, the more of the
	 * B-masked bytes stored in t will be non-zero.
	 */
	const size_t  B = 0x007F7E7C78706040;
	size_t t = B & P * (LEAD(n) * Q >> 57);

	/*
	 * We want to get a count of the non-zero bytes stored in t.
	 * First use LEAD(t) to create a set of high bits signaling
	 * non-zero values as before.  Call this value
	 * X = x6*2^55 +x5*2^47 +x4*2^39 +x3*2^31 +x2*2^23 +x1*2^15 +x0*2^7
	 * Then notice what multiplication by
	 * P =    2^48 +   2^40 +   2^32 +   2^24 +   2^16 +   2^8 + 1
	 * produces:
	 * P*X = x0*2^7 + (x0 + x1)*2^15 + ...
	 *       ... + (x0 + x1 + x2 + x3 + x4 + x5 + x6) * 2^55 + ...
	 *	 ... + (x5 + x6)*2^95 + x6*2^103
	 * The high terms of this product are going to overflow the size_t
	 * and get lost, but we don't care about them.  What we care is that
	 * the 2^55 term is exactly the sum we seek.  We shift the product
	 * down by 55 bits and then mask away all but the bottom 3 bits
	 * (Max sum can be 7) we get exactly the count of non-zero B-masked
	 * bytes.  By design of the mask, this count is the index of the
	 * MSB of the LEAD value.  It indicates which byte of the original
	 * value contains the MSB of the original value.
	 */
#define SUM(t) (0x7 & (int)(LEAD(t) * P >> 55));

	/*
	 * Multiply by 8 to get the number of bits to shift to place
	 * that MSB-containing byte in the low byte.
	 */
	int k = 8 * SUM(t);

	/*
	 * Shift the MSB byte to the low byte.  Then shift one more bit.
	 * Since we know the MSB byte is non-zero we only need to compute
	 * the MSB of the top 7 bits.  If all top 7 bits are zero, we know
	 * the bottom bit is the 1 and the correct index is 0.  Compute the
	 * MSB of that value by the same steps we did before.
	 */
	t = B & P * (n >> k >> 1);

	/*
	 * Add the index of the MSB of the byte to the index of the low
	 * bit of that byte computed before to get the final answer.
	 */
	return k + SUM(t);

	/* Total operations: 33
	 * 10 bit-ands, 6 multiplies, 4 adds, 5 rightshifts,
	 * 3 assignments, 3 bit-ors, 2 typecasts.
	 *
	 * The whole task is one direct computation.
	 * No branches. No loops.
	 *
	 * 33 operations cannot beat one instruction, so assembly
	 * wins and should be used wherever possible, but this isn't bad.
	 */

#undef SUM
    } else if (M == 32) {

	/* Same scheme as above, with adjustments to the 32-bit size */
	const size_t C1 = 0xA0820820;
	const size_t C2 = 0x5F7DF7DF;
	const size_t C3 = 0xC0820820;
	const size_t C4 = 0x20000000;
	const size_t Q  = 0x00010841;
	const size_t P = 0x01041041;
	const size_t B = 0x1F79C610;

#define SUM(t) (0x7 & (LEAD(t) * P >> 29));

	size_t t = B & P * ((C3 & (LEAD(n) + C4)) * Q >> 27);
	int k = 6 * SUM(t);

	t = B & P * (n >> k >> 1);
	return k + SUM(t);

	/* Total operations: 33
	 * 11 bit-ands, 6 multiplies, 5 adds, 5 rightshifts,
	 * 3 assignments, 3 bit-ors.
	 */

#undef SUM
#undef LEAD

    } else {
	/* Simple and slow fallback for cases we haven't done yet. */
	int k = 0;

	while (n >>= 1) {
	    k++;
	}
	return k;
    }
}

/*
 *----------------------------------------------------------------------
 *
 * TclBAConvertIndices --
 *
 *	Given a size_t index into the sequence, convert into the
 *	corresponding index pair of the store[] of a BrodnikArray.
 *
 *	store[] is an array of arrays, and as the total size grows
 *	larger, the size of the arrays store[i] grow in a way that
 *	each new array is of about size sqrt(N), yet the index conversion
 *	routine remains relatively simple to calculate.
 *
 * Results:
 *	The index pair is written to *hiPtr and *loPtr, which may not
 *	be NULL.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

void
TclBAConvertIndices(
    size_t		index,
    unsigned int	*hiPtr,
    unsigned int	*loPtr)
{
    size_t		r = index + 1;
    int			k = TclMSB(r);
    int			shift = (k + 1) >> 1;
    unsigned int	lobits = (1 << shift) - 1;
    unsigned int	hibits = 1 << (k - shift);

    *hiPtr	= (lobits << 1) - ((k & 1) * hibits)
			+ ((r >> shift) & (hibits - 1));
    *loPtr	= r & lobits;
}

/*
 *----------------------------------------------------------------------
 *
 * TclBAInvertIndices --
 *
 *	Given a size_t index pair hi, lo into the store[] of a BrodnikArray,
 *	compute and return the size_t index of the element found there.
 *
 * Results:
 *	The size_t index value.
 *
 * Side effects:
 *	None.
 *----------------------------------------------------------------------
 */

size_t
TclBAInvertIndices(
    unsigned int	hi,
    unsigned int	lo)
{
    size_t		plus2 = hi + 2;
    int			n = TclMSB(plus2) - 1;
    unsigned int	bit = (((size_t)1)<<n);
    int			SB = 2*n + ((plus2 & bit)!=0);
    size_t		base = (((size_t)1)<<SB) - 1;
    size_t		increment = ((bit - 1) & plus2) << n;

    return base + increment + lo;
}