generic/tclBrodnik.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293

/*
 * tclBrodnik.c --
 *
 *	This file contains the implementation of a BrodnikArray.
 *
 * Contributions from Don Porter, NIST, 2013. (not subject to US copyright)
 *
 * See the file "license.terms" for information on usage and redistribution of
 * this file, and for a DISCLAIMER OF ALL WARRANTIES.
 */

#include "tclBrodnik.h"

#if defined(HAVE_INTRIN_H)
#    include <intrin.h>
#ifdef _WIN64
#    pragma intrinsic(_BitScanReverse64)
#else
#    pragma intrinsic(_BitScanReverse)
#endif
#endif

/*
 *----------------------------------------------------------------------
 *
 * TclMSB --
 *
 *	Given a size_t non-zero value n, return the index of the most
 *	significant bit in n that is set.  This is equivalent to returning
 *	trunc(log2(n)).  It's also equivalent to the largest integer k
 *	such that 2^k <= n.
 *
 *	This routine is adapted from Andrej Brodnik, "Computation of the
 *	Least Significant Set Bit", pp 7-10, Proceedings of the 2nd
 *	Electrotechnical and Computer Science Conference, Portoroz,
 *	Slovenia, 1993.  The adaptations permit the computation to take
 *	place within size_t values without the need for double length
 *	buffers for calculation.  They also fill in a number of details
 *	the paper omits or leaves unclear.
 *
 * Results:
 *	The index of the most significant set bit in n, a value between
 *	0 and CHAR_BIT*sizeof(size_t) - 1, inclusive.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
TclMSB(
    size_t n)
{
    const int M = CHAR_BIT * sizeof(size_t);	/* Bits in a size_t */

    /*
     * TODO: This function corresponds to a processor instruction on
     * many platforms.  Add here the various platform and compiler
     * specific incantations to invoke those assembly instructions.
     */
#if defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))
	return n ? M - 1 - __builtin_clzll(n) : 0;
#endif

#if defined(_MSC_VER) && _MSCVER >= 1300
	unsigned long result;

#ifdef _WIN64
	(void) _BitScanReverse64(&result, n)
#else
	(void) _BitScanReverse(&result, n)
#endif

	return result;
#endif

    if (M == 64) {

	/*
	 * For a byte, consider two masks, C1 = 10000000 selecting just
	 * the high bit, and C2 = 01111111 selecting all other bits.
	 * Then for any byte value n, the computation
	 *	LEAD(n) = C1 & (n | (C2 + (n & C2)))
	 * will leave all bits but the high bit unset, and will have the
	 * high bit set iff n!=0.  The whole thing is an 8-bit test
	 * for being non-zero.  For an 8-byte size_t, each byte can have
	 * the test applied all at once, with combined masks.
	 */
	const size_t C1 = 0x8080808080808080;
	const size_t C2 = 0x7F7F7F7F7F7F7F7F;
#define LEAD(n) (C1 & (n | (C2 + (n & C2))))

	/*
	 * To shift a bit to a new place, multiplication by 2^k will do.
	 * To shift the top 7 bits produced by the LEAD test to the high
	 * 7 bits of the entire size_t, multiply by the right sum of
	 * powers of 2.  In this case
	 * Q = 1 + 2^7 + 2^14 + 2^21 + 2^28 + 2^35 + 2^42
	 * Then shift those 7 bits down to the low 7 bits of the size_t.
	 * The key to making this work is that none of the shifted bits
	 * collide with each other in the top 7-bit destination.
	 * Note that we lose the bit that indicates whether the low byte
	 * is non-zero.  That doesn't matter because we require the original
	 * value n to be non-zero, so if all other bytes signal to be zero,
	 * we know the low byte is non-zero, and if one of the other bytes
	 * signals non-zero, we just don't care what the low byte is.
	 */
	const size_t  Q = 0x0000040810204081;

	/*
	 * To place a copy of a 7-bit value in each of 7 bytes in
	 * a size_t, just multply by the right value.  In this case
	 *  P = 0x00 01 01 01 01 01 01 01
	 * We don't put a copy in the high byte since analysis of the
	 * remaining steps in the algorithm indicates we do not need it.
	 */
	const size_t  P = 0x0001010101010101;

	/*
	 * With 7 copies of the LEAD value, we can now apply 7 masks
	 * to it in a single step by an & against the right value.
	 * B =	00000000 01111111 01111110 01111100
	 *	01111000 01110000 01100000 01000000
	 * The higher the MSB of the copied value is, the more of the
	 * B-masked bytes stored in t will be non-zero.
	 */
	const size_t  B = 0x007F7E7C78706040;
	size_t t = B & P * (LEAD(n) * Q >> 57);

	/*
	 * We want to get a count of the non-zero bytes stored in t.
	 * First use LEAD(t) to create a set of high bits signaling
	 * non-zero values as before.  Call this value
	 * X = x6*2^55 +x5*2^47 +x4*2^39 +x3*2^31 +x2*2^23 +x1*2^15 +x0*2^7
	 * Then notice what multiplication by
	 * P =    2^48 +   2^40 +   2^32 +   2^24 +   2^16 +   2^8 + 1
	 * produces:
	 * P*X = x0*2^7 + (x0 + x1)*2^15 + ...
	 *       ... + (x0 + x1 + x2 + x3 + x4 + x5 + x6) * 2^55 + ...
	 *	 ... + (x5 + x6)*2^95 + x6*2^103
	 * The high terms of this product are going to overflow the size_t
	 * and get lost, but we don't care about them.  What we care is that
	 * the 2^55 term is exactly the sum we seek.  We shift the product
	 * down by 55 bits and then mask away all but the bottom 3 bits
	 * (Max sum can be 7) we get exactly the count of non-zero B-masked
	 * bytes.  By design of the mask, this count is the index of the
	 * MSB of the LEAD value.  It indicates which byte of the original
	 * value contains the MSB of the original value.
	 */
#define SUM(t) (0x7 & (int)(LEAD(t) * P >> 55));

	/*
	 * Multiply by 8 to get the number of bits to shift to place
	 * that MSB-containing byte in the low byte.
	 */
	int k = 8 * SUM(t);

	/*
	 * Shift the MSB byte to the low byte.  Then shift one more bit.
	 * Since we know the MSB byte is non-zero we only need to compute
	 * the MSB of the top 7 bits.  If all top 7 bits are zero, we know
	 * the bottom bit is the 1 and the correct index is 0.  Compute the
	 * MSB of that value by the same steps we did before.
	 */
	t = B & P * (n >> k >> 1);

	/*
	 * Add the index of the MSB of the byte to the index of the low
	 * bit of that byte computed before to get the final answer.
	 */
	return k + SUM(t);

	/* Total operations: 33
	 * 10 bit-ands, 6 multiplies, 4 adds, 5 rightshifts,
	 * 3 assignments, 3 bit-ors, 2 typecasts.
	 *
	 * The whole task is one direct computation.
	 * No branches. No loops.
	 *
	 * 33 operations cannot beat one instruction, so assembly
	 * wins and should be used wherever possible, but this isn't bad.
	 */

#undef SUM
    } else if (M == 32) {

	/* Same scheme as above, with adjustments to the 32-bit size */
	const size_t C1 = 0xA0820820;
	const size_t C2 = 0x5F7DF7DF;
	const size_t C3 = 0xC0820820;
	const size_t C4 = 0x20000000;
	const size_t Q  = 0x00010841;
	const size_t P = 0x01041041;
	const size_t B = 0x1F79C610;

#define SUM(t) (0x7 & (LEAD(t) * P >> 29));

	size_t t = B & P * ((C3 & (LEAD(n) + C4)) * Q >> 27);
	int k = 6 * SUM(t);

	t = B & P * (n >> k >> 1);
	return k + SUM(t);

	/* Total operations: 33
	 * 11 bit-ands, 6 multiplies, 5 adds, 5 rightshifts,
	 * 3 assignments, 3 bit-ors.
	 */

#undef SUM
#undef LEAD

    } else {
	/* Simple and slow fallback for cases we haven't done yet. */
	int k = 0;

	while (n >>= 1) {
	    k++;
	}
	return k;
    }
}

/*
 *----------------------------------------------------------------------
 *
 * TclBAConvertIndices --
 *
 *	Given a size_t index into the sequence, convert into the
 *	corresponding index pair of the store[] of a BrodnikArray.
 *
 *	store[] is an array of arrays, and as the total size grows
 *	larger, the size of the arrays store[i] grow in a way that
 *	each new array is of about size sqrt(N), yet the index conversion
 *	routine remains relatively simple to calculate.
 *
 * Results:
 *	The index pair is written to *hiPtr and *loPtr, which may not
 *	be NULL.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

void
TclBAConvertIndices(
    size_t		index,
    unsigned int	*hiPtr,
    unsigned int	*loPtr)
{
    size_t		r = index + 1;
    int			k = TclMSB(r);
    int			shift = (k + 1) >> 1;
    unsigned int	lobits = (1 << shift) - 1;
    unsigned int	hibits = 1 << (k - shift);

    *hiPtr	= (lobits << 1) - ((k & 1) * hibits)
			+ ((r >> shift) & (hibits - 1));
    *loPtr	= r & lobits;
}

/*
 *----------------------------------------------------------------------
 *
 * TclBAInvertIndices --
 *
 *	Given a size_t index pair hi, lo into the store[] of a BrodnikArray,
 *	compute and return the size_t index of the element found there.
 *
 * Results:
 *	The size_t index value.
 *
 * Side effects:
 *	None.
 *----------------------------------------------------------------------
 */

size_t
TclBAInvertIndices(
    unsigned int	hi,
    unsigned int	lo)
{
    size_t		plus2 = hi + 2;
    int			n = TclMSB(plus2) - 1;
    unsigned int	bit = (((size_t)1)<<n);
    int			SB = 2*n + ((plus2 & bit)!=0);
    size_t		base = (((size_t)1)<<SB) - 1;
    size_t		increment = ((bit - 1) & plus2) << n;

    return base + increment + lo;
}