summaryrefslogtreecommitdiffstats
path: root/Python/pyhash.c
blob: 57a2da715e17f09875c7416d66ddbdde54244ebc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
/* Set of hash utility functions to help maintaining the invariant that
    if a==b then hash(a)==hash(b)

   All the utility functions (_Py_Hash*()) return "-1" to signify an error.
*/
#include "Python.h"

#ifdef __APPLE__
#  include <libkern/OSByteOrder.h>
#elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
#  include <endian.h>
#elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
#  include <sys/endian.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

_Py_HashSecret_t _Py_HashSecret;

#if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
extern PyHash_FuncDef PyHash_Func;
#else
static PyHash_FuncDef PyHash_Func;
#endif

/* Count _Py_HashBytes() calls */
#ifdef Py_HASH_STATS
#define Py_HASH_STATS_MAX 32
static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
#endif

/* For numeric types, the hash of a number x is based on the reduction
   of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
   hash(x) == hash(y) whenever x and y are numerically equal, even if
   x and y have different types.

   A quick summary of the hashing strategy:

   (1) First define the 'reduction of x modulo P' for any rational
   number x; this is a standard extension of the usual notion of
   reduction modulo P for integers.  If x == p/q (written in lowest
   terms), the reduction is interpreted as the reduction of p times
   the inverse of the reduction of q, all modulo P; if q is exactly
   divisible by P then define the reduction to be infinity.  So we've
   got a well-defined map

      reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.

   (2) Now for a rational number x, define hash(x) by:

      reduce(x)   if x >= 0
      -reduce(-x) if x < 0

   If the result of the reduction is infinity (this is impossible for
   integers, floats and Decimals) then use the predefined hash value
   _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
   _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
   hashes of float and Decimal infinities and nans.

   A selling point for the above strategy is that it makes it possible
   to compute hashes of decimal and binary floating-point numbers
   efficiently, even if the exponent of the binary or decimal number
   is large.  The key point is that

      reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)

   provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
   binary or decimal float is never infinity, since the denominator is a power
   of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
   for nonnegative x,

      reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS

      reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS

   and reduce(10**e) can be computed efficiently by the usual modular
   exponentiation algorithm.  For reduce(2**e) it's even better: since
   P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
   by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.

   */

Py_hash_t
_Py_HashDouble(double v)
{
    int e, sign;
    double m;
    Py_uhash_t x, y;

    if (!Py_IS_FINITE(v)) {
        if (Py_IS_INFINITY(v))
            return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
        else
            return _PyHASH_NAN;
    }

    m = frexp(v, &e);

    sign = 1;
    if (m < 0) {
        sign = -1;
        m = -m;
    }

    /* process 28 bits at a time;  this should work well both for binary
       and hexadecimal floating point. */
    x = 0;
    while (m) {
        x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
        m *= 268435456.0;  /* 2**28 */
        e -= 28;
        y = (Py_uhash_t)m;  /* pull out integer part */
        m -= y;
        x += y;
        if (x >= _PyHASH_MODULUS)
            x -= _PyHASH_MODULUS;
    }

    /* adjust for the exponent;  first reduce it modulo _PyHASH_BITS */
    e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
    x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);

    x = x * sign;
    if (x == (Py_uhash_t)-1)
        x = (Py_uhash_t)-2;
    return (Py_hash_t)x;
}

Py_hash_t
_Py_HashPointer(void *p)
{
    Py_hash_t x;
    size_t y = (size_t)p;
    /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
       excessive hash collisions for dicts and sets */
    y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
    x = (Py_hash_t)y;
    if (x == -1)
        x = -2;
    return x;
}

Py_hash_t
_Py_HashBytes(const void *src, Py_ssize_t len)
{
    Py_hash_t x;
    /*
      We make the hash of the empty string be 0, rather than using
      (prefix ^ suffix), since this slightly obfuscates the hash secret
    */
    if (len == 0) {
        return 0;
    }

#ifdef Py_HASH_STATS
    hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
#endif

#if Py_HASH_CUTOFF > 0
    if (len < Py_HASH_CUTOFF) {
        /* Optimize hashing of very small strings with inline DJBX33A. */
        Py_uhash_t hash;
        const unsigned char *p = src;
        hash = 5381; /* DJBX33A starts with 5381 */

        switch(len) {
            /* ((hash << 5) + hash) + *p == hash * 33 + *p */
            case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
            case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
            case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
            case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
            case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
            case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
            case 1: hash = ((hash << 5) + hash) + *p++; break;
            default:
                assert(0);
        }
        hash ^= len;
        hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
        x = (Py_hash_t)hash;
    }
    else
#endif /* Py_HASH_CUTOFF */
        x = PyHash_Func.hash(src, len);

    if (x == -1)
        return -2;
    return x;
}

void
_PyHash_Fini(void)
{
#ifdef Py_HASH_STATS
    int i;
    Py_ssize_t total = 0;
    char *fmt = "%2i %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n";

    fprintf(stderr, "len   calls    total\n");
    for (i = 1; i <= Py_HASH_STATS_MAX; i++) {
        total += hashstats[i];
        fprintf(stderr, fmt, i, hashstats[i], total);
    }
    total += hashstats[0];
    fprintf(stderr, ">  %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n",
            hashstats[0], total);
#endif
}

PyHash_FuncDef *
PyHash_GetFuncDef(void)
{
    return &PyHash_Func;
}

/* Optimized memcpy() for Windows */
#ifdef _MSC_VER
#  if SIZEOF_PY_UHASH_T == 4
#    define PY_UHASH_CPY(dst, src) do {                                    \
       dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
       } while(0)
#  elif SIZEOF_PY_UHASH_T == 8
#    define PY_UHASH_CPY(dst, src) do {                                    \
       dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
       dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
       } while(0)
#  else
#    error SIZEOF_PY_UHASH_T must be 4 or 8
#  endif /* SIZEOF_PY_UHASH_T */
#else /* not Windows */
#  define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
#endif /* _MSC_VER */


#if Py_HASH_ALGORITHM == Py_HASH_FNV
/* **************************************************************************
 * Modified Fowler-Noll-Vo (FNV) hash function
 */
static Py_hash_t
fnv(const void *src, Py_ssize_t len)
{
    const unsigned char *p = src;
    Py_uhash_t x;
    Py_ssize_t remainder, blocks;
    union {
        Py_uhash_t value;
        unsigned char bytes[SIZEOF_PY_UHASH_T];
    } block;

#ifdef Py_DEBUG
    assert(_Py_HashSecret_Initialized);
#endif
    remainder = len % SIZEOF_PY_UHASH_T;
    if (remainder == 0) {
        /* Process at least one block byte by byte to reduce hash collisions
         * for strings with common prefixes. */
        remainder = SIZEOF_PY_UHASH_T;
    }
    blocks = (len - remainder) / SIZEOF_PY_UHASH_T;

    x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
    x ^= (Py_uhash_t) *p << 7;
    while (blocks--) {
        PY_UHASH_CPY(block.bytes, p);
        x = (_PyHASH_MULTIPLIER * x) ^ block.value;
        p += SIZEOF_PY_UHASH_T;
    }
    /* add remainder */
    for (; remainder > 0; remainder--)
        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
    x ^= (Py_uhash_t) len;
    x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
    if (x == -1) {
        x = -2;
    }
    return x;
}

static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
                                     16 * SIZEOF_PY_HASH_T};

#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */


#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
/* **************************************************************************
 <MIT License>
 Copyright (c) 2013  Marek Majkowski <marek@popcount.org>

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 </MIT License>

 Original location:
    https://github.com/majek/csiphash/

 Solution inspired by code from:
    Samuel Neves (supercop/crypto_auth/siphash24/little)
    djb (supercop/crypto_auth/siphash24/little2)
    Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)

 Modified for Python by Christian Heimes:
    - C89 / MSVC compatibility
    - _rotl64() on Windows
    - letoh64() fallback
*/

/* byte swap little endian to host endian
 * Endian conversion not only ensures that the hash function returns the same
 * value on all platforms. It is also required to for a good dispersion of
 * the hash values' least significant bits.
 */
#if PY_LITTLE_ENDIAN
#  define _le64toh(x) ((uint64_t)(x))
#elif defined(__APPLE__)
#  define _le64toh(x) OSSwapLittleToHostInt64(x)
#elif defined(HAVE_LETOH64)
#  define _le64toh(x) le64toh(x)
#else
#  define _le64toh(x) (((uint64_t)(x) << 56) | \
                      (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
                      (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
                      (((uint64_t)(x) << 8)  & 0xff00000000ULL) | \
                      (((uint64_t)(x) >> 8)  & 0xff000000ULL) | \
                      (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
                      (((uint64_t)(x) >> 40) & 0xff00ULL) | \
                      ((uint64_t)(x)  >> 56))
#endif


#ifdef _MSC_VER
#  define ROTATE(x, b)  _rotl64(x, b)
#else
#  define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
#endif

#define HALF_ROUND(a,b,c,d,s,t)         \
    a += b; c += d;             \
    b = ROTATE(b, s) ^ a;           \
    d = ROTATE(d, t) ^ c;           \
    a = ROTATE(a, 32);

#define DOUBLE_ROUND(v0,v1,v2,v3)       \
    HALF_ROUND(v0,v1,v2,v3,13,16);      \
    HALF_ROUND(v2,v1,v0,v3,17,21);      \
    HALF_ROUND(v0,v1,v2,v3,13,16);      \
    HALF_ROUND(v2,v1,v0,v3,17,21);


static Py_hash_t
siphash24(const void *src, Py_ssize_t src_sz) {
    uint64_t k0 = _le64toh(_Py_HashSecret.siphash.k0);
    uint64_t k1 = _le64toh(_Py_HashSecret.siphash.k1);
    uint64_t b = (uint64_t)src_sz << 56;
    const uint64_t *in = (uint64_t*)src;

    uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
    uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
    uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
    uint64_t v3 = k1 ^ 0x7465646279746573ULL;

    uint64_t t;
    uint8_t *pt;
    uint8_t *m;

    while (src_sz >= 8) {
        uint64_t mi = _le64toh(*in);
        in += 1;
        src_sz -= 8;
        v3 ^= mi;
        DOUBLE_ROUND(v0,v1,v2,v3);
        v0 ^= mi;
    }

    t = 0;
    pt = (uint8_t *)&t;
    m = (uint8_t *)in;
    switch (src_sz) {
        case 7: pt[6] = m[6];
        case 6: pt[5] = m[5];
        case 5: pt[4] = m[4];
        case 4: memcpy(pt, m, sizeof(uint32_t)); break;
        case 3: pt[2] = m[2];
        case 2: pt[1] = m[1];
        case 1: pt[0] = m[0];
    }
    b |= _le64toh(t);

    v3 ^= b;
    DOUBLE_ROUND(v0,v1,v2,v3);
    v0 ^= b;
    v2 ^= 0xff;
    DOUBLE_ROUND(v0,v1,v2,v3);
    DOUBLE_ROUND(v0,v1,v2,v3);

    /* modified */
    t = (v0 ^ v1) ^ (v2 ^ v3);
    return (Py_hash_t)t;
}

static PyHash_FuncDef PyHash_Func = {siphash24, "siphash24", 64, 128};

#endif /* Py_HASH_ALGORITHM == Py_HASH_SIPHASH24 */

#ifdef __cplusplus
}
#endif
for long periods of time. .PP \fBTcl_RegExpExec\fR executes the regular expression pattern matcher. It returns 1 if \fItext\fR contains a range of characters that match \fIregexp\fR, 0 if no match is found, and \-1 if an error occurs. In the case of an error, \fBTcl_RegExpExec\fR leaves an error message in the interpreter result. When searching a string for multiple matches of a pattern, it is important to distinguish between the start of the original string and the start of the current search. For example, when searching for the second occurrence of a match, the \fItext\fR argument might point to the character just after the first match; however, it is important for the pattern matcher to know that this is not the start of the entire string, so that it doesn't allow \fB^\fR atoms in the pattern to match. The \fIstart\fR argument provides this information by pointing to the start of the overall string containing \fItext\fR. \fIStart\fR will be less than or equal to \fItext\fR; if it is less than \fItext\fR then no \fB^\fR matches will be allowed. .PP \fBTcl_RegExpRange\fR may be invoked after \fBTcl_RegExpExec\fR returns; it provides detailed information about what ranges of the string matched what parts of the pattern. \fBTcl_RegExpRange\fR returns a pair of pointers in \fI*startPtr\fR and \fI*endPtr\fR that identify a range of characters in the source string for the most recent call to \fBTcl_RegExpExec\fR. \fIIndex\fR indicates which of several ranges is desired: if \fIindex\fR is 0, information is returned about the overall range of characters that matched the entire pattern; otherwise, information is returned about the range of characters that matched the \fIindex\fR'th parenthesized subexpression within the pattern. If there is no range corresponding to \fIindex\fR then NULL is stored in \fI*startPtr\fR and \fI*endPtr\fR. .PP \fBTcl_GetRegExpFromObj\fR, \fBTcl_RegExpExecObj\fR, and \fBTcl_RegExpGetInfo\fR are object interfaces that provide the most direct control of Henry Spencer's regular expression library. For users that need to modify compilation and execution options directly, it is recommended that you use these interfaces instead of calling the internal regexp functions. These interfaces handle the details of UTF to Unicode translations as well as providing improved performance through caching in the pattern and string objects. .PP \fBTcl_GetRegExpFromObj\fR attempts to return a compiled regular expression from the \fIpatObj\fR. If the object does not already contain a compiled regular expression it will attempt to create one from the string in the object and assign it to the internal representation of the \fIpatObj\fR. The return value of this function is of type \fBTcl_RegExp\fR. The return value is a token for this compiled form, which can be used in subsequent calls to \fBTcl_RegExpExecObj\fR or \fBTcl_RegExpGetInfo\fR. If an error occurs while compiling the regular expression then \fBTcl_GetRegExpFromObj\fR returns NULL and leaves an error message in the interpreter result. The regular expression token can be used as long as the internal representation of \fIpatObj\fR refers to the compiled form. The \fIeflags\fR argument is a bit-wise OR of zero or more of the following flags that control the compilation of \fIpatObj\fR: .RS 2 .TP \fBTCL_REG_ADVANCED\fR Compile advanced regular expressions .PQ ARE s . This mode corresponds to the normal regular expression syntax accepted by the Tcl \fBregexp\fR and \fBregsub\fR commands. .TP \fBTCL_REG_EXTENDED\fR Compile extended regular expressions .PQ ERE s . This mode corresponds to the regular expression syntax recognized by Tcl 8.0 and earlier versions. .TP \fBTCL_REG_BASIC\fR Compile basic regular expressions .PQ BRE s . This mode corresponds to the regular expression syntax recognized by common Unix utilities like \fBsed\fR and \fBgrep\fR. This is the default if no flags are specified. .TP \fBTCL_REG_EXPANDED\fR Compile the regular expression (basic, extended, or advanced) using an expanded syntax that allows comments and whitespace. This mode causes non-backslashed non-bracket-expression white space and #-to-end-of-line comments to be ignored. .TP \fBTCL_REG_QUOTE\fR Compile a literal string, with all characters treated as ordinary characters. .TP \fBTCL_REG_NOCASE\fR Compile for matching that ignores upper/lower case distinctions. .TP \fBTCL_REG_NEWLINE\fR Compile for newline-sensitive matching. By default, newline is a completely ordinary character with no special meaning in either regular expressions or strings. With this flag, .QW [^ bracket expressions and .QW . never match newline, .QW ^ matches an empty string after any newline in addition to its normal function, and .QW $ matches an empty string before any newline in addition to its normal function. \fBREG_NEWLINE\fR is the bit-wise OR of \fBREG_NLSTOP\fR and \fBREG_NLANCH\fR. .TP \fBTCL_REG_NLSTOP\fR Compile for partial newline-sensitive matching, with the behavior of .QW [^ bracket expressions and .QW . affected, but not the behavior of .QW ^ and .QW $ . In this mode, .QW [^ bracket expressions and .QW . never match newline. .TP \fBTCL_REG_NLANCH\fR Compile for inverse partial newline-sensitive matching, with the behavior of .QW ^ and .QW $ (the .QW anchors ) affected, but not the behavior of .QW [^ bracket expressions and .QW . . In this mode .QW ^ matches an empty string after any newline in addition to its normal function, and .QW $ matches an empty string before any newline in addition to its normal function. .TP \fBTCL_REG_NOSUB\fR Compile for matching that reports only success or failure, not what was matched. This reduces compile overhead and may improve performance. Subsequent calls to \fBTcl_RegExpGetInfo\fR or \fBTcl_RegExpRange\fR will not report any match information. .TP \fBTCL_REG_CANMATCH\fR Compile for matching that reports the potential to complete a partial match given more text (see below). .RE .PP Only one of \fBTCL_REG_EXTENDED\fR, \fBTCL_REG_ADVANCED\fR, \fBTCL_REG_BASIC\fR, and \fBTCL_REG_QUOTE\fR may be specified. .PP \fBTcl_RegExpExecObj\fR executes the regular expression pattern matcher. It returns 1 if \fIobjPtr\fR contains a range of characters that match \fIregexp\fR, 0 if no match is found, and \-1 if an error occurs. In the case of an error, \fBTcl_RegExpExecObj\fR leaves an error message in the interpreter result. The \fInmatches\fR value indicates to the matcher how many subexpressions are of interest. If \fInmatches\fR is 0, then no subexpression match information is recorded, which may allow the matcher to make various optimizations. If the value is -1, then all of the subexpressions in the pattern are remembered. If the value is a positive integer, then only that number of subexpressions will be remembered. Matching begins at the specified Unicode character index given by \fIoffset\fR. Unlike \fBTcl_RegExpExec\fR, the behavior of anchors is not affected by the offset value. Instead the behavior of the anchors is explicitly controlled by the \fIeflags\fR argument, which is a bit-wise OR of zero or more of the following flags: .RS 2 .TP \fBTCL_REG_NOTBOL\fR The starting character will not be treated as the beginning of a line or the beginning of the string, so .QW ^ will not match there. Note that this flag has no effect on how .QW \fB\eA\fR matches. .TP \fBTCL_REG_NOTEOL\fR The last character in the string will not be treated as the end of a line or the end of the string, so .QW $ will not match there. Note that this flag has no effect on how .QW \fB\eZ\fR matches. .RE .PP \fBTcl_RegExpGetInfo\fR retrieves information about the last match performed with a given regular expression \fIregexp\fR. The \fIinfoPtr\fR argument contains a pointer to a structure that is defined as follows: .PP .CS typedef struct Tcl_RegExpInfo { int \fInsubs\fR; Tcl_RegExpIndices *\fImatches\fR; long \fIextendStart\fR; } Tcl_RegExpInfo; .CE .PP The \fInsubs\fR field contains a count of the number of parenthesized subexpressions within the regular expression. If the \fBTCL_REG_NOSUB\fR was used, then this value will be zero. The \fImatches\fR field points to an array of \fInsubs\fR values that indicate the bounds of each subexpression matched. The first element in the array refers to the range matched by the entire regular expression, and subsequent elements refer to the parenthesized subexpressions in the order that they appear in the pattern. Each element is a structure that is defined as follows: .PP .CS typedef struct Tcl_RegExpIndices { long \fIstart\fR; long \fIend\fR; } Tcl_RegExpIndices; .CE .PP The \fIstart\fR and \fIend\fR values are Unicode character indices relative to the offset location within \fIobjPtr\fR where matching began. The \fIstart\fR index identifies the first character of the matched subexpression. The \fIend\fR index identifies the first character after the matched subexpression. If the subexpression matched the empty string, then \fIstart\fR and \fIend\fR will be equal. If the subexpression did not participate in the match, then \fIstart\fR and \fIend\fR will be set to -1. .PP The \fIextendStart\fR field in \fBTcl_RegExpInfo\fR is only set if the \fBTCL_REG_CANMATCH\fR flag was used. It indicates the first character in the string where a match could occur. If a match was found, this will be the same as the beginning of the current match. If no match was found, then it indicates the earliest point at which a match might occur if additional text is appended to the string. If it is no match is possible even with further text, this field will be set to -1. .SH "SEE ALSO" re_syntax(n) .SH KEYWORDS match, pattern, regular expression, string, subexpression, Tcl_RegExpIndices, Tcl_RegExpInfo