diff options
| -rw-r--r-- | Lib/sre.py | 13 | ||||
| -rw-r--r-- | Lib/sre_compile.py | 64 | ||||
| -rw-r--r-- | Lib/sre_constants.py | 18 | ||||
| -rw-r--r-- | Lib/sre_parse.py | 6 | ||||
| -rw-r--r-- | Modules/_sre.c | 743 | ||||
| -rw-r--r-- | Modules/sre.h | 26 | ||||
| -rw-r--r-- | Modules/sre_constants.h | 36 | 
7 files changed, 370 insertions, 536 deletions
@@ -5,8 +5,12 @@  #  # Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.  # +# This version of the SRE library can be redistributed under CNRI's +# Python 1.6 license.  For any other use, please contact Secret Labs +# AB (info@pythonware.com). +#  # Portions of this engine have been developed in cooperation with -# CNRI.  Hewlett-Packard provided funding for 2.0 integration and +# CNRI.  Hewlett-Packard provided funding for 1.6 integration and  # other compatibility work.  # @@ -24,7 +28,7 @@ M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE  S = DOTALL = sre_compile.SRE_FLAG_DOTALL  X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE -# sre extensions (may or may not be in 2.0 final) +# sre extensions (may or may not be in 1.6/2.0 final)  T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE  U = UNICODE = sre_compile.SRE_FLAG_UNICODE @@ -168,15 +172,14 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile)  class Scanner:      def __init__(self, lexicon): -        from sre_constants import BRANCH, SUBPATTERN, INDEX +        from sre_constants import BRANCH, SUBPATTERN          self.lexicon = lexicon          # combine phrases into a compound pattern          p = []          s = sre_parse.Pattern()          for phrase, action in lexicon:              p.append(sre_parse.SubPattern(s, [ -                (SUBPATTERN, (None, sre_parse.parse(phrase))), -                (INDEX, len(p)) +                (SUBPATTERN, (len(p), sre_parse.parse(phrase))),                  ]))          p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])          s.groups = len(p) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index ef26e1c..2d1cbb1 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -5,9 +5,7 @@  #  # Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.  # -# Portions of this engine have been developed in cooperation with -# CNRI.  Hewlett-Packard provided funding for 2.0 integration and -# other compatibility work. +# See the sre.py file for information on usage and redistribution.  #  import _sre @@ -124,6 +122,7 @@ def _compile(code, pattern, flags):                  emit(CHCODES[CATEGORY_NOT_LINEBREAK])          elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):              if flags & SRE_FLAG_TEMPLATE: +                raise error, "internal: unsupported template operator"                  emit(OPCODES[REPEAT])                  skip = len(code); emit(0)                  emit(av[0]) @@ -136,9 +135,8 @@ def _compile(code, pattern, flags):                  if lo == 0:                      raise error, "nothing to repeat"                  if 0 and lo == hi == 1 and op is MAX_REPEAT: -                    # FIXME: <fl> need a better way to figure out when -                    # it's safe to use this one (in the parser, probably) -                    emit(OPCODES[MAX_REPEAT_ONE]) +                    # FIXME: <fl> fast and wrong (but we'll fix that) +                    emit(OPCODES[REPEAT_ONE])                      skip = len(code); emit(0)                      emit(av[0])                      emit(av[1]) @@ -146,29 +144,24 @@ def _compile(code, pattern, flags):                      emit(OPCODES[SUCCESS])                      code[skip] = len(code) - skip                  else: -                    emit(OPCODES[op]) +                    emit(OPCODES[REPEAT])                      skip = len(code); emit(0)                      emit(av[0])                      emit(av[1]) -                    mark = MAXCODE -                    if av[2][0][0] == SUBPATTERN: -                        # repeated subpattern -                        gid, foo = av[2][0][1] -                        if gid: -                            mark = (gid-1)*2 -                    emit(mark)                      _compile(code, av[2], flags) -                    emit(OPCODES[SUCCESS])                      code[skip] = len(code) - skip +                    if op == MAX_REPEAT: +                        emit(OPCODES[MAX_UNTIL]) +                    else: +                        emit(OPCODES[MIN_UNTIL])          elif op is SUBPATTERN: -            gid = av[0] -            if gid: +            if av[0]:                  emit(OPCODES[MARK]) -                emit((gid-1)*2) +                emit((av[0]-1)*2)              _compile(code, av[1], flags) -            if gid: +            if av[0]:                  emit(OPCODES[MARK]) -                emit((gid-1)*2+1) +                emit((av[0]-1)*2+1)          elif op in (SUCCESS, FAILURE):              emit(OPCODES[op])          elif op in (ASSERT, ASSERT_NOT): @@ -197,11 +190,10 @@ def _compile(code, pattern, flags):              else:                  emit(ATCODES[av])          elif op is BRANCH: +            emit(OPCODES[op])              tail = []              for av in av[1]: -                emit(OPCODES[op])                  skip = len(code); emit(0) -                emit(MAXCODE) # save mark                  _compile(code, av, flags)                  emit(OPCODES[JUMP])                  tail.append(len(code)); emit(0) @@ -223,9 +215,6 @@ def _compile(code, pattern, flags):              else:                  emit(OPCODES[op])              emit(av-1) -        elif op in (MARK, INDEX): -            emit(OPCODES[op]) -            emit(av)          else:              raise ValueError, ("unsupported operand type", op) @@ -294,16 +283,7 @@ try:  except NameError:      pass -def compile(p, flags=0): -    # internal: convert pattern list to internal format - -    # compile, as necessary -    if type(p) in STRING_TYPES: -        import sre_parse -        pattern = p -        p = sre_parse.parse(p, flags) -    else: -        pattern = None +def _compile1(p, flags):      flags = p.pattern.flags | flags      code = [] @@ -316,6 +296,20 @@ def compile(p, flags=0):      code.append(OPCODES[SUCCESS]) +    return code + +def compile(p, flags=0): +    # internal: convert pattern list to internal format + +    if type(p) in STRING_TYPES: +        import sre_parse +        pattern = p +        p = sre_parse.parse(p, flags) +    else: +        pattern = None + +    code = _compile1(p, flags) +      # print code      # FIXME: <fl> get rid of this limitation! diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index ef32c32..e595915 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -6,9 +6,7 @@  #  # Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.  # -# Portions of this engine have been developed in cooperation with -# CNRI.  Hewlett-Packard provided funding for 2.0 integration and -# other compatibility work. +# See the sre.py file for information on usage and redistribution.  #  # should this really be here? @@ -33,15 +31,15 @@ GROUPREF = "groupref"  GROUPREF_IGNORE = "groupref_ignore"  IN = "in"  IN_IGNORE = "in_ignore" -INDEX = "index"  INFO = "info"  JUMP = "jump"  LITERAL = "literal"  LITERAL_IGNORE = "literal_ignore"  MARK = "mark"  MAX_REPEAT = "max_repeat" -MAX_REPEAT_ONE = "max_repeat_one" +MAX_UNTIL = "max_until"  MIN_REPEAT = "min_repeat" +MIN_UNTIL = "min_until"  NEGATE = "negate"  NOT_LITERAL = "not_literal"  NOT_LITERAL_IGNORE = "not_literal_ignore" @@ -91,19 +89,19 @@ OPCODES = [      CATEGORY,      CHARSET,      GROUPREF, GROUPREF_IGNORE, -    INDEX,      IN, IN_IGNORE,      INFO,      JUMP,      LITERAL, LITERAL_IGNORE,      MARK, -    MAX_REPEAT, -    MAX_REPEAT_ONE, -    MIN_REPEAT, +    MAX_UNTIL, +    MIN_UNTIL,      NOT_LITERAL, NOT_LITERAL_IGNORE,      NEGATE,      RANGE, -    REPEAT +    REPEAT, +    REPEAT_ONE, +    SUBPATTERN  ] diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 1b56352..299aa0e 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -5,9 +5,7 @@  #  # Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.  # -# Portions of this engine have been developed in cooperation with -# CNRI.  Hewlett-Packard provided funding for 2.0 integration and -# other compatibility work. +# See the sre.py file for information on usage and redistribution.  #  import string, sys @@ -536,8 +534,6 @@ def _parse(source, state):                      group = state.getgroup(name)                  p = _parse_sub(source, state)                  subpattern.append((SUBPATTERN, (group, p))) -                if group is not None: -                    p.append((INDEX, group))              else:                  while 1:                      char = source.get() diff --git a/Modules/_sre.c b/Modules/_sre.c index b0efc85..0b208e6 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -5,37 +5,29 @@   *   * partial history:   * 99-10-24 fl  created (based on existing template matcher code) - * 99-11-13 fl  added categories, branching, and more (0.2) - * 99-11-16 fl  some tweaks to compile on non-Windows platforms - * 99-12-18 fl  non-literals, generic maximizing repeat (0.3) - * 00-02-28 fl  tons of changes (not all to the better ;-) (0.4)   * 00-03-06 fl  first alpha, sort of (0.5) - * 00-03-14 fl  removed most compatibility stuff (0.6) - * 00-05-10 fl  towards third alpha (0.8.2) - * 00-05-13 fl  added experimental scanner stuff (0.8.3) - * 00-05-27 fl  final bug hunt (0.8.4) - * 00-06-21 fl  less bugs, more taste (0.8.5) - * 00-06-25 fl  major changes to better deal with nested repeats (0.9) - * 00-06-28 fl  fixed findall (0.9.1) - * 00-06-29 fl  fixed split, added more scanner features (0.9.2)   * 00-06-30 fl  added fast search optimization (0.9.3)   * 00-06-30 fl  added assert (lookahead) primitives, etc (0.9.4)   * 00-07-02 fl  added charset optimizations, etc (0.9.5)   * 00-07-03 fl  store code in pattern object, lookbehind, etc   * 00-07-08 fl  added regs attribute - * 00-07-18 fl  changed branch operator to use failure stack   * 00-07-21 fl  reset lastindex in scanner methods (0.9.6) + * 00-08-01 fl  fixes for 1.6b1 (0.9.8)   *   * Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.   * + * This version of the SRE library can be redistributed under CNRI's + * Python 1.6 license.  For any other use, please contact Secret Labs + * AB (info@pythonware.com). + *   * Portions of this engine have been developed in cooperation with - * CNRI.  Hewlett-Packard provided funding for 2.0 integration and + * CNRI.  Hewlett-Packard provided funding for 1.6 integration and   * other compatibility work.   */  #ifndef SRE_RECURSIVE -char copyright[] = " SRE 0.9.6 Copyright (c) 1997-2000 by Secret Labs AB "; +char copyright[] = " SRE 0.9.8 Copyright (c) 1997-2000 by Secret Labs AB ";  #include "Python.h" @@ -67,7 +59,7 @@ char copyright[] = " SRE 0.9.6 Copyright (c) 1997-2000 by Secret Labs AB ";  #define USE_FAST_SEARCH  /* enables aggressive inlining (always on for Visual C) */ -#define USE_INLINE +#undef USE_INLINE  /* -------------------------------------------------------------------- */ @@ -84,6 +76,7 @@ char copyright[] = " SRE 0.9.6 Copyright (c) 1997-2000 by Secret Labs AB ";  /* error codes */  #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */ +#define SRE_ERROR_STATE -2 /* illegal state */  #define SRE_ERROR_MEMORY -9 /* out of memory */  #if defined(VERBOSE) @@ -216,56 +209,85 @@ sre_category(SRE_CODE category, unsigned int ch)  /* helpers */ -LOCAL(int) -stack_free(SRE_STATE* state) +static void +mark_init(SRE_STATE* state)  { -    if (state->stack) { -        TRACE(("release stack\n")); -        free(state->stack); -        state->stack = NULL; -    } -    state->stacksize = 0; -    return 0; +    state->mark_stack = NULL; +    state->mark_stack_size = state->mark_stack_base = 0; +} + +static void +mark_fini(SRE_STATE* state) +{ +    if (state->mark_stack) +        free(state->mark_stack); +    mark_init(state);  } -static int /* shouldn't be LOCAL */ -stack_extend(SRE_STATE* state, int lo, int hi) +static int +mark_save(SRE_STATE* state, int lo, int hi)  { -    SRE_STACK* stack; -    int stacksize; +    void* stack; +    int size; +    int minsize, newsize; + +    if (hi <= lo) +        return 0; -    /* grow the stack to a suitable size; we need at least lo entries, -       at most hi entries.  if for some reason hi is lower than lo, lo -       wins */ +    size = (hi - lo) + 1; -    stacksize = state->stacksize; +    newsize = state->mark_stack_size; +    minsize = state->mark_stack_base + size; -    if (stacksize == 0) { +    if (newsize < minsize) {          /* create new stack */ -        stacksize = 512; -        if (stacksize < lo) -            stacksize = lo; -        else if (stacksize > hi) -            stacksize = hi; -        TRACE(("allocate stack %d\n", stacksize)); -        stack = malloc(sizeof(SRE_STACK) * stacksize); -    } else { -        /* grow the stack (typically by a factor of two) */ -        while (stacksize < lo) -            stacksize = 2 * stacksize; -        /* FIXME: <fl> could trim size if it's much larger than hi, -           as long it's larger than lo */ -        TRACE(("grow stack to %d\n", stacksize)); -        stack = realloc(state->stack, sizeof(SRE_STACK) * stacksize); +        if (!newsize) { +            newsize = 512; +            if (newsize < minsize) +                newsize = minsize; +            TRACE(("allocate stack %d\n", newsize)); +            stack = malloc(sizeof(void*) * newsize); +        } else { +            /* grow the stack */ +            while (newsize < minsize) +                newsize += newsize; +            TRACE(("grow stack to %d\n", newsize)); +            stack = realloc(state->mark_stack, sizeof(void*) * newsize); +        } +        if (!stack) { +            mark_fini(state); +            return SRE_ERROR_MEMORY; +        } +        state->mark_stack = stack; +        state->mark_stack_size = newsize;      } -    if (!stack) { -        stack_free(state); -        return SRE_ERROR_MEMORY; -    } +    TRACE(("copy %d:%d to %d\n", lo, hi, state->mark_stack_base)); + +    memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo, +           size * sizeof(void*)); -    state->stack = stack; -    state->stacksize = stacksize; +    state->mark_stack_base += size; + +    return 0; +} + +static int +mark_restore(SRE_STATE* state, int lo, int hi) +{ +    int size; + +    if (hi <= lo) +        return 0; + +    size = (hi - lo) + 1; + +    state->mark_stack_base -= size; + +    TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base)); + +    memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base, +           size * sizeof(void*));      return 0;  } @@ -372,28 +394,28 @@ SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)              return !ok;          case SRE_OP_LITERAL: -            /* args: <literal> */ +            /* <LITERAL> <code> */              if (ch == set[0])                  return ok;              set++;              break;          case SRE_OP_RANGE: -            /* args: <lower> <upper> */ +            /* <RANGE> <lower> <upper> */              if (set[0] <= ch && ch <= set[1])                  return ok;              set += 2;              break;          case SRE_OP_CHARSET: -            /* args: <bitmap> (16 bits per code word) */ +            /* <CHARSET> <bitmap> (16 bits per code word) */              if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))                  return ok;              set += 16;              break;          case SRE_OP_CATEGORY: -            /* args: <category> */ +            /* <CATEGORY> <code> */              if (sre_category(set[0], (int) ch))                  return ok;              set += 1; @@ -415,39 +437,17 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)      SRE_CHAR* end = state->end;      SRE_CHAR* ptr = state->ptr; -    int stack; -    int stackbase; -    int lastmark;      int i, count; -    SRE_STACK* sp; - -    /* FIXME: this is a hack! */ -    void* mark_copy[SRE_MARK_SIZE]; -    void* mark = NULL; - -#define PUSH(skip_, mark_, max_)\ -    if (stack >= state->stacksize) {\ -        i = stack_extend(state, stack + 1, stackbase + max_);\ -        if (i < 0)\ -            return i;\ -    }\ -    TRACE(("%8d: stack[%d]\n", PTR(ptr), stack));\ -    sp = state->stack + (stack++);\ -    sp->ptr = ptr;\ -    sp->pattern = pattern + skip_;\ -    sp->mark = mark_;\ -    if (mark_ != 65535) {\ -        sp->mark0 = state->mark[mark_];\ -        sp->mark1 = state->mark[mark_+1];\ -        TRACE(("          mark %d %d %d\n", mark_, PTR(state->mark[mark_]),\ -               PTR(state->mark[mark_+1])));\ -    }\ +    SRE_REPEAT* rp; +    int lastmark; + +    SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */      TRACE(("%8d: enter\n", PTR(ptr)));      if (pattern[0] == SRE_OP_INFO) {          /* optimization info block */ -        /* args: <1=skip> <2=flags> <3=min> ... */ +        /* <INFO> <1=skip> <2=flags> <3=min> ... */          if (pattern[3] && (end - ptr) < pattern[3]) {              TRACE(("reject (got %d chars, need %d)\n",                     (end - ptr), pattern[3])); @@ -456,11 +456,6 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)          pattern += pattern[1] + 1;      } -    stackbase = stack = state->stackbase; -    lastmark = state->lastmark; - -  retry: -      for (;;) {          switch (*pattern++) { @@ -468,30 +463,30 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)          case SRE_OP_FAILURE:              /* immediate failure */              TRACE(("%8d: failure\n", PTR(ptr))); -            goto failure; +            return 0;          case SRE_OP_SUCCESS:              /* end of pattern */              TRACE(("%8d: success\n", PTR(ptr)));              state->ptr = ptr; -            goto success; +            return 1;          case SRE_OP_AT:              /* match at given position */ -            /* args: <at> */ +            /* <AT> <code> */              TRACE(("%8d: position %d\n", PTR(ptr), *pattern));              if (!SRE_AT(state, ptr, *pattern)) -                goto failure; +                return 0;              pattern++;              break;          case SRE_OP_CATEGORY:              /* match at given category */ -            /* args: <category> */ +            /* <CATEGORY> <code> */              TRACE(("%8d: category %d [category %d]\n", PTR(ptr),                     *ptr, *pattern));              if (ptr >= end || !sre_category(pattern[0], ptr[0])) -                goto failure; +                return 0;              TRACE(("%8d: category ok\n", PTR(ptr)));              pattern++;              ptr++; @@ -499,43 +494,44 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)          case SRE_OP_LITERAL:              /* match literal string */ -            /* args: <code> */ +            /* <LITERAL> <code> */              TRACE(("%8d: literal %c\n", PTR(ptr), pattern[0]));              if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0]) -                goto failure; +                return 0;              pattern++;              ptr++;              break;          case SRE_OP_NOT_LITERAL:              /* match anything that is not literal character */ -            /* args: <code> */ +            /* <NOT_LITERAL> <code> */              TRACE(("%8d: literal not %c\n", PTR(ptr), pattern[0]));              if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0]) -                goto failure; +                return 0;              pattern++;              ptr++;              break;          case SRE_OP_ANY:              /* match anything */ +            /* <ANY> */              TRACE(("%8d: anything\n", PTR(ptr)));              if (ptr >= end) -                goto failure; +                return 0;              ptr++;              break;          case SRE_OP_IN:              /* match set member (or non_member) */ -            /* args: <skip> <set> */ +            /* <IN> <skip> <set> */              TRACE(("%8d: set %c\n", PTR(ptr), *ptr));              if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr)) -                goto failure; +                return 0;              pattern += pattern[0];              ptr++;              break; -        case SRE_OP_GROUP: +        case SRE_OP_GROUPREF:              /* match backreference */              TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));              i = pattern[0]; @@ -543,17 +539,17 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)                  SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];                  SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];                  if (!p || !e || e < p) -                    goto failure; +                    return 0;                  while (p < e) {                      if (ptr >= end || *ptr != *p) -                        goto failure; +                        return 0;                      p++; ptr++;                  }              }              pattern++;              break; -        case SRE_OP_GROUP_IGNORE: +        case SRE_OP_GROUPREF_IGNORE:              /* match backreference */              TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));              i = pattern[0]; @@ -561,11 +557,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)                  SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];                  SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];                  if (!p || !e || e < p) -                    goto failure; +                    return 0;                  while (p < e) {                      if (ptr >= end ||                          state->lower(*ptr) != state->lower(*p)) -                        goto failure; +                        return 0;                      p++; ptr++;                  }              } @@ -576,7 +572,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)              TRACE(("%8d: literal lower(%c)\n", PTR(ptr), pattern[0]));              if (ptr >= end ||                  state->lower(*ptr) != state->lower(*pattern)) -                goto failure; +                return 0;              pattern++;              ptr++;              break; @@ -585,7 +581,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)              TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), pattern[0]));              if (ptr >= end ||                  state->lower(*ptr) == state->lower(*pattern)) -                goto failure; +                return 0;              pattern++;              ptr++;              break; @@ -594,359 +590,211 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)              TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));              if (ptr >= end                  || !SRE_MEMBER(pattern+1, (SRE_CODE) state->lower(*ptr))) -                goto failure; +                return 0;              pattern += pattern[0];              ptr++;              break;          case SRE_OP_MARK:              /* set mark */ -            /* args: <mark> */ +            /* <MARK> <gid> */              TRACE(("%8d: set mark %d\n", PTR(ptr), pattern[0])); -            if (state->lastmark < pattern[0]+1) -                state->lastmark = pattern[0]+1; -            if (!mark) { -                mark = mark_copy; -                memcpy(mark, state->mark, state->lastmark*sizeof(void*)); -            } -            state->mark[pattern[0]] = ptr; -            pattern++; -            break; - -        case SRE_OP_INDEX: -            /* set index */ -            /* args: <index> */ -            TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0])); -            state->lastindex = pattern[0]; +            i = pattern[0]; +            if (i & 1) +                state->lastindex = i/2 + 1; +            if (i > state->lastmark) +                state->lastmark = i; +            state->mark[i] = ptr;              pattern++;              break;          case SRE_OP_JUMP:          case SRE_OP_INFO:              /* jump forward */ -            /* args: <skip> */ +            /* <JUMP> <offset> */              TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));              pattern += pattern[0];              break;          case SRE_OP_ASSERT:              /* assert subpattern */ -            /* args: <skip> <back> <pattern> */ +            /* <ASSERT> <skip> <back> <pattern> */              TRACE(("%8d: assert subpattern %d\n", PTR(ptr), pattern[1]));              state->ptr = ptr - pattern[1];              if (state->ptr < state->beginning) -                goto failure; +                return 0;              i = SRE_MATCH(state, pattern + 2); -            if (i < 0) +            if (i <= 0)                  return i; -            if (!i) -                goto failure; +            if (pattern[1] > 0 && state->ptr != ptr) +                return SRE_ERROR_STATE;              pattern += pattern[0];              break;          case SRE_OP_ASSERT_NOT:              /* assert not subpattern */ -            /* args: <skip> <pattern> */ +            /* <ASSERT_NOT> <skip> <back> <pattern> */              TRACE(("%8d: assert not subpattern %d\n", PTR(ptr), pattern[1]));              state->ptr = ptr - pattern[1];              if (state->ptr < state->beginning) -                goto failure; +                return 0;              i = SRE_MATCH(state, pattern + 2);              if (i < 0)                  return i;              if (i) -                goto failure; +                return 0; +            if (pattern[1] > 0 && state->ptr != ptr) +                return SRE_ERROR_STATE;              pattern += pattern[0];              break;          case SRE_OP_BRANCH: -            /* try an alternate branch */ -            /* format: <branch> <0=skip> <1=mark> <tail...> */ +            /* alternation */ +            /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */              TRACE(("%8d: branch\n", PTR(ptr))); -            if (pattern[2] != SRE_OP_LITERAL || -                (ptr < end && (SRE_CODE) ptr[0] == pattern[3])) { -                /* worth trying */ -                PUSH(pattern[0], pattern[3], 1); -                pattern += 2; -            } else -                pattern += pattern[0]; -            break; - -#if 0 -        case SRE_OP_MAX_REPEAT_ONE: -            /* match repeated sequence (maximizing regexp) */ - -            /* this operator only works if the repeated item is -               exactly one character wide, and we're not already -               collecting backtracking points.  for other cases, -               use the MAX_REPEAT operator instead */ - -            /* args: <skip> <min> <max> <step> */ -            TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr), -                   pattern[1], pattern[2])); - -            count = 0; - -            if (pattern[3] == SRE_OP_ANY) { -                /* repeated wildcard.  skip to the end of the target -                   string, and backtrack from there */ -                /* FIXME: must look for line endings */ -                if (ptr + pattern[1] > end) -                    goto failure; /* cannot match */ -                count = pattern[2]; -                if (count > end - ptr) -                    count = end - ptr; -                ptr += count; - -            } else if (pattern[3] == SRE_OP_LITERAL) { -                /* repeated literal */ -                SRE_CODE chr = pattern[4]; -                while (count < (int) pattern[2]) { -                    if (ptr >= end || (SRE_CODE) ptr[0] != chr) -                        break; -                    ptr++; -                    count++; +            { +                lastmark = state->lastmark; +                while (pattern[0]) { +                    TRACE(("%8d: try branch\n", PTR(ptr))); +                    if (pattern[2] != SRE_OP_LITERAL || +                        (ptr < end && (SRE_CODE) ptr[0] == pattern[3])) { +                        state->ptr = ptr; +                        i = SRE_MATCH(state, pattern + 1); +                        if (i) +                            return i; +                    } +                    while (state->lastmark > lastmark) +                        state->mark[state->lastmark--] = NULL; +                    pattern += pattern[0];                  } +            } +            return 0; -            } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) { -                /* repeated literal */ -                SRE_CODE chr = pattern[4]; -                while (count < (int) pattern[2]) { -                    if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr) -                        break; -                    ptr++; -                    count++; -                } +        case SRE_OP_REPEAT: +            /* create repeat context.  all the hard work is done +               by the UNTIL operator */ +            /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */ +            TRACE(("%8d: repeat {%d,%d}\n", PTR(ptr), +                   pattern[1], pattern[2])); -            } else if (pattern[3] == SRE_OP_NOT_LITERAL) { -                /* repeated non-literal */ -                SRE_CODE chr = pattern[4]; -                while (count < (int) pattern[2]) { -                    if (ptr >= end || (SRE_CODE) ptr[0] == chr) -                        break; -                    ptr++; -                    count++; -                } +            state->ptr = ptr; -            } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) { -                /* repeated non-literal */ -                SRE_CODE chr = pattern[4]; -                while (count < (int) pattern[2]) { -                    if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr) -                        break; -                    ptr++; -                    count++; -                } +            rep.count = -1; +            rep.pattern = pattern; -            } else if (pattern[3] == SRE_OP_IN) { -                /* repeated set */ -                while (count < (int) pattern[2]) { -                    if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr)) -                        break; -                    ptr++; -                    count++; -                } +            /* install new repeat context */ +            rep.prev = state->repeat; +            state->repeat = &rep; -            } else { -                /* repeated single character pattern */ -                state->ptr = ptr; -                while (count < (int) pattern[2]) { -                    i = SRE_MATCH(state, pattern + 3); -                    if (i < 0) -                        return i; -                    if (!i) -                        break; -                    count++; -                } -                state->ptr = ptr; -                ptr += count; -            } +            i = SRE_MATCH(state, pattern + pattern[0]); -            /* when we arrive here, count contains the number of -               matches, and ptr points to the tail of the target -               string.  check if the rest of the pattern matches, and -               backtrack if not. */ +            state->repeat = rep.prev; +            /* free(rp); */ -            TRACE(("%8d: repeat %d found\n", PTR(ptr), count)); +            return i; -            if (count < (int) pattern[1]) -                goto failure; +        case SRE_OP_MAX_UNTIL: +            /* maximizing repeat */ +            /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */ -            if (pattern[pattern[0]] == SRE_OP_SUCCESS) { -                /* tail is empty.  we're finished */ -                TRACE(("%8d: tail is empty\n", PTR(ptr))); -                state->ptr = ptr; -                goto success; - -            } else if (pattern[pattern[0]] == SRE_OP_LITERAL) { -                /* tail starts with a literal. skip positions where -                   the rest of the pattern cannot possibly match */ -                SRE_CODE chr = pattern[pattern[0]+1]; -                TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr)); -                for (;;) { -                    TRACE(("%8d: scan for tail match\n", PTR(ptr))); -                    while (count >= (int) pattern[1] && -                           (ptr >= end || *ptr != chr)) { -                        ptr--; -                        count--; -                    } -                    TRACE(("%8d: check tail\n", PTR(ptr))); -                    if (count < (int) pattern[1]) -                        break; -                    state->ptr = ptr; -                    i = SRE_MATCH(state, pattern + pattern[0]); -                    if (i > 0) { -                        TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); -                        goto success; -                    } -                    TRACE(("%8d: BACKTRACK\n", PTR(ptr))); -                    ptr--; -                    count--; -                } +            /* FIXME: we probably need to deal with zero-width +               matches in here... */ -            } else { -                /* general case */ -                TRACE(("%8d: tail is pattern\n", PTR(ptr))); -                while (count >= (int) pattern[1]) { -                    state->ptr = ptr; -                    i = SRE_MATCH(state, pattern + pattern[0]); -                    if (i < 0) -                        return i; -                    if (i) { -                        TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); -                        goto success; -                    } -                    TRACE(("%8d: BACKTRACK\n", PTR(ptr))); -                    ptr--; -                    count--; -                } -            } -            goto failure; -#endif +            rp = state->repeat; +            if (!rp) +                return SRE_ERROR_STATE; -        case SRE_OP_MAX_REPEAT: -            /* match repeated sequence (maximizing regexp) */ -            /* args: <skip> <1=min> <2=max> <3=save> <4=item> */ +            state->ptr = ptr; -            TRACE(("%8d: max repeat (%d %d)\n", PTR(ptr), -                   pattern[1], pattern[2])); +            count = rp->count + 1; -            count = 0; -            state->ptr = ptr; +            TRACE(("%8d: max until %d\n", PTR(ptr), count)); -            /* match minimum number of items */ -            while (count < (int) pattern[1]) { -                i = SRE_MATCH(state, pattern + 4); -                if (i < 0) +            if (count < rp->pattern[1]) { +                /* not enough matches */ +                TRACE(("%8d: match item (required)\n", PTR(ptr))); +                rp->count = count; +                i = SRE_MATCH(state, rp->pattern + 3); +                if (i)                      return i; -                if (!i) -                    goto failure; -                if (state->ptr == ptr) { -                    /* if the match was successful but empty, set the -                       count to max and terminate the scanning loop */ -                    count = (int) pattern[2]; -                    break; -                } -                count++; -                ptr = state->ptr; +                rp->count = count - 1; +                state->ptr = ptr; +                return 0;              } -            TRACE(("%8d: found %d leading items\n", PTR(ptr), count)); - -            if (count < (int) pattern[1]) -                goto failure; - -            /* match maximum number of items, pushing alternate end -               points to the stack */ - -            while (pattern[2] == 65535 || count < (int) pattern[2]) { -                /* this position is valid; add it to the retry -                   stack */ -                PUSH(pattern[0], pattern[3], pattern[2]); -                /* match more stuff */ -                state->stackbase = stack; -                i = SRE_MATCH(state, pattern + 4); -                state->stackbase = stackbase; -                if (i < 0) +            if (count < rp->pattern[2] || rp->pattern[2] == 65535) { +                TRACE(("%8d: match item (optional)\n", PTR(ptr))); +                /* we may have enough matches, but if we can +                   match another item, do so */ +                rp->count = count; +                lastmark = state->lastmark; +                mark_save(state, 0, lastmark); +                i = SRE_MATCH(state, rp->pattern + 3); +                if (i)                      return i; -                if (!i) -                    break; -                if (state->ptr == ptr) { -                    count = (int) pattern[2]; -                    break; -                } -                /* move forward */ -                ptr = state->ptr; -                count++; +                mark_restore(state, 0, lastmark); +                rp->count = count - 1; +                state->ptr = ptr;              } -            /* when we get here, count is the number of successful -               matches, and ptr points to the tail. */ +            /* cannot match more repeated items here.  make sure the +               tail matches */ +            TRACE(("%8d: match tail\n", PTR(ptr))); +            state->repeat = rp->prev; +            i = SRE_MATCH(state, pattern); +            if (i) { +                /* free(rp); */ +                return i; +            } +            state->repeat = rp; +            return 0; -            TRACE(("%8d: skip +%d\n", PTR(ptr), pattern[0])); +        case SRE_OP_MIN_UNTIL: +            /* minimizing repeat */ +            /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */ -            pattern += pattern[0]; -            break; +            rp = state->repeat; +            if (!rp) +                return SRE_ERROR_STATE; -        case SRE_OP_MIN_REPEAT: -            /* match repeated sequence (minimizing regexp) */ -            /* args: <skip> <1=min> <2=max> <3=save> <4=item> */ +            count = rp->count + 1; + +            TRACE(("%8d: min until %d\n", PTR(ptr), count)); -            TRACE(("%8d: min repeat %d %d\n", PTR(ptr), -                   pattern[1], pattern[2])); -            count = 0;              state->ptr = ptr; -            /* match minimum number of items */ -            while (count < (int) pattern[1]) { -                i = SRE_MATCH(state, pattern + 4); -                if (i < 0) -                    return i; -                if (!i) -                    goto failure; -                count++; -            } -            /* move forward until the tail matches. */ -            while (count <= (int) pattern[2]) { -                ptr = state->ptr; -                i = SRE_MATCH(state, pattern + pattern[0]); -                if (i > 0) { -                    TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); -                    goto success; -                } -                state->ptr = ptr; /* backtrack */ -                i = SRE_MATCH(state, pattern + 4); -                if (i < 0) + +            if (count < rp->pattern[1]) { +                /* not enough matches */ +                TRACE(("%8d: match item (required)\n", PTR(ptr))); +                rp->count = count; +                i = SRE_MATCH(state, rp->pattern + 3); +                if (i)                      return i; -                if (!i) -                    goto failure; -                count++; +                rp->count = count-1; +                state->ptr = ptr; +                return 0;              } -            goto failure; -        case SRE_OP_REPEAT: -            /* TEMPLATE: match repeated sequence (no backtracking) */ -            /* args: <skip> <min> <max> */ -            TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2])); -            count = 0; -            state->ptr = ptr; -            while (count < (int) pattern[2]) { -                i = SRE_MATCH(state, pattern + 3); -                if (i < 0) -                    return i; -                if (!i) -                    break; -                if (state->ptr == ptr) { -                    count = (int) pattern[2]; -                    break; -                } -                count++; +            /* see if the tail matches */ +            TRACE(("%8d: match tail\n", PTR(ptr))); +            state->repeat = rp->prev; +            i = SRE_MATCH(state, pattern); +            if (i) { +                /* free(rp); */ +                return i;              } -            if (count <= (int) pattern[1]) -                goto failure; -            TRACE(("%8d: repeat %d matches\n", PTR(ptr), count)); -            pattern += pattern[0]; -            ptr = state->ptr; -            break; +            state->repeat = rp; + +            if (count >= rp->pattern[2] && rp->pattern[2] != 65535) +                return 0; + +            TRACE(("%8d: match item (optional)\n", PTR(ptr))); +            rp->count = count; +            i = SRE_MATCH(state, rp->pattern + 3); +            if (i) +                return i; +            rp->count = count - 1; +            return 0;          default:              TRACE(("%8d: unknown opcode %d\n", PTR(ptr), pattern[-1])); @@ -954,33 +802,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)          }      } -  failure: -    TRACE(("%8d: leave (failure)\n", PTR(ptr))); -    if (stack-- > stackbase) { -        TRACE(("%8d: pop stack[%d]\n", stack)); -        sp = state->stack + stack; -        ptr = sp->ptr; -        pattern = sp->pattern; -        if (sp->mark != 65535) { -            state->mark[sp->mark] = sp->mark0; -            state->mark[sp->mark+1] = sp->mark1; -        } -        TRACE(("%8d: retry (%d)\n", PTR(ptr), stack)); -        goto retry; -    } -    state->lastmark = lastmark; -    state->stackbase = stackbase; -    if (mark) -        memcpy(state->mark, mark, state->lastmark*sizeof(void*)); -    return 0; - -  success: -    TRACE(("%8d: leave (success)\n", PTR(ptr))); -    state->stackbase = stackbase; -    return 1; +    /* shouldn't end up here */ +    return SRE_ERROR_ILLEGAL;  } -LOCAL(int) +static int  SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)  {      SRE_CHAR* ptr = state->start; @@ -994,7 +820,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)      if (pattern[0] == SRE_OP_INFO) {          /* optimization info block */ -        /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix info>  */ +        /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info>  */          flags = pattern[2]; @@ -1191,6 +1017,24 @@ sre_getlower(PyObject* self, PyObject* args)      return Py_BuildValue("i", sre_lower(character));  } +LOCAL(void) +state_reset(SRE_STATE* state) +{ +    int i; + +    state->lastmark = 0; + +    /* FIXME: dynamic! */ +    for (i = 0; i < SRE_MARK_SIZE; i++) +        state->mark[i] = NULL; + +    state->lastindex = -1; + +    state->repeat = NULL; + +    mark_fini(state); +} +  LOCAL(PyObject*)  state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,             int start, int end) @@ -1198,44 +1042,53 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,      /* prepare state object */      PyBufferProcs *buffer; -    int i, count; +    int size, bytes;      void* ptr;      /* get pointer to string buffer */      buffer = string->ob_type->tp_as_buffer;      if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||          buffer->bf_getsegcount(string, NULL) != 1) { -        PyErr_SetString(PyExc_TypeError, "expected read-only buffer"); +        PyErr_SetString(PyExc_TypeError, "expected string or buffer");          return NULL;      }      /* determine buffer size */ -    count = buffer->bf_getreadbuffer(string, 0, &ptr); -    if (count < 0) { -        /* sanity check */ +    bytes = buffer->bf_getreadbuffer(string, 0, &ptr); +    if (bytes < 0) {          PyErr_SetString(PyExc_TypeError, "buffer has negative size");          return NULL;      }      /* determine character size */ -#if defined(HAVE_UNICODE) -    state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1); + +#if PY_VERSION_HEX >= 0x01060000 +    size = PyObject_Size(string);  #else -    state->charsize = 1; +    size = PyObject_Length(string);  #endif -    count /= state->charsize; +    if (PyString_Check(string) || bytes == size) +        state->charsize = 1; +#if defined(HAVE_UNICODE) +    else if (bytes == (int) (size * sizeof(Py_UNICODE))) +        state->charsize = sizeof(Py_UNICODE); +#endif +    else { +        PyErr_SetString(PyExc_TypeError, "buffer size mismatch"); +        return NULL; +    }      /* adjust boundaries */      if (start < 0)          start = 0; -    else if (start > count) -        start = count; +    else if (start > size) +        start = size;      if (end < 0)          end = 0; -    else if (end > count) -        end = count; +    else if (end > size) +        end = size;      state->beginning = ptr; @@ -1247,18 +1100,6 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,      state->pos = start;      state->endpos = end; -    state->lastmark = 0; - -    /* FIXME: dynamic! */ -    for (i = 0; i < SRE_MARK_SIZE; i++) -        state->mark[i] = NULL; - -    state->lastindex = -1; - -    state->stack = NULL; -    state->stackbase = 0; -    state->stacksize = 0; -      if (pattern->flags & SRE_FLAG_LOCALE)          state->lower = sre_lower_locale;  #if defined(HAVE_UNICODE) @@ -1268,6 +1109,10 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,      else          state->lower = sre_lower; +    state->mark_stack = NULL; + +    state_reset(state); +      return string;  } @@ -1275,7 +1120,7 @@ LOCAL(void)  state_fini(SRE_STATE* state)  {      Py_XDECREF(state->string); -    stack_free(state); +    mark_fini(state);  }  LOCAL(PyObject*) @@ -2091,7 +1936,8 @@ scanner_match(ScannerObject* self, PyObject* args)      PyObject* match;      int status; -    state->lastindex = -1; +    state_reset(state); +      state->ptr = state->start;      if (state->charsize == 1) { @@ -2121,7 +1967,8 @@ scanner_search(ScannerObject* self, PyObject* args)      PyObject* match;      int status; -    state->lastindex = -1; +    state_reset(state); +      state->ptr = state->start;      if (state->charsize == 1) { diff --git a/Modules/sre.h b/Modules/sre.h index a62b917..bf58eb5 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -1,5 +1,4 @@  /* - *   * Secret Labs' Regular Expression Engine   *   * regular expression matching engine @@ -44,18 +43,15 @@ typedef struct {  typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch); -typedef struct { -    /* stack elements */ -    SRE_CODE* pattern; -    void* ptr; -    int mark; -    void* mark0; -    void* mark1; -} SRE_STACK; -  /* FIXME: <fl> shouldn't be a constant, really... */  #define SRE_MARK_SIZE 200 +typedef struct SRE_REPEAT_T { +    int count; +    SRE_CODE* pattern; /* points to REPEAT operator arguments */ +    struct SRE_REPEAT_T *prev; /* points to previous repeat context */ +} SRE_REPEAT; +  typedef struct {      /* string pointers */      void* ptr; /* current position (also end of current slice) */ @@ -71,16 +67,16 @@ typedef struct {      int lastindex;      int lastmark;      void* mark[SRE_MARK_SIZE]; -    /* backtracking stack */ -    SRE_STACK* stack; -    int stacksize; -    int stackbase; +    /* dynamically allocated stuff */ +    void** mark_stack; +    int mark_stack_size; +    int mark_stack_base; +    SRE_REPEAT *repeat; /* current repeat context */      /* hooks */      SRE_TOLOWER_HOOK lower;  } SRE_STATE;  typedef struct { -    /* scanner (internal helper object) */      PyObject_HEAD      PyObject* pattern;      SRE_STATE state; diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index bffcdde..5cfe495 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -21,24 +21,24 @@  #define SRE_OP_CALL 7  #define SRE_OP_CATEGORY 8  #define SRE_OP_CHARSET 9 -#define SRE_OP_GROUP 10 -#define SRE_OP_GROUP_IGNORE 11 -#define SRE_OP_INDEX 12 -#define SRE_OP_IN 13 -#define SRE_OP_IN_IGNORE 14 -#define SRE_OP_INFO 15 -#define SRE_OP_JUMP 16 -#define SRE_OP_LITERAL 17 -#define SRE_OP_LITERAL_IGNORE 18 -#define SRE_OP_MARK 19 -#define SRE_OP_MAX_REPEAT 20 -#define SRE_OP_MAX_REPEAT_ONE 21 -#define SRE_OP_MIN_REPEAT 22 -#define SRE_OP_NOT_LITERAL 23 -#define SRE_OP_NOT_LITERAL_IGNORE 24 -#define SRE_OP_NEGATE 25 -#define SRE_OP_RANGE 26 -#define SRE_OP_REPEAT 27 +#define SRE_OP_GROUPREF 10 +#define SRE_OP_GROUPREF_IGNORE 11 +#define SRE_OP_IN 12 +#define SRE_OP_IN_IGNORE 13 +#define SRE_OP_INFO 14 +#define SRE_OP_JUMP 15 +#define SRE_OP_LITERAL 16 +#define SRE_OP_LITERAL_IGNORE 17 +#define SRE_OP_MARK 18 +#define SRE_OP_MAX_UNTIL 19 +#define SRE_OP_MIN_UNTIL 20 +#define SRE_OP_NOT_LITERAL 21 +#define SRE_OP_NOT_LITERAL_IGNORE 22 +#define SRE_OP_NEGATE 23 +#define SRE_OP_RANGE 24 +#define SRE_OP_REPEAT 25 +#define SRE_OP_REPEAT_ONE 26 +#define SRE_OP_SUBPATTERN 27  #define SRE_AT_BEGINNING 0  #define SRE_AT_BEGINNING_LINE 1  #define SRE_AT_BOUNDARY 2  | 
