diff options
author | Guido van Rossum <guido@python.org> | 1998-04-03 21:13:31 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 1998-04-03 21:13:31 (GMT) |
commit | 042ff9eb3afb3e0853bf650f5ab3742f592aa0aa (patch) | |
tree | 1865c64cc228d85433d3ee4ca910f7c05482201d | |
parent | 104be4a4a797bfc6f8982c81839736083fa54fac (diff) | |
download | cpython-042ff9eb3afb3e0853bf650f5ab3742f592aa0aa.zip cpython-042ff9eb3afb3e0853bf650f5ab3742f592aa0aa.tar.gz cpython-042ff9eb3afb3e0853bf650f5ab3742f592aa0aa.tar.bz2 |
AMK's latest
-rw-r--r-- | Modules/pcre-int.h | 5 | ||||
-rw-r--r-- | Modules/pcre.h | 12 | ||||
-rw-r--r-- | Modules/pcremodule.c | 75 | ||||
-rw-r--r-- | Modules/pypcre.c | 166 |
4 files changed, 168 insertions, 90 deletions
diff --git a/Modules/pcre-int.h b/Modules/pcre-int.h index 2c34dfe..07aeb84 100644 --- a/Modules/pcre-int.h +++ b/Modules/pcre-int.h @@ -3,7 +3,7 @@ *************************************************/ -#define PCRE_VERSION "1.04 22-Dec-1997" +#define PCRE_VERSION "1.07 16-Feb-1998" /* This is a library of functions to support regular expressions whose syntax @@ -12,7 +12,7 @@ the file Tech.Notes for some information on the internals. Written by: Philip Hazel <ph10@cam.ac.uk> - Copyright (c) 1997 University of Cambridge + Copyright (c) 1998 University of Cambridge ----------------------------------------------------------------------------- Permission is granted to anyone to use this software for any purpose on any @@ -192,6 +192,7 @@ enum { OP_CRMINRANGE, OP_CLASS, /* Match a character class */ + OP_NEGCLASS, /* Match a character class, specified negatively */ OP_CLASS_L, /* Match a character class */ OP_REF, /* Match a back reference */ diff --git a/Modules/pcre.h b/Modules/pcre.h index 4a01bd2..06768a9 100644 --- a/Modules/pcre.h +++ b/Modules/pcre.h @@ -2,7 +2,7 @@ * Perl-Compatible Regular Expressions * *************************************************/ -/* Copyright (c) 1997 University of Cambridge */ +/* Copyright (c) 1998 University of Cambridge */ #ifndef _PCRE_H #define _PCRE_H @@ -17,6 +17,12 @@ it is needed here for malloc. */ #include <sys/types.h> #include <stdlib.h> +/* Allow for C++ users */ + +#ifdef __cplusplus +extern "C" { +#endif + /* Options */ #define PCRE_CASELESS 0x0001 @@ -68,4 +74,8 @@ extern int pcre_info(const pcre *, int *, int *); extern pcre_extra *pcre_study(const pcre *, int, const char **); extern const char *pcre_version(void); +#ifdef __cplusplus +} /* extern "C" */ +#endif + #endif /* End of pcre.h */ diff --git a/Modules/pcremodule.c b/Modules/pcremodule.c index c4890df..07a36fa 100644 --- a/Modules/pcremodule.c +++ b/Modules/pcremodule.c @@ -72,7 +72,7 @@ staticforward PyTypeObject Pcre_Type; #define NOT_WORD_BOUNDARY 6 #define BEGINNING_OF_BUFFER 7 #define END_OF_BUFFER 8 - +#define STRING 9 static PcreObject * newPcreObject(arg) @@ -191,49 +191,20 @@ PyPcre_compile(self, args) { PcreObject *rv; PyObject *dictionary; - char *pattern, *newpattern; + char *pattern; const char *error; int num_zeros, i, j; - int patternlen, options, erroroffset; - if (!PyArg_ParseTuple(args, "s#iO!", &pattern, &patternlen, &options, + int options, erroroffset; + if (!PyArg_ParseTuple(args, "siO!", &pattern, &options, &PyDict_Type, &dictionary)) return NULL; rv = newPcreObject(args); if ( rv == NULL ) return NULL; - /* PCRE doesn't like having null bytes in its pattern, so we have to replace - any zeros in the string with the characters '\000'. This increases the size - of the string by 3*num_zeros, plus 1 byte for the terminating \0. */ - num_zeros=1; /* Start at 1; this will give 3 extra bytes of leeway */ - for(i=0; i<patternlen; i++) { - if (pattern[i]==0) num_zeros++; - } - newpattern=malloc(patternlen + num_zeros*3 + 4); - if (newpattern==NULL) { - PyErr_SetString(PyExc_MemoryError, "can't allocate memory for new pattern"); - return NULL; - } - for (i=j=0; i<patternlen; i++, j++) - { - if (pattern[i]!=0) newpattern[j]=pattern[i]; - else { - newpattern[j++] ='\\'; - newpattern[j++] = '0'; - newpattern[j++] = '0'; - newpattern[j ] = '0'; - } - } - /* Keep purify happy; for pcre, one null byte is enough! */ - newpattern[j++]='\0'; - newpattern[j++]='\0'; - newpattern[j++]='\0'; - newpattern[j]='\0'; - - rv->regex = pcre_compile((char*)newpattern, options, + rv->regex = pcre_compile((char*)pattern, options, &error, &erroroffset, dictionary); - free(newpattern); if (rv->regex==NULL) { PyMem_DEL(rv); @@ -312,6 +283,10 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr) *indexptr=index; return Py_BuildValue("c", (char)8); break; + case('\\'): + *indexptr=index; + return Py_BuildValue("c", '\\'); + break; case('x'): { @@ -348,6 +323,8 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr) case('g'): { int end, i; + int group_num = 0, is_number=0; + if (pattern_len<=index) { PyErr_SetString(ErrorObject, "unfinished symbolic reference"); @@ -374,16 +351,22 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr) PyErr_SetString(ErrorObject, "zero-length symbolic reference"); return NULL; } - if (!(pcre_ctypes[pattern[index]] & ctype_word) /* First char. not alphanumeric */ - || (pcre_ctypes[pattern[index]] & ctype_digit) ) /* First char. a digit */ + if ((pcre_ctypes[pattern[index]] & ctype_digit)) /* First char. a digit */ { - /* XXX should include the text of the reference */ - PyErr_SetString(ErrorObject, "first character of symbolic reference not a letter or _"); - return NULL; + is_number = 1; + group_num = pattern[index] - '0'; } for(i=index+1; i<end; i++) { + if (is_number && + !(pcre_ctypes[pattern[i]] & ctype_digit) ) + { + /* XXX should include the text of the reference */ + PyErr_SetString(ErrorObject, "illegal non-digit character in \\g<...> starting with digit"); + return NULL; + } + else {group_num = group_num * 10 + pattern[i] - '0';} if (!(pcre_ctypes[pattern[i]] & ctype_word) ) { /* XXX should include the text of the reference */ @@ -394,6 +377,9 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr) *typeptr = MEMORY_REFERENCE; *indexptr = end+1; + /* If it's a number, return the integer value of the group */ + if (is_number) return Py_BuildValue("i", group_num); + /* Otherwise, return a string containing the group name */ return Py_BuildValue("s#", pattern+index, end-index); } break; @@ -478,8 +464,11 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr) break; default: + /* It's some unknown escape like \s, so return a string containing + \s */ + *typeptr = STRING; *indexptr = index; - return Py_BuildValue("c", c); + return Py_BuildValue("s#", pattern+index-2, 2); break; } } @@ -571,6 +560,12 @@ PyPcre_expand(self, args) Py_DECREF(result); } break; + case(STRING): + { + PyList_Append(results, value); + total_len += PyString_Size(value); + break; + } default: Py_DECREF(results); PyErr_SetString(ErrorObject, diff --git a/Modules/pypcre.c b/Modules/pypcre.c index 796f3b4..bec9197 100644 --- a/Modules/pypcre.c +++ b/Modules/pypcre.c @@ -211,7 +211,7 @@ the file Tech.Notes for some information on the internals. Written by: Philip Hazel <ph10@cam.ac.uk> - Copyright (c) 1997 University of Cambridge + Copyright (c) 1998 University of Cambridge ----------------------------------------------------------------------------- Permission is granted to anyone to use this software for any purpose on any @@ -409,6 +409,7 @@ do according to the repeat count. */ case OP_CLASS: + case OP_NEGCLASS: { tcode++; for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; @@ -547,7 +548,7 @@ the file Tech.Notes for some information on the internals. Written by: Philip Hazel <ph10@cam.ac.uk> - Copyright (c) 1997 University of Cambridge + Copyright (c) 1998 University of Cambridge ----------------------------------------------------------------------------- Permission is granted to anyone to use this software for any purpose on any @@ -586,18 +587,26 @@ the external pcre header. */ + #ifndef Py_eval_input /* For Python 1.4, graminit.h has to be explicitly included */ #define Py_eval_input eval_input #endif /* FOR_PYTHON */ +/* Allow compilation as C++ source code, should anybody want to do that. */ + +#ifdef __cplusplus +#define class pcre_class +#endif + + /* Min and max values for the common repeats; for the maxima, 0 => infinity */ -static char rep_min[] = { 0, 0, 1, 1, 0, 0 }; -static char rep_max[] = { 0, 0, 0, 0, 1, 1 }; +static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; +static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; -/* Text forms of OP_ values and things, for debugging */ +/* Text forms of OP_ values and things, for debugging (not all used) */ #ifdef DEBUG static const char *OP_names[] = { @@ -610,7 +619,7 @@ static const char *OP_names[] = { "*", "*?", "+", "+?", "?", "??", "{", "{", "{", "*", "*?", "+", "+?", "?", "??", "{", "{", "{", "*", "*?", "+", "+?", "?", "??", "{", "{", - "class", "classL", "Ref", + "class", "negclass", "classL", "Ref", "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once", "Brazero", "Braminzero", "Bra" }; @@ -621,7 +630,7 @@ are simple data values; negative values are for special things like \d and so on. Zero means further processing is needed (for things like \x), or the escape is invalid. */ -static short int escapes[] = { +static const short int escapes[] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */ @@ -636,8 +645,9 @@ static short int escapes[] = { /* Definition to allow mutual recursion */ -static BOOL compile_regex(int, int *, uschar **, const uschar **, - const char **, PyObject *); +static BOOL +compile_regex(int, int *, uschar **, const uschar **, const char **, + PyObject *); /* Structure for passing "static" information around between the functions doing the matching, so that they are thread-safe. */ @@ -866,12 +876,13 @@ do { /* Check a class or a back reference for a zero minimum */ case OP_CLASS: + case OP_NEGCLASS: case OP_REF: case OP_CLASS_L: switch(*cc) { case (OP_REF): cc += 2; break; - case (OP_CLASS): cc += 1+32; break; + case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break; case (OP_CLASS_L): cc += 1+1+32; break; } @@ -1017,15 +1028,17 @@ else { /* PYTHON: Try to compute an octal value for a character */ - for(c=0, i=0; c!=-1 && ptr[i]!=0 && i<3; i++) + for(c=0, i=0; ptr[i]!=0 && i<3; i++) { if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0) c = c * 8 + ptr[i]-'0'; else - c = -1; /* Non-octal character */ + break; /* Non-octal character--break out of the loop */ } - /* Aha! There were 3 octal digits, so it must be a character */ - if (c != -1 && i == 3) + /* It's a character if there were exactly 3 octal digits, or if + we're inside a character class and there was at least one + octal digit. */ + if ( (i == 3) || (isclass && i!=0) ) { ptr += i-1; break; @@ -1278,11 +1291,14 @@ for (;; ptr++) class_flag = NULL; } - /* If the first character is '^', set the negation flag */ + /* If the first character is '^', set the negation flag, and use a + different opcode. This only matters if caseless matching is specified at + runtime. */ if ((c = *(++ptr)) == '^') { negate_class = TRUE; + if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS; c = *(++ptr); } else negate_class = FALSE; @@ -1648,7 +1664,8 @@ for (;; ptr++) /* If previous was a character class or a back reference, we put the repeat stuff after it. */ - else if (*previous == OP_CLASS || *previous==OP_CLASS_L || *previous == OP_REF) + else if (*previous == OP_CLASS || *previous == OP_NEGCLASS || + *previous==OP_CLASS_L || *previous == OP_REF) { if (repeat_min == 0 && repeat_max == -1) *code++ = OP_CRSTAR + repeat_type; @@ -2003,7 +2020,7 @@ for (;; ptr++) the next state. */ previous[1] = length; - ptr--; + if (length < 255) ptr--; break; } } /* end of big loop */ @@ -2832,6 +2849,7 @@ while (code < code_end) goto CLASS_REF_REPEAT; case OP_CLASS: + case OP_NEGCLASS: case OP_CLASS_L: { int i, min, max; @@ -2840,11 +2858,14 @@ while (code < code_end) { code++; printf("Locflag = %i ", *code++); + printf(" ["); } else - code++; - - printf(" ["); + { + if (*code++ == OP_CLASS) printf(" ["); + else printf(" ^["); + } + for (i = 0; i < 256; i++) { @@ -3601,10 +3622,14 @@ for (;;) item to see if there is repeat information following. Then obey similar code to character type repeats - written out again for speed. If caseless matching was set at runtime but not at compile time, we have to check both - versions of a character. */ + versions of a character, and we have to behave differently for positive and + negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are + treated differently. */ case OP_CLASS: + case OP_NEGCLASS: { + BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless; const uschar *data = ecode + 1; /* Save for matching */ ecode += 33; /* Advance past the item */ @@ -3633,15 +3658,8 @@ for (;;) break; default: /* No repeat follows */ - if (eptr >= md->end_subject) FAIL; - c = *eptr++; - if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */ - if (md->runtime_caseless) - { - c = pcre_fcc[c]; - if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */ - } - FAIL; + min = max = 1; + break; } /* First, ensure the minimum number of matches are present. */ @@ -3650,13 +3668,31 @@ for (;;) { if (eptr >= md->end_subject) FAIL; c = *eptr++; - if ((data[c/8] & (1 << (c&7))) != 0) continue; - if (md->runtime_caseless) + + /* Either not runtime caseless, or it was a positive class. For + runtime caseless, continue if either case is in the map. */ + + if (!nasty_case) { - c = pcre_fcc[c]; if ((data[c/8] & (1 << (c&7))) != 0) continue; + if (md->runtime_caseless) + { + c = pcre_fcc[c]; + if ((data[c/8] & (1 << (c&7))) != 0) continue; + } } - FAIL; + + /* Runtime caseless and it was a negative class. Continue only if + both cases are in the map. */ + + else + { + if ((data[c/8] & (1 << (c&7))) == 0) FAIL; + c = pcre_fcc[c]; + if ((data[c/8] & (1 << (c&7))) != 0) continue; + } + + FAIL; } /* If max == min we can continue with the main loop without the @@ -3674,12 +3710,30 @@ for (;;) if (match(eptr, ecode, offset_top, md)) SUCCEED; if (i >= max || eptr >= md->end_subject) FAIL; c = *eptr++; - if ((data[c/8] & (1 << (c&7))) != 0) continue; - if (md->runtime_caseless) + + /* Either not runtime caseless, or it was a positive class. For + runtime caseless, continue if either case is in the map. */ + + if (!nasty_case) { - c = pcre_fcc[c]; if ((data[c/8] & (1 << (c&7))) != 0) continue; + if (md->runtime_caseless) + { + c = pcre_fcc[c]; + if ((data[c/8] & (1 << (c&7))) != 0) continue; + } } + + /* Runtime caseless and it was a negative class. Continue only if + both cases are in the map. */ + + else + { + if ((data[c/8] & (1 << (c&7))) == 0) return FALSE; + c = pcre_fcc[c]; + if ((data[c/8] & (1 << (c&7))) != 0) continue; + } + FAIL; } /* Control never gets here */ @@ -3694,12 +3748,30 @@ for (;;) { if (eptr >= md->end_subject) break; c = *eptr; - if ((data[c/8] & (1 << (c&7))) != 0) continue; - if (md->runtime_caseless) + + /* Either not runtime caseless, or it was a positive class. For + runtime caseless, continue if either case is in the map. */ + + if (!nasty_case) { + if ((data[c/8] & (1 << (c&7))) != 0) continue; + if (md->runtime_caseless) + { + c = pcre_fcc[c]; + if ((data[c/8] & (1 << (c&7))) != 0) continue; + } + } + + /* Runtime caseless and it was a negative class. Continue only if + both cases are in the map. */ + + else + { + if ((data[c/8] & (1 << (c&7))) == 0) break; c = pcre_fcc[c]; if ((data[c/8] & (1 << (c&7))) != 0) continue; } + break; } @@ -4430,17 +4502,17 @@ pcre_exec(const pcre *external_re, const pcre_extra *external_extra, /* The "volatile" directives are to make gcc -Wall stop complaining that these variables can be clobbered by the longjmp. Hopefully they won't cost too much performance. */ -int resetcount, ocount; -int first_char = -1; +volatile int resetcount, ocount; +volatile int first_char = -1; match_data match_block; const uschar *start_bits = NULL; const uschar *start_match = (const uschar *)subject + start_pos; const uschar *end_subject; const real_pcre *re = (const real_pcre *)external_re; const real_pcre_extra *extra = (const real_pcre_extra *)external_extra; -BOOL using_temporary_offsets = FALSE; -BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0; -BOOL startline = (re->options & PCRE_STARTLINE) != 0; +volatile BOOL using_temporary_offsets = FALSE; +volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0; +volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0; if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; @@ -4480,7 +4552,7 @@ ocount = offsetcount & (-2); if (re->top_backref > 0 && re->top_backref >= ocount/2) { ocount = re->top_backref * 2 + 2; - match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int)); + match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int)); if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY; using_temporary_offsets = TRUE; DPRINTF(("Got memory to hold back references\n")); @@ -4639,10 +4711,10 @@ do free_stack(&match_block); return rc; } /* End of (if setjmp(match_block.error_env)...) */ + free_stack(&match_block); + /* Return an error code; pcremodule.c will preserve the exception */ if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY; - - free_stack(&match_block); } while (!anchored && match_block.errorcode == PCRE_ERROR_NOMATCH && |