From 58132c6799fe89ed5f08da4ff5d0f5631e075b8c Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 17 Dec 1997 00:24:13 +0000 Subject: AMK's latest; plus three null bytes that I added for purify --- Modules/pcre-int.h | 293 +++++++++++++++++++++++++++++++++++++++++++++++ Modules/pcre-internal.h | 294 ------------------------------------------------ Modules/pcre.h | 8 +- Modules/pcremodule.c | 42 ++++--- Modules/pypcre.c | 267 ++++++++++++++++++++----------------------- 5 files changed, 447 insertions(+), 457 deletions(-) create mode 100644 Modules/pcre-int.h delete mode 100644 Modules/pcre-internal.h diff --git a/Modules/pcre-int.h b/Modules/pcre-int.h new file mode 100644 index 0000000..2e3e1af --- /dev/null +++ b/Modules/pcre-int.h @@ -0,0 +1,293 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + + +#define PCRE_VERSION "1.02 12-Dec-1997" + + +/* This is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. See +the file Tech.Notes for some information on the internals. + +Written by: Philip Hazel + + Copyright (c) 1997 University of Cambridge + +----------------------------------------------------------------------------- +Permission is granted to anyone to use this software for any purpose on any +computer system, and to redistribute it freely, subject to the following +restrictions: + +1. This software is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +2. The origin of this software must not be misrepresented, either by + explicit claim or by omission. + +3. Altered versions must be plainly marked as such, and must not be + misrepresented as being the original software. +----------------------------------------------------------------------------- +*/ + +/* This header contains definitions that are shared between the different +modules, but which are not relevant to the outside. */ + + +/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(), +define a macro for memmove() if USE_BCOPY is defined. */ + +#ifdef USE_BCOPY +#define memmove(a, b, c) bcopy(b, a, c) +#endif + +/* Standard C headers plus the external interface definition */ + +#include +#include +#include +#include +#include +#include +#include +#include "pcre.h" + +/* Private options flags start at the most significant end of the two bytes. +The public options defined in pcre.h start at the least significant end. Make +sure they don't overlap! */ + +#define PCRE_FIRSTSET 0x8000 /* first_char is set */ +#define PCRE_STARTLINE 0x4000 /* start after \n for multiline */ +#define PCRE_COMPILED_CASELESS 0x2000 /* like it says */ + +/* Options for the "extra" block produced by pcre_study(). */ + +#define PCRE_STUDY_CASELESS 0x01 /* study was caseless */ +#define PCRE_STUDY_MAPPED 0x02 /* a map of starting chars exists */ + +/* Masks for identifying the public options: all permitted at compile time, +only some permitted at run or study time. */ + +#ifdef FOR_PYTHON +#define PUBLIC_OPTIONS \ + (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ + PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_LOCALE) +#else +#define PUBLIC_OPTIONS \ + (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ + PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA) +#endif +#define PUBLIC_EXEC_OPTIONS \ + (PCRE_CASELESS|PCRE_ANCHORED|PCRE_MULTILINE|PCRE_NOTBOL|PCRE_NOTEOL| \ + PCRE_DOTALL|PCRE_DOLLAR_ENDONLY) + +#define PUBLIC_STUDY_OPTIONS (PCRE_CASELESS) + +/* Magic number to provide a small check against being handed junk. */ + +#define MAGIC_NUMBER 0x50435245 /* 'PCRE' */ + +/* Miscellaneous definitions */ + +typedef int BOOL; + +#define FALSE 0 +#define TRUE 1 + +/* These are escaped items that aren't just an encoding of a particular data +value such as \n. They must have non-zero values, as check_escape() returns +their negation. Also, they must appear in the same order as in the opcode +definitions below, up to ESC_Z. The final one must be ESC_REF as subsequent +values are used for \1, \2, \3, etc. There is a test in the code for an escape +greater than ESC_b and less than ESC_X to detect the types that may be +repeated. If any new escapes are put in-between that don't consume a character, +that code will have to change. */ + +enum { ESC_A = 1, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w, + + /* These are not Perl escapes, so can't appear in the */ + ESC_X, /* simple table-lookup because they must be conditional */ + /* on PCRE_EXTRA. */ + ESC_Z, + ESC_REF }; + +/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets +that extract substrings. Starting from 1 (i.e. after OP_END), the values up to +OP_EOD must correspond in order to the list of escapes immediately above. */ + +enum { + OP_END, /* End of pattern */ + + /* Values corresponding to backslashed metacharacters */ + + OP_SOD, /* Start of data: \A */ + OP_NOT_WORD_BOUNDARY, /* \B */ + OP_WORD_BOUNDARY, /* \b */ + OP_NOT_DIGIT, /* \D */ + OP_DIGIT, /* \d */ + OP_NOT_WHITESPACE, /* \S */ + OP_WHITESPACE, /* \s */ + OP_NOT_WORDCHAR, /* \W */ + OP_WORDCHAR, /* \w */ + OP_CUT, /* The analogue of Prolog's "cut" operation (extension) */ + OP_EOD, /* End of data: \Z. */ + + OP_NOT_WORD_BOUNDARY_L, /* localized \B */ + OP_WORD_BOUNDARY_L, /* localized \b */ + OP_NOT_WORDCHAR_L, /* localized \W */ + OP_WORDCHAR_L, /* localized \w */ + + OP_CIRC, /* Start of line - varies with multiline switch */ + OP_DOLL, /* End of line - varies with multiline switch */ + OP_ANY, /* Match any character */ + OP_CHARS, /* Match string of characters */ + OP_NOT, /* Match anything but the following char */ + + OP_STAR, /* The maximizing and minimizing versions of */ + OP_MINSTAR, /* all these opcodes must come in pairs, with */ + OP_PLUS, /* the minimizing one second. */ + OP_MINPLUS, /* This first set applies to single characters */ + OP_QUERY, + OP_MINQUERY, + OP_UPTO, /* From 0 to n matches */ + OP_MINUPTO, + OP_EXACT, /* Exactly n matches */ + + OP_NOTSTAR, /* The maximizing and minimizing versions of */ + OP_NOTMINSTAR, /* all these opcodes must come in pairs, with */ + OP_NOTPLUS, /* the minimizing one second. */ + OP_NOTMINPLUS, /* This first set applies to "not" single characters */ + OP_NOTQUERY, + OP_NOTMINQUERY, + OP_NOTUPTO, /* From 0 to n matches */ + OP_NOTMINUPTO, + OP_NOTEXACT, /* Exactly n matches */ + + OP_TYPESTAR, /* The maximizing and minimizing versions of */ + OP_TYPEMINSTAR, /* all these opcodes must come in pairs, with */ + OP_TYPEPLUS, /* the minimizing one second. These codes must */ + OP_TYPEMINPLUS, /* be in exactly the same order as those above. */ + OP_TYPEQUERY, /* This set applies to character types such as \d */ + OP_TYPEMINQUERY, + OP_TYPEUPTO, /* From 0 to n matches */ + OP_TYPEMINUPTO, + OP_TYPEEXACT, /* Exactly n matches */ + + OP_CRSTAR, /* The maximizing and minimizing versions of */ + OP_CRMINSTAR, /* all these opcodes must come in pairs, with */ + OP_CRPLUS, /* the minimizing one second. These codes must */ + OP_CRMINPLUS, /* be in exactly the same order as those above. */ + OP_CRQUERY, /* These are for character classes and back refs */ + OP_CRMINQUERY, + OP_CRRANGE, /* These are different to the three seta above. */ + OP_CRMINRANGE, + + OP_CLASS, /* Match a character class */ + OP_CLASS_L, /* Match a character class */ + OP_REF, /* Match a back reference */ + + OP_ALT, /* Start of alternation */ + OP_KET, /* End of group that doesn't have an unbounded repeat */ + OP_KETRMAX, /* These two must remain together and in this */ + OP_KETRMIN, /* order. They are for groups the repeat for ever. */ + + OP_ASSERT, + OP_ASSERT_NOT, + OP_ONCE, /* Once matched, don't back up into the subpattern */ + + OP_BRAZERO, /* These two must remain together and in this */ + OP_BRAMINZERO, /* order. */ + + OP_BRA /* This and greater values are used for brackets that + extract substrings. */ +}; + +/* The highest extraction number. This is limited by the number of opcodes +left after OP_BRA, i.e. 255 - OP_BRA. We actually set it somewhat lower. */ + +#define EXTRACT_MAX 99 + +/* The texts of compile-time error messages are defined as macros here so that +they can be accessed by the POSIX wrapper and converted into error codes. Yes, +I could have used error codes in the first place, but didn't feel like changing +just to accommodate the POSIX wrapper. */ + +#define ERR1 "\\ at end of pattern" +#define ERR2 "\\c at end of pattern" +#define ERR3 "unrecognized character follows \\" +#define ERR4 "numbers out of order in {} quantifier" +#define ERR5 "number too big in {} quantifier" +#define ERR6 "missing terminating ] for character class" +#define ERR7 "invalid escape sequence in character class" +#define ERR8 "range out of order in character class" +#define ERR9 "nothing to repeat" +#define ERR10 "operand of unlimited repeat could match the empty string" +#define ERR11 "internal error: unexpected repeat" +#define ERR12 "unrecognized character after (?" +#define ERR13 "too many capturing parenthesized sub-patterns" +#define ERR14 "missing )" +#define ERR15 "back reference to non-existent subpattern" +#define ERR16 "erroffset passed as NULL" +#define ERR17 "unknown option bit(s) set" +#define ERR18 "missing ) after comment" +#define ERR19 "too many sets of parentheses" +#define ERR20 "regular expression too large" +#define ERR21 "failed to get memory" +#define ERR22 "unmatched brackets" +#define ERR23 "internal error: code overflow" + +/* All character handling must be done as unsigned characters. Otherwise there +are problems with top-bit-set characters and functions such as isspace(). +However, we leave the interface to the outside world as char *, because that +should make things easier for callers. We define a short type for unsigned char +to save lots of typing. I tried "uchar", but it causes problems on Digital +Unix, where it is defined in sys/types, so use "uschar" instead. */ + +typedef unsigned char uschar; + +/* The real format of the start of the pcre block; the actual code vector +runs on as long as necessary after the end. */ + +typedef struct real_pcre { + unsigned int magic_number; + unsigned short int options; + unsigned char top_bracket; + unsigned char top_backref; + unsigned char first_char; + unsigned char code[1]; +} real_pcre; + +/* The real format of the extra block returned by pcre_study(). */ + +typedef struct real_pcre_extra { + unsigned char options; + unsigned char start_bits[32]; +} real_pcre_extra; + +/* Global tables from chartables.c */ + +extern uschar pcre_lcc[]; +extern uschar pcre_fcc[]; +extern uschar pcre_cbits[]; +extern uschar pcre_ctypes[]; + +/* Bit definitions for entries in pcre_ctypes[]. */ + +#define ctype_space 0x01 +#define ctype_letter 0x02 +#define ctype_digit 0x04 +#define ctype_xdigit 0x08 +#define ctype_word 0x10 /* alphameric or '_' */ +#define ctype_odigit 0x20 /* octal digit */ +#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ + +/* Offsets for the bitmap tables */ + +#define cbit_digit 0 +#define cbit_letter 32 +#define cbit_word 64 +#define cbit_space 96 +#define cbit_length 128 /* Length of the cbits table */ + +/* End of internal.h */ diff --git a/Modules/pcre-internal.h b/Modules/pcre-internal.h deleted file mode 100644 index 735c02d..0000000 --- a/Modules/pcre-internal.h +++ /dev/null @@ -1,294 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - - -#define PCRE_VERSION "1.01 19-Nov-1997" - - -/* This is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. See -the file Tech.Notes for some information on the internals. - -Written by: Philip Hazel - - Copyright (c) 1997 University of Cambridge - ------------------------------------------------------------------------------ -Permission is granted to anyone to use this software for any purpose on any -computer system, and to redistribute it freely, subject to the following -restrictions: - -1. This software is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - -2. The origin of this software must not be misrepresented, either by - explicit claim or by omission. - -3. Altered versions must be plainly marked as such, and must not be - misrepresented as being the original software. ------------------------------------------------------------------------------ -*/ - -/* This header contains definitions that are shared between the different -modules, but which are not relevant to the outside. */ - - -/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(), -define a macro for memmove() if USE_BCOPY is defined. */ - -#ifdef USE_BCOPY -#define memmove(a, b, c) bcopy(b, a, c) -#endif - -/* Standard C headers plus the external interface definition */ - -#include -#include -#include -#include -#include -#include -#include -#include "pcre.h" - -/* Private options flags start at the most significant end of the two bytes. -The public options defined in pcre.h start at the least significant end. Make -sure they don't overlap! */ - -#define PCRE_FIRSTSET 0x8000 /* first_char is set */ -#define PCRE_STARTLINE 0x4000 /* start after \n for multiline */ -#define PCRE_COMPILED_CASELESS 0x2000 /* like it says */ - -/* Options for the "extra" block produced by pcre_study(). */ - -#define PCRE_STUDY_CASELESS 0x01 /* study was caseless */ -#define PCRE_STUDY_MAPPED 0x02 /* a map of starting chars exists */ - -/* Masks for identifying the public options: all permitted at compile time, -only some permitted at run or study time. */ - -#ifdef FOR_PYTHON -#define PUBLIC_OPTIONS \ - (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ - PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_LOCALE) -#else -#define PUBLIC_OPTIONS \ - (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ - PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA) -#endif -#define PUBLIC_EXEC_OPTIONS \ - (PCRE_CASELESS|PCRE_ANCHORED|PCRE_MULTILINE|PCRE_NOTBOL|PCRE_NOTEOL| \ - PCRE_DOTALL|PCRE_DOLLAR_ENDONLY) - -#define PUBLIC_STUDY_OPTIONS (PCRE_CASELESS) - -/* Magic number to provide a small check against being handed junk. */ - -#define MAGIC_NUMBER 0x50435245 /* 'PCRE' */ - -/* Miscellaneous definitions */ - -typedef int BOOL; - -#define FALSE 0 -#define TRUE 1 - -/* These are escaped items that aren't just an encoding of a particular data -value such as \n. They must have non-zero values, as check_escape() returns -their negation. Also, they must appear in the same order as in the opcode -definitions below, up to ESC_Z. The final one must be ESC_REF as subsequent -values are used for \1, \2, \3, etc. There is a test in the code for an escape -greater than ESC_b and less than ESC_X to detect the types that may be -repeated. If any new escapes are put in-between that don't consume a character, -that code will have to change. */ - -enum { ESC_A = 1, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w, - - /* These are not Perl escapes, so can't appear in the */ - ESC_X, /* simple table-lookup because they must be conditional */ - /* on PCRE_EXTRA. */ - ESC_Z, - ESC_REF }; - -/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets -that extract substrings. Starting from 1 (i.e. after OP_END), the values up to -OP_EOL must correspond in order to the list of escapes immediately above. */ - -enum { - OP_END, /* End of pattern */ - - /* Values corresponding to backslashed metacharacters */ - - OP_SOD, /* Start of data: \A */ - OP_NOT_WORD_BOUNDARY, /* \B */ - OP_WORD_BOUNDARY, /* \b */ - OP_NOT_DIGIT, /* \D */ - OP_DIGIT, /* \d */ - OP_NOT_WHITESPACE, /* \S */ - OP_WHITESPACE, /* \s */ - OP_NOT_WORDCHAR, /* \W */ - OP_WORDCHAR, /* \w */ - OP_CUT, /* The analogue of Prolog's "cut" operation (extension) */ - OP_EOD, /* End of data: or \Z. This must always be the last - of the backslashed meta values. */ - - OP_NOT_WORD_BOUNDARY_L, /* localized \B */ - OP_WORD_BOUNDARY_L, /* localized \b */ - OP_NOT_WORDCHAR_L, /* localized \W */ - OP_WORDCHAR_L, /* localized \w */ - - OP_CIRC, /* Start of line - varies with multiline switch */ - OP_DOLL, /* End of line - varies with multiline switch */ - OP_ANY, /* Match any character */ - OP_CHARS, /* Match string of characters */ - OP_NOT, /* Match anything but the following char */ - - OP_STAR, /* The maximizing and minimizing versions of */ - OP_MINSTAR, /* all these opcodes must come in pairs, with */ - OP_PLUS, /* the minimizing one second. */ - OP_MINPLUS, /* This first set applies to single characters */ - OP_QUERY, - OP_MINQUERY, - OP_UPTO, /* From 0 to n matches */ - OP_MINUPTO, - OP_EXACT, /* Exactly n matches */ - - OP_NOTSTAR, /* The maximizing and minimizing versions of */ - OP_NOTMINSTAR, /* all these opcodes must come in pairs, with */ - OP_NOTPLUS, /* the minimizing one second. */ - OP_NOTMINPLUS, /* This first set applies to "not" single characters */ - OP_NOTQUERY, - OP_NOTMINQUERY, - OP_NOTUPTO, /* From 0 to n matches */ - OP_NOTMINUPTO, - OP_NOTEXACT, /* Exactly n matches */ - - OP_TYPESTAR, /* The maximizing and minimizing versions of */ - OP_TYPEMINSTAR, /* all these opcodes must come in pairs, with */ - OP_TYPEPLUS, /* the minimizing one second. These codes must */ - OP_TYPEMINPLUS, /* be in exactly the same order as those above. */ - OP_TYPEQUERY, /* This set applies to character types such as \d */ - OP_TYPEMINQUERY, - OP_TYPEUPTO, /* From 0 to n matches */ - OP_TYPEMINUPTO, - OP_TYPEEXACT, /* Exactly n matches */ - - OP_CRSTAR, /* The maximizing and minimizing versions of */ - OP_CRMINSTAR, /* all these opcodes must come in pairs, with */ - OP_CRPLUS, /* the minimizing one second. These codes must */ - OP_CRMINPLUS, /* be in exactly the same order as those above. */ - OP_CRQUERY, /* These are for character classes and back refs */ - OP_CRMINQUERY, - OP_CRRANGE, /* These are different to the three seta above. */ - OP_CRMINRANGE, - - OP_CLASS, /* Match a character class */ - OP_CLASS_L, /* Match a character class */ - OP_REF, /* Match a back reference */ - - OP_ALT, /* Start of alternation */ - OP_KET, /* End of group that doesn't have an unbounded repeat */ - OP_KETRMAX, /* These two must remain together and in this */ - OP_KETRMIN, /* order. They are for groups the repeat for ever. */ - - OP_ASSERT, - OP_ASSERT_NOT, - OP_ONCE, /* Once matched, don't back up into the subpattern */ - - OP_BRAZERO, /* These two must remain together and in this */ - OP_BRAMINZERO, /* order. */ - - OP_BRA /* This and greater values are used for brackets that - extract substrings. */ -}; - -/* The highest extraction number. This is limited by the number of opcodes -left after OP_BRA, i.e. 255 - OP_BRA. We actually set it somewhat lower. */ - -#define EXTRACT_MAX 99 - -/* The texts of compile-time error messages are defined as macros here so that -they can be accessed by the POSIX wrapper and converted into error codes. Yes, -I could have used error codes in the first place, but didn't feel like changing -just to accommodate the POSIX wrapper. */ - -#define ERR1 "\\ at end of pattern" -#define ERR2 "\\c at end of pattern" -#define ERR3 "unrecognized character follows \\" -#define ERR4 "numbers out of order in {} quantifier" -#define ERR5 "number too big in {} quantifier" -#define ERR6 "missing terminating ] for character class" -#define ERR7 "invalid escape sequence in character class" -#define ERR8 "range out of order in character class" -#define ERR9 "nothing to repeat" -#define ERR10 "operand of unlimited repeat could match the empty string" -#define ERR11 "internal error: unexpected repeat" -#define ERR12 "unrecognized character after (?" -#define ERR13 "too many capturing parenthesized sub-patterns" -#define ERR14 "missing )" -#define ERR15 "back reference to non-existent subpattern" -#define ERR16 "erroffset passed as NULL" -#define ERR17 "unknown option bit(s) set" -#define ERR18 "missing ) after comment" -#define ERR19 "too many sets of parentheses" -#define ERR20 "regular expression too large" -#define ERR21 "failed to get memory" -#define ERR22 "unmatched brackets" -#define ERR23 "internal error: code overflow" - -/* All character handling must be done as unsigned characters. Otherwise there -are problems with top-bit-set characters and functions such as isspace(). -However, we leave the interface to the outside world as char *, because that -should make things easier for callers. We define a short type for unsigned char -to save lots of typing. I tried "uchar", but it causes problems on Digital -Unix, where it is defined in sys/types, so use "uschar" instead. */ - -typedef unsigned char uschar; - -/* The real format of the start of the pcre block; the actual code vector -runs on as long as necessary after the end. */ - -typedef struct real_pcre { - unsigned int magic_number; - unsigned short int options; - unsigned char top_bracket; - unsigned char top_backref; - unsigned char first_char; - unsigned char code[1]; -} real_pcre; - -/* The real format of the extra block returned by pcre_study(). */ - -typedef struct real_pcre_extra { - unsigned char options; - unsigned char start_bits[32]; -} real_pcre_extra; - -/* Global tables from chartables.c */ - -extern uschar pcre_lcc[]; -extern uschar pcre_fcc[]; -extern uschar pcre_cbits[]; -extern uschar pcre_ctypes[]; - -/* Bit definitions for entries in pcre_ctypes[]. */ - -#define ctype_space 0x01 -#define ctype_letter 0x02 -#define ctype_digit 0x04 -#define ctype_xdigit 0x08 -#define ctype_word 0x10 /* alphameric or '_' */ -#define ctype_odigit 0x20 /* octal digit */ -#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ - -/* Offsets for the bitmap tables */ - -#define cbit_digit 0 -#define cbit_letter 32 -#define cbit_word 64 -#define cbit_space 96 -#define cbit_length 128 /* Length of the cbits table */ - -/* End of internal.h */ diff --git a/Modules/pcre.h b/Modules/pcre.h index 3a215d5..4ba9d9c 100644 --- a/Modules/pcre.h +++ b/Modules/pcre.h @@ -55,14 +55,14 @@ extern void (*pcre_free)(void *); /* Functions */ #ifdef FOR_PYTHON -extern pcre *pcre_compile(const char *, int, char **, int *, PyObject *); +extern pcre *pcre_compile(const char *, int, const char **, int *, PyObject *); #else -extern pcre *pcre_compile(const char *, int, char **, int *); +extern pcre *pcre_compile(const char *, int, const char **, int *); #endif extern int pcre_exec(const pcre *, const pcre_extra *, const char *, int, int, int *, int); extern int pcre_info(const pcre *, int *, int *); -extern pcre_extra *pcre_study(const pcre *, int, char **); -extern char *pcre_version(void); +extern pcre_extra *pcre_study(const pcre *, int, const char **); +extern const char *pcre_version(void); #endif /* End of pcre.h */ diff --git a/Modules/pcremodule.c b/Modules/pcremodule.c index 8701ba5..2587fa0 100644 --- a/Modules/pcremodule.c +++ b/Modules/pcremodule.c @@ -1,5 +1,5 @@ /*********************************************************** -Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam, +Copyright 1997 by Stichting Mathematisch Centrum, Amsterdam, The Netherlands. All Rights Reserved @@ -33,6 +33,7 @@ PERFORMANCE OF THIS SOFTWARE. #include "Python.h" +#include #ifndef Py_eval_input /* For Python 1.4, graminit.h has to be explicitly included */ #include "graminit.h" @@ -44,7 +45,7 @@ PERFORMANCE OF THIS SOFTWARE. #endif #include "pcre.h" -#include "pcre-internal.h" +#include "pcre-int.h" static PyObject *ErrorObject; @@ -127,7 +128,9 @@ PyPcre_exec(self, args) if (count==PCRE_ERROR_NOMATCH) {Py_INCREF(Py_None); return Py_None;} if (count<0) { - PyErr_SetObject(ErrorObject, Py_BuildValue("si", "Regex error", count)); + PyObject *errval = Py_BuildValue("si", "Regex execution error", count); + PyErr_SetObject(ErrorObject, errval); + Py_XDECREF(errval); return NULL; } @@ -191,7 +194,7 @@ PyPcre_compile(self, args) PcreObject *rv; PyObject *dictionary; char *pattern, *newpattern; - char *error; + const char *error; int num_zeros, i, j; int patternlen, options, erroroffset; @@ -203,12 +206,13 @@ PyPcre_compile(self, args) return NULL; /* PCRE doesn't like having null bytes in its pattern, so we have to replace - any zeros in the string with the characters '\0'. */ - num_zeros=1; + any zeros in the string with the characters '\000'. This increases the size + of the string by 3*num_zeros, plus 1 byte for the terminating \0. */ + num_zeros=1; /* Start at 1; this will give 3 extra bytes of leeway */ for(i=0; iregex = pcre_compile((char*)newpattern, options, @@ -231,21 +241,27 @@ PyPcre_compile(self, args) PyMem_DEL(rv); if (!PyErr_Occurred()) { - PyErr_SetObject(ErrorObject, Py_BuildValue("si", error, erroroffset)); + PyObject *errval = Py_BuildValue("si", error, erroroffset); + PyErr_SetObject(ErrorObject, errval); + Py_XDECREF(errval); } return NULL; } rv->regex_extra=pcre_study(rv->regex, 0, &error); if (rv->regex_extra==NULL && error!=NULL) { + PyObject *errval = Py_BuildValue("si", error, 0); PyMem_DEL(rv); - PyErr_SetObject(ErrorObject, Py_BuildValue("si", error, 0)); + PyErr_SetObject(ErrorObject, errval); + Py_XDECREF(errval); return NULL; } rv->num_groups = pcre_info(rv->regex, NULL, NULL); if (rv->num_groups<0) { - PyErr_SetObject(ErrorObject, Py_BuildValue("si", "Regex error", rv->num_groups)); + PyObject *errval = Py_BuildValue("si", error, rv->num_groups); + PyErr_SetObject(ErrorObject, errval); + Py_XDECREF(errval); PyMem_DEL(rv); return NULL; } @@ -526,7 +542,7 @@ PyPcre_expand(self, args) Py_DECREF(r); Py_DECREF(tuple); if (result==NULL) { - /* The group() method trigged an exception of some sort */ + /* The group() method triggered an exception of some sort */ Py_DECREF(results); Py_DECREF(value); return NULL; diff --git a/Modules/pypcre.c b/Modules/pypcre.c index 699932f..55908e7 100644 --- a/Modules/pypcre.c +++ b/Modules/pypcre.c @@ -15,10 +15,9 @@ file by hand, or submit patches to it. The Python-specific PCRE distribution can be retrieved from http://starship.skyport.net/crew/amk/regex/ -The unmodified original PCRE distribution doesn't have a fixed URL -yet; write Philip Hazel for the latest version. - -Written by: Philip Hazel +The unmodified original PCRE distribution is available at +ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally +written by: Philip Hazel Extensively modified by the Python String-SIG: Send bug reports to: @@ -46,7 +45,7 @@ restrictions: #define FOR_PYTHON -#include "pcre-internal.h" +#include "pcre-int.h" #include "Python.h" #include "mymalloc.h" #include @@ -254,13 +253,13 @@ Returns: TRUE if table built, FALSE otherwise */ static BOOL -set_start_bits(uschar *code, uschar *start_bits) +set_start_bits(const uschar *code, uschar *start_bits) { register int c; do { - uschar *tcode = code + 3; + const uschar *tcode = code + 3; BOOL try_next = TRUE; while (try_next) @@ -466,12 +465,12 @@ Returns: pointer to a pcre_extra block, */ pcre_extra * -pcre_study(const pcre *external_re, int options, char **errorptr) +pcre_study(const pcre *external_re, int options, const char **errorptr) { BOOL caseless; uschar start_bits[32]; real_pcre_extra *extra; -real_pcre *re = (real_pcre *)external_re; +const real_pcre *re = (const real_pcre *)external_re; *errorptr = NULL; @@ -592,7 +591,8 @@ static char rep_max[] = { 0, 0, 0, 0, 1, 1 }; /* Text forms of OP_ values and things, for debugging */ #ifdef DEBUG -static char *OP_names[] = { "End", "\\A", "\\B", "\\b", "\\D", "\\d", +static const char *OP_names[] = { + "End", "\\A", "\\B", "\\b", "\\D", "\\d", "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "localized \\B", "localized \\b", "localized \\W", "localized \\w", "^", "$", "Any", "chars", @@ -627,8 +627,8 @@ static short int escapes[] = { /* Definition to allow mutual recursion */ -static BOOL compile_regex(int, int *, uschar **, uschar **, - char **, PyObject *); +static BOOL compile_regex(int, int *, uschar **, const uschar **, + const char **, PyObject *); /* Structure for passing "static" information around between the functions doing the matching, so that they are thread-safe. */ @@ -645,10 +645,10 @@ typedef struct match_data { BOOL noteol; /* NOTEOL flag */ BOOL dotall; /* Dot matches any char */ BOOL endonly; /* Dollar not before final \n */ - uschar *start_subject; /* Start of the subject string */ - uschar *end_subject; /* End of the subject string */ + const uschar *start_subject; /* Start of the subject string */ + const uschar *end_subject; /* End of the subject string */ jmp_buf fail_env; /* Environment for longjump() break out */ - uschar *end_match_ptr; /* Subject position at end match */ + const uschar *end_match_ptr; /* Subject position at end match */ int end_offset_top; /* Highwater mark at end of match */ jmp_buf error_env; /* For longjmp() if an error occurs deep inside a matching operation */ @@ -656,7 +656,7 @@ typedef struct match_data { int point; /* Point to add next item pushed onto stacks */ /* Pointers to the 6 stacks */ int *off_num, *offset_top, *r1, *r2; - uschar **eptr, **ecode; + const uschar **eptr, **ecode; } match_data; @@ -680,7 +680,7 @@ void (*pcre_free)(void *) = free; * Return version string * *************************************************/ -char * +const char * pcre_version(void) { return PCRE_VERSION; @@ -710,7 +710,7 @@ Returns: number of identifying extraction brackets int pcre_info(const pcre *external_re, int *optptr, int *first_char) { -real_pcre *re = (real_pcre *)external_re; +const real_pcre *re = (real_pcre *)external_re; if (re == NULL) return PCRE_ERROR_NULL; if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC; if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS); @@ -906,9 +906,9 @@ Arguments: */ static int -get_group_id(uschar *ptr, char finalchar, char **errorptr) +get_group_id(const uschar *ptr, char finalchar, const char **errorptr) { - uschar *start = ptr; + const uschar *start = ptr; /* If the first character is not in \w, or is in \w but is a digit, report an error */ @@ -960,10 +960,10 @@ Returns: zero or positive => a data character */ static int -check_escape(uschar **ptrptr, char **errorptr, int bracount, int options, - BOOL isclass) +check_escape(const uschar **ptrptr, const char **errorptr, int bracount, + int options, BOOL isclass) { -uschar *ptr = *ptrptr; +const uschar *ptr = *ptrptr; int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */ int i; @@ -1092,7 +1092,7 @@ Returns: TRUE or FALSE */ static BOOL -is_counted_repeat(uschar *p) +is_counted_repeat(const uschar *p) { if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE; while ((pcre_ctypes[*p] & ctype_digit) != 0) p++; @@ -1127,8 +1127,8 @@ Returns: pointer to '}' on success; current ptr on error, with errorptr set */ -static uschar * -read_repeat_counts(uschar *p, int *minp, int *maxp, char **errorptr) +static const uschar * +read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr) { int min = 0; int max = -1; @@ -1183,16 +1183,16 @@ Returns: TRUE on success static BOOL compile_branch(int options, int *brackets, uschar **codeptr, - uschar **ptrptr, char **errorptr, PyObject *dictionary) + const uschar **ptrptr, const char **errorptr, PyObject *dictionary) { int repeat_type, op_type; int repeat_min, repeat_max; int bravalue, length; register int c; register uschar *code = *codeptr; -uschar *ptr = *ptrptr; +const uschar *ptr = *ptrptr; +const uschar *oldptr; uschar *previous = NULL; -uschar *oldptr; uschar class[32]; uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */ @@ -1299,7 +1299,7 @@ for (;; ptr++) /* Backslash may introduce a single character, or it may introduce one of the specials, which just set a flag. Escaped items are checked for validity in the pre-compiling pass. The sequence \b is a special case. - Inside a class (and only there) it is treated as backslash. Elsewhere + Inside a class (and only there) it is treated as backspace. Elsewhere it marks a word boundary. Other escapes have preset maps ready to or into the one we are building. We assume they have more than one character in them, so set class_count bigger than one. */ @@ -1314,22 +1314,12 @@ for (;; ptr++) switch (-c) { case ESC_d: - if (options & PCRE_LOCALE) - { - *class_flag |= 4; - } - else { for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit]; } continue; case ESC_D: - if (options & PCRE_LOCALE) - { - *class_flag |= 8; - } - else { for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit]; } @@ -1360,22 +1350,12 @@ for (;; ptr++) continue; case ESC_s: - if (options & PCRE_LOCALE) - { - *class_flag |= 32; - } - else { for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space]; } continue; case ESC_S: - if (options & PCRE_LOCALE) - { - *class_flag |= 32; - } - else { for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space]; } @@ -1795,7 +1775,7 @@ for (;; ptr++) } string = PyString_FromStringAndSize((char*)ptr, idlen); intobj = PyInt_FromLong( brackets[0] + 1 ); - if (intobj == NULL || string==NULL) + if (intobj == NULL || string == NULL) { Py_XDECREF(string); Py_XDECREF(intobj); @@ -1803,7 +1783,7 @@ for (;; ptr++) goto FAILED; } PyDict_SetItem(dictionary, string, intobj); - Py_DECREF(string); Py_DECREF(intobj); + Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */ ptr += idlen+1; /* Point to rest of expression */ goto do_grouping_bracket; } @@ -1820,7 +1800,6 @@ for (;; ptr++) } string = PyString_FromStringAndSize((char *)ptr, idlen); if (string==NULL) { - Py_XDECREF(string); *errorptr = "exception raised"; goto FAILED; } @@ -1833,6 +1812,10 @@ for (;; ptr++) refnum = PyInt_AsLong(intobj); Py_DECREF(string); + /* The caller doesn't own the reference to the value + returned from PyDict_GetItem, so intobj is not + DECREF'ed. */ + *code++ = OP_REF; *code++ = refnum; /* The continue will cause the top-level for() loop to @@ -1943,7 +1926,7 @@ for (;; ptr++) continue; } - /* Reset and fall through */ + /* Data character: Reset and fall through */ ptr = oldptr; c = '\\'; @@ -2035,9 +2018,9 @@ Returns: TRUE on success static BOOL compile_regex(int options, int *brackets, uschar **codeptr, - uschar **ptrptr, char **errorptr, PyObject *dictionary) + const uschar **ptrptr, const char **errorptr, PyObject *dictionary) { -uschar *ptr = *ptrptr; +const uschar *ptr = *ptrptr; uschar *code = *codeptr; uschar *start_bracket = code; @@ -2103,7 +2086,7 @@ Returns: TRUE or FALSE */ static BOOL -is_anchored(register uschar *code, BOOL multiline) +is_anchored(register const uschar *code, BOOL multiline) { do { int op = (int)code[3]; @@ -2132,7 +2115,7 @@ Returns: TRUE or FALSE */ static BOOL -is_startline(uschar *code) +is_startline(const uschar *code) { do { if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT) @@ -2217,7 +2200,7 @@ Returns: pointer to compiled data block, or NULL on error, */ pcre * -pcre_compile(const char *pattern, int options, char **errorptr, +pcre_compile(const char *pattern, int options, const char **errorptr, int *erroroffset, PyObject *dictionary) { real_pcre *re; @@ -2227,9 +2210,10 @@ int runlength; int c, size; int bracount = 0; int brastack[200]; -int brastackptr = 0; int top_backref = 0; -uschar *code, *ptr; +unsigned int brastackptr = 0; +uschar *code; +const uschar *ptr; #ifdef DEBUG uschar *code_base, *code_end; @@ -2268,7 +2252,7 @@ internal flag settings. Make an attempt to correct for any counted white space if an "extended" flag setting appears late in the pattern. We can't be so clever for #-comments. */ -ptr = (uschar *)(pattern - 1); +ptr = (const uschar *)(pattern - 1); while ((c = *(++ptr)) != 0) { int min, max; @@ -2295,7 +2279,7 @@ while ((c = *(++ptr)) != 0) case '\\': { - uschar *save_ptr = ptr; + const uschar *save_ptr = ptr; c = check_escape(&ptr, errorptr, bracount, options, FALSE); if (*errorptr != NULL) goto PCRE_ERROR_RETURN; if (c >= 0) @@ -2585,7 +2569,7 @@ while ((c = *(++ptr)) != 0) if (c == '\\') { - uschar *saveptr = ptr; + const uschar *saveptr = ptr; c = check_escape(&ptr, errorptr, bracount, options, FALSE); if (*errorptr != NULL) goto PCRE_ERROR_RETURN; if (c < 0) { ptr = saveptr; break; } @@ -2633,7 +2617,7 @@ re->options = options; error, *errorptr will be set non-NULL, so we don't need to look at the result of the function here. */ -ptr = (uschar *)pattern; +ptr = (const uschar *)pattern; code = re->code; *code = OP_BRA; bracount = 0; @@ -2661,7 +2645,7 @@ if (*errorptr != NULL) { (pcre_free)(re); PCRE_ERROR_RETURN: - *erroroffset = ptr - (uschar *)pattern; + *erroroffset = ptr - (const uschar *)pattern; return NULL; } @@ -2947,8 +2931,8 @@ switch(type) case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0; case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0; case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0; - case OP_NOT_WORDCHAR_L: return (c!='_' && !isalpha(c)); - case OP_WORDCHAR_L: return (c=='_' || isalpha(c)); + case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c)); + case OP_WORDCHAR_L: return (c=='_' || isalnum(c)); } return FALSE; } @@ -2971,9 +2955,9 @@ Returns: TRUE if matched */ static BOOL -match_ref(int number, register uschar *eptr, int length, match_data *md) +match_ref(int number, register const uschar *eptr, int length, match_data *md) { -uschar *p = md->start_subject + md->offset_vector[number]; +const uschar *p = md->start_subject + md->offset_vector[number]; #ifdef DEBUG if (eptr >= md->end_subject) @@ -2992,7 +2976,7 @@ printf("\n"); if (length > md->end_subject - p) return FALSE; -/* Separate the caselesss case for speed */ +/* Separate the caseless case for speed */ if (md->caseless) { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; } @@ -3027,8 +3011,8 @@ static int grow_stack(match_data *md) else {md->length = 80;} } PyMem_RESIZE(md->offset_top, int, md->length); - PyMem_RESIZE(md->eptr, uschar *, md->length); - PyMem_RESIZE(md->ecode, uschar *, md->length); + PyMem_RESIZE(md->eptr, const uschar *, md->length); + PyMem_RESIZE(md->ecode, const uschar *, md->length); PyMem_RESIZE(md->off_num, int, md->length); PyMem_RESIZE(md->r1, int, md->length); PyMem_RESIZE(md->r2, int, md->length); @@ -3058,7 +3042,7 @@ Returns: TRUE if matched */ static BOOL -match(register uschar *eptr, register uschar *ecode, int offset_top, +match(register const uschar *eptr, register const uschar *ecode, int offset_top, match_data *md) { int save_stack_position = md->point; @@ -3072,7 +3056,7 @@ for (;;) int min, max, ctype; register int i; register int c; - BOOL minimize; + BOOL minimize = FALSE; /* Opening bracket. Check the alternative branches in turn, failing if none match. We have to set the start offset if required and there is space @@ -3085,7 +3069,7 @@ for (;;) if ((int)*ecode >= OP_BRA) { int number = (*ecode - OP_BRA) << 1; - int save_offset1, save_offset2; + int save_offset1 = 0, save_offset2 = 0; #ifdef DEBUG printf("start bracket %d\n", number/2); @@ -3212,7 +3196,7 @@ for (;;) case OP_BRAZERO: { - uschar *next = ecode+1; + const uschar *next = ecode+1; if (match(eptr, next, offset_top, md)) SUCCEED; do next += (next[1] << 8) + next[2]; while (*next == OP_ALT); ecode = next + 3; @@ -3221,7 +3205,7 @@ for (;;) case OP_BRAMINZERO: { - uschar *next = ecode+1; + const uschar *next = ecode+1; do next += (next[1] << 8) + next[2]; while (*next == OP_ALT); if (match(eptr, next+3, offset_top, md)) SUCCEED; ecode++; @@ -3237,7 +3221,7 @@ for (;;) case OP_KETRMAX: { int number; - uschar *prev = ecode - (ecode[1] << 8) - ecode[2]; + const uschar *prev = ecode - (ecode[1] << 8) - ecode[2]; if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE) { @@ -3279,11 +3263,10 @@ for (;;) if (*ecode == OP_KETRMIN) { - uschar *ptr; + const uschar *ptr; if (match(eptr, ecode+3, offset_top, md)) goto succeed; /* Handle alternation inside the BRA...KET; push the additional - alternatives onto the stack - XXX this tries the alternatives backwards! */ + alternatives onto the stack */ ptr=prev; do { ptr += (ptr[1]<<8)+ ptr[2]; @@ -3306,8 +3289,8 @@ for (;;) } else /* OP_KETRMAX */ { - uschar *ptr; - int points_pushed=0; + const uschar *ptr; + /*int points_pushed=0;*/ /* Push one failure point, that will resume matching at the code after the KETRMAX opcode. */ @@ -3325,8 +3308,7 @@ for (;;) md->offset_vector[number] = eptr - md->start_subject; /* Handle alternation inside the BRA...KET; push each of the - additional alternatives onto the stack - XXX this tries the alternatives backwards! */ + additional alternatives onto the stack */ ptr=prev; do { ptr += (ptr[1]<<8)+ ptr[2]; @@ -3344,15 +3326,15 @@ for (;;) md->r2[md->point] = 0; md->off_num[md->point] = 0; md->point++; - points_pushed++; + /*points_pushed++;*/ } } while (*ptr==OP_ALT); /* Jump to the first (or only) alternative and resume trying to match */ ecode=prev+3; goto match_loop; } } - FAIL; - + break; + /* Start of subject unless notbol, or after internal newline if multiline */ case OP_CIRC: @@ -3419,9 +3401,9 @@ for (;;) case OP_WORD_BOUNDARY_L: { BOOL prev_is_word = (eptr != md->start_subject) && - (isalpha(eptr[-1]) || eptr[-1]=='_'); + (isalnum(eptr[-1]) || eptr[-1]=='_'); BOOL cur_is_word = (eptr < md->end_subject) && - (isalpha(eptr[-1]) || eptr[-1]=='_'); + (isalnum(*eptr) || *eptr=='_'); if ((*ecode++ == OP_WORD_BOUNDARY_L)? cur_is_word == prev_is_word : cur_is_word != prev_is_word) FAIL; @@ -3474,14 +3456,14 @@ for (;;) break; case OP_NOT_WORDCHAR_L: - if (eptr >= md->end_subject || (*eptr=='_' || isalpha(*eptr) )) + if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) )) return FALSE; eptr++; ecode++; break; case OP_WORDCHAR_L: - if (eptr >= md->end_subject || (*eptr!='_' && !isalpha(*eptr) )) + if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) )) return FALSE; eptr++; ecode++; @@ -3577,7 +3559,7 @@ for (;;) else { - uschar *pp = eptr; + const uschar *pp = eptr; for (i = min; i < max; i++) { if (!match_ref(number, eptr, length, md)) break; @@ -3601,8 +3583,8 @@ for (;;) case OP_CLASS: { - uschar *data = ecode + 1; /* Save for matching */ - ecode += 33; /* Advance past the item */ + const uschar *data = ecode + 1; /* Save for matching */ + ecode += 33; /* Advance past the item */ switch (*ecode) { @@ -3685,7 +3667,7 @@ for (;;) else { - uschar *pp = eptr; + const uschar *pp = eptr; for (i = min; i < max; eptr++, i++) { if (eptr >= md->end_subject) break; @@ -3710,8 +3692,8 @@ for (;;) case OP_CLASS_L: { - uschar *data = ecode + 1; /* Save for matching */ - uschar locale_flag = *data; + const uschar *data = ecode + 1; /* Save for matching */ + const uschar locale_flag = *data; ecode++; data++; /* The localization support adds an extra byte */ ecode += 33; /* Advance past the item */ @@ -3744,8 +3726,8 @@ for (;;) if (eptr >= md->end_subject) FAIL; c = *eptr++; if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */ - if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */ - if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */ + if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */ + if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */ #if 0 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */ if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */ @@ -3758,8 +3740,8 @@ for (;;) c = pcre_fcc[c]; if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */ - if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */ - if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */ + if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */ + if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */ } FAIL; } @@ -3771,15 +3753,15 @@ for (;;) if (eptr >= md->end_subject) FAIL; c = *eptr++; if ((data[c/8] & (1 << (c&7))) != 0) continue; - if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */ - if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */ + if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */ + if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */ if (md->runtime_caseless) { c = pcre_fcc[c]; if ((data[c/8] & (1 << (c&7))) != 0) continue; - if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */ - if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */ + if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */ + if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */ } FAIL; } @@ -3800,15 +3782,15 @@ for (;;) if (i >= max || eptr >= md->end_subject) FAIL; c = *eptr++; if ((data[c/8] & (1 << (c&7))) != 0) continue; - if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */ - if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */ + if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */ + if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */ if (md->runtime_caseless) { c = pcre_fcc[c]; if ((data[c/8] & (1 << (c&7))) != 0) continue; - if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */ - if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */ + if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */ + if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */ } FAIL; } @@ -3819,20 +3801,20 @@ for (;;) else { - uschar *pp = eptr; + const uschar *pp = eptr; for (i = min; i < max; eptr++, i++) { if (eptr >= md->end_subject) break; c = *eptr; if ((data[c/8] & (1 << (c&7))) != 0) continue; - if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */ - if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */ + if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */ + if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */ if (md->runtime_caseless) { c = pcre_fcc[c]; if ((data[c/8] & (1 << (c&7))) != 0) continue; - if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */ - if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */ + if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */ + if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */ } break; } @@ -3941,7 +3923,7 @@ for (;;) } else { - uschar *pp = eptr; + const uschar *pp = eptr; for (i = min; i < max; i++) { if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break; @@ -3971,7 +3953,7 @@ for (;;) } else { - uschar *pp = eptr; + const uschar *pp = eptr; for (i = min; i < max; i++) { if (eptr >= md->end_subject || c != *eptr) break; @@ -4068,7 +4050,7 @@ for (;;) } else { - uschar *pp = eptr; + const uschar *pp = eptr; for (i = min; i < max; i++) { if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break; @@ -4098,7 +4080,7 @@ for (;;) } else { - uschar *pp = eptr; + const uschar *pp = eptr; for (i = min; i < max; i++) { if (eptr >= md->end_subject || c == *eptr) break; @@ -4191,12 +4173,12 @@ for (;;) break; case OP_NOT_WORDCHAR_L: - for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalpha(*eptr)) + for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr)) return FALSE; break; case OP_WORDCHAR_L: - for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalpha(*eptr)) + for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr)) return FALSE; break; } @@ -4225,7 +4207,7 @@ for (;;) else { - uschar *pp = eptr; + const uschar *pp = eptr; switch(ctype) { case OP_ANY: @@ -4301,7 +4283,7 @@ for (;;) case OP_NOT_WORDCHAR_L: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (*eptr=='_' || isalpha(*eptr) ) ) + if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) ) break; eptr++; } @@ -4310,7 +4292,7 @@ for (;;) case OP_WORDCHAR_L: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (*eptr!='_' && !isalpha(*eptr) ) ) + if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) ) break; eptr++; } @@ -4399,17 +4381,20 @@ int pcre_exec(const pcre *external_re, const pcre_extra *external_extra, const char *subject, int length, int options, int *offsets, int offsetcount) { -int resetcount; -int ocount = offsetcount; -int first_char = -1; + /* The "volatile" directives are to make gcc -Wall stop complaining + that these variables can be clobbered by the longjmp. Hopefully + they won't cost too much performance. */ +volatile int resetcount; +volatile int ocount = offsetcount; +volatile int first_char = -1; match_data match_block; -uschar *start_bits = NULL; -uschar *start_match = (uschar *)subject; -uschar *end_subject; -real_pcre *re = (real_pcre *)external_re; -real_pcre_extra *extra = (real_pcre_extra *)external_extra; -BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0; -BOOL startline = (re->options & PCRE_STARTLINE) != 0; +volatile const uschar *start_bits = NULL; +const uschar *start_match = (uschar *)subject; +const uschar *end_subject; +const real_pcre *re = (const real_pcre *)external_re; +const real_pcre_extra *extra = (const real_pcre_extra *)external_extra; +volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0; +volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0; if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; @@ -4417,7 +4402,7 @@ if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC; -match_block.start_subject = (uschar *)subject; +match_block.start_subject = (const uschar *)subject; match_block.end_subject = match_block.start_subject + length; end_subject = match_block.end_subject; @@ -4626,13 +4611,3 @@ return match_block.errorcode; } /* End of pcre.c */ - - - - - - - - - - -- cgit v0.12