diff options
Diffstat (limited to 'generic/tclRegexp.c')
-rw-r--r-- | generic/tclRegexp.c | 193 |
1 files changed, 120 insertions, 73 deletions
diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index b5e3bec..6348e4a 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -9,8 +9,6 @@ * * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. - * - * RCS: @(#) $Id: tclRegexp.c,v 1.21 2005/11/02 00:55:06 dkf Exp $ */ #include "tclInt.h" @@ -55,8 +53,8 @@ * * *** NOTE: this code has been altered slightly for use in Tcl: *** * *** 1. Names have been changed, e.g. from re_comp to *** - * *** TclRegComp, to avoid clashes with other *** - * *** regexp implementations used by applications. *** + * *** TclRegComp, to avoid clashes with other *** + * *** regexp implementations used by applications. *** */ /* @@ -85,7 +83,7 @@ static Tcl_ThreadDataKey dataKey; * Declarations for functions used only in this file. */ -static TclRegexp * CompileRegexp(Tcl_Interp *interp, CONST char *pattern, +static TclRegexp * CompileRegexp(Tcl_Interp *interp, const char *pattern, int length, int flags); static void DupRegexpInternalRep(Tcl_Obj *srcPtr, Tcl_Obj *copyPtr); @@ -93,7 +91,7 @@ static void FinalizeRegexp(ClientData clientData); static void FreeRegexp(TclRegexp *regexpPtr); static void FreeRegexpInternalRep(Tcl_Obj *objPtr); static int RegExpExecUniChar(Tcl_Interp *interp, Tcl_RegExp re, - CONST Tcl_UniChar *uniString, int numChars, + const Tcl_UniChar *uniString, int numChars, int nmatches, int flags); static int SetRegexpFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr); @@ -102,7 +100,7 @@ static int SetRegexpFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr); * compiled form of the regular expression. */ -Tcl_ObjType tclRegexpType = { +const Tcl_ObjType tclRegexpType = { "regexp", /* name */ FreeRegexpInternalRep, /* freeIntRepProc */ DupRegexpInternalRep, /* dupIntRepProc */ @@ -136,7 +134,7 @@ Tcl_RegExp Tcl_RegExpCompile( Tcl_Interp *interp, /* For use in error reporting and to access * the interp regexp cache. 
*/ - CONST char *pattern) /* String for which to produce compiled + const char *pattern) /* String for which to produce compiled * regular expression. */ { return (Tcl_RegExp) CompileRegexp(interp, pattern, (int) strlen(pattern), @@ -169,15 +167,15 @@ Tcl_RegExpExec( Tcl_RegExp re, /* Compiled regular expression; must have been * returned by previous call to * Tcl_GetRegExpFromObj. */ - CONST char *text, /* Text against which to match re. */ - CONST char *start) /* If text is part of a larger string, this + const char *text, /* Text against which to match re. */ + const char *start) /* If text is part of a larger string, this * identifies beginning of larger string, so * that "^" won't match. */ { int flags, result, numChars; - TclRegexp *regexp = (TclRegexp *)re; + TclRegexp *regexp = (TclRegexp *) re; Tcl_DString ds; - CONST Tcl_UniChar *ustr; + const Tcl_UniChar *ustr; /* * If the starting point is offset from the beginning of the buffer, then @@ -237,13 +235,13 @@ Tcl_RegExpRange( int index, /* 0 means give the range of the entire match, * > 0 means give the range of a matching * subrange. */ - CONST char **startPtr, /* Store address of first character in + const char **startPtr, /* Store address of first character in * (sub-)range here. */ - CONST char **endPtr) /* Store address of character just after last + const char **endPtr) /* Store address of character just after last * in (sub-)range here. */ { TclRegexp *regexpPtr = (TclRegexp *) re; - CONST char *string; + const char *string; if ((size_t) index > regexpPtr->re.re_nsub) { *startPtr = *endPtr = NULL; @@ -251,7 +249,7 @@ Tcl_RegExpRange( *startPtr = *endPtr = NULL; } else { if (regexpPtr->objPtr) { - string = Tcl_GetString(regexpPtr->objPtr); + string = TclGetString(regexpPtr->objPtr); } else { string = regexpPtr->string; } @@ -285,7 +283,7 @@ RegExpExecUniChar( Tcl_Interp *interp, /* Interpreter to use for error reporting. 
*/ Tcl_RegExp re, /* Compiled regular expression; returned by a * previous call to Tcl_GetRegExpFromObj */ - CONST Tcl_UniChar *wString, /* String against which to match re. */ + const Tcl_UniChar *wString, /* String against which to match re. */ int numChars, /* Length of Tcl_UniChar string (must be * >=0). */ int nmatches, /* How many subexpression matches (counting @@ -390,12 +388,11 @@ TclRegExpRangeUniChar( int Tcl_RegExpMatch( Tcl_Interp *interp, /* Used for error reporting. May be NULL. */ - CONST char *text, /* Text to search for pattern matches. */ - CONST char *pattern) /* Regular expression to match against text. */ + const char *text, /* Text to search for pattern matches. */ + const char *pattern) /* Regular expression to match against text. */ { - Tcl_RegExp re; + Tcl_RegExp re = Tcl_RegExpCompile(interp, pattern); - re = Tcl_RegExpCompile(interp, pattern); if (re == NULL) { return -1; } @@ -437,6 +434,28 @@ Tcl_RegExpExecObj( TclRegexp *regexpPtr = (TclRegexp *) re; Tcl_UniChar *udata; int length; + int reflags = regexpPtr->flags; +#define TCL_REG_GLOBOK_FLAGS \ + (TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE) + + /* + * Take advantage of the equivalent glob pattern, if one exists. + * This is possible based only on the right mix of incoming flags (0) + * and regexp compile flags. + */ + if ((offset == 0) && (nmatches == 0) && (flags == 0) + && !(reflags & ~TCL_REG_GLOBOK_FLAGS) + && (regexpPtr->globObjPtr != NULL)) { + int nocase = (reflags & TCL_REG_NOCASE) ? TCL_MATCH_NOCASE : 0; + + /* + * Pass to TclStringMatchObj for obj-specific handling. + * XXX: Currently doesn't take advantage of exact-ness that + * XXX: TclReToGlob tells us about + */ + + return TclStringMatchObj(textObj, regexpPtr->globObjPtr, nocase); + } /* * Save the target object so we can extract strings from it later. 
@@ -552,17 +571,17 @@ Tcl_GetRegExpFromObj( { int length; TclRegexp *regexpPtr; - char *pattern; + const char *pattern; /* * This is OK because we only actually interpret this value properly as a * TclRegexp* when the type is tclRegexpType. */ - regexpPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr; + regexpPtr = objPtr->internalRep.twoPtrValue.ptr1; if ((objPtr->typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) { - pattern = Tcl_GetStringFromObj(objPtr, &length); + pattern = TclGetStringFromObj(objPtr, &length); regexpPtr = CompileRegexp(interp, pattern, length, flags); if (regexpPtr == NULL) { @@ -582,7 +601,7 @@ Tcl_GetRegExpFromObj( */ TclFreeIntRep(objPtr); - objPtr->internalRep.otherValuePtr = (void *) regexpPtr; + objPtr->internalRep.twoPtrValue.ptr1 = regexpPtr; objPtr->typePtr = &tclRegexpType; } return (Tcl_RegExp) regexpPtr; @@ -612,12 +631,12 @@ TclRegAbout( Tcl_Interp *interp, /* For use in variable assignment. */ Tcl_RegExp re) /* The compiled regular expression. */ { - TclRegexp *regexpPtr = (TclRegexp *)re; - char buf[TCL_INTEGER_SPACE]; - static struct infoname { + TclRegexp *regexpPtr = (TclRegexp *) re; + struct infoname { int bit; - char *text; - } infonames[] = { + const char *text; + }; + static const struct infoname infonames[] = { {REG_UBACKREF, "REG_UBACKREF"}, {REG_ULOOKAHEAD, "REG_ULOOKAHEAD"}, {REG_UBOUNDS, "REG_UBOUNDS"}, @@ -632,38 +651,42 @@ TclRegAbout( {REG_UEMPTYMATCH, "REG_UEMPTYMATCH"}, {REG_UIMPOSSIBLE, "REG_UIMPOSSIBLE"}, {REG_USHORTEST, "REG_USHORTEST"}, - {0, ""} + {0, NULL} }; - struct infoname *inf; - int n; + const struct infoname *inf; + Tcl_Obj *infoObj, *resultObj; + + /* + * The reset here guarantees that the interpreter result is empty and + * unshared. This means that we can use Tcl_ListObjAppendElement on the + * result object quite safely. 
+ */ Tcl_ResetResult(interp); - sprintf(buf, "%u", (unsigned)(regexpPtr->re.re_nsub)); - Tcl_AppendElement(interp, buf); + /* + * Assume that there will never be more than INT_MAX subexpressions. This + * is a pretty reasonable assumption; the RE engine doesn't scale _that_ + * well and Tcl has other limits that constrain things as well... + */ + + resultObj = Tcl_NewObj(); + Tcl_ListObjAppendElement(NULL, resultObj, + Tcl_NewIntObj((int) regexpPtr->re.re_nsub)); /* - * Must count bits before generating list, because we must know whether {} - * are needed before we start appending names. + * Now append a list of all the bit-flags set for the RE. */ - n = 0; - for (inf = infonames; inf->bit != 0; inf++) { - if (regexpPtr->re.re_info&inf->bit) { - n++; + TclNewObj(infoObj); + for (inf=infonames ; inf->bit != 0 ; inf++) { + if (regexpPtr->re.re_info & inf->bit) { + Tcl_ListObjAppendElement(NULL, infoObj, + Tcl_NewStringObj(inf->text, -1)); } } - if (n != 1) { - Tcl_AppendResult(interp, " {", NULL); - } - for (inf = infonames; inf->bit != 0; inf++) { - if (regexpPtr->re.re_info&inf->bit) { - Tcl_AppendElement(interp, inf->text); - } - } - if (n != 1) { - Tcl_AppendResult(interp, "}", NULL); - } + Tcl_ListObjAppendElement(NULL, resultObj, infoObj); + Tcl_SetObjResult(interp, resultObj); return 0; } @@ -687,18 +710,18 @@ TclRegAbout( void TclRegError( Tcl_Interp *interp, /* Interpreter for error reporting. */ - CONST char *msg, /* Message to prepend to error. */ + const char *msg, /* Message to prepend to error. */ int status) /* Status code to report. */ { char buf[100]; /* ample in practice */ - char cbuf[100]; /* lots in practice */ + char cbuf[TCL_INTEGER_SPACE]; size_t n; - char *p; + const char *p; Tcl_ResetResult(interp); n = TclReError(status, NULL, buf, sizeof(buf)); p = (n > sizeof(buf)) ? "..." 
: ""; - Tcl_AppendResult(interp, msg, buf, p, NULL); + Tcl_SetObjResult(interp, Tcl_ObjPrintf("%s%s%s", msg, buf, p)); sprintf(cbuf, "%d", status); (void) TclReError(REG_ITOA, NULL, cbuf, sizeof(cbuf)); @@ -726,7 +749,7 @@ static void FreeRegexpInternalRep( Tcl_Obj *objPtr) /* Regexp object with internal rep to free. */ { - TclRegexp *regexpRepPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr; + TclRegexp *regexpRepPtr = objPtr->internalRep.twoPtrValue.ptr1; /* * If this is the last reference to the regexp, free it. @@ -735,6 +758,7 @@ FreeRegexpInternalRep( if (--(regexpRepPtr->refCount) <= 0) { FreeRegexp(regexpRepPtr); } + objPtr->typePtr = NULL; } /* @@ -759,10 +783,10 @@ DupRegexpInternalRep( Tcl_Obj *srcPtr, /* Object with internal rep to copy. */ Tcl_Obj *copyPtr) /* Object with internal rep to set. */ { - TclRegexp *regexpPtr = (TclRegexp *) srcPtr->internalRep.otherValuePtr; + TclRegexp *regexpPtr = srcPtr->internalRep.twoPtrValue.ptr1; regexpPtr->refCount++; - copyPtr->internalRep.otherValuePtr = srcPtr->internalRep.otherValuePtr; + copyPtr->internalRep.twoPtrValue.ptr1 = srcPtr->internalRep.twoPtrValue.ptr1; copyPtr->typePtr = &tclRegexpType; } @@ -822,15 +846,14 @@ SetRegexpFromAny( static TclRegexp * CompileRegexp( Tcl_Interp *interp, /* Used for error reporting if not NULL. */ - CONST char *string, /* The regexp to compile (UTF-8). */ + const char *string, /* The regexp to compile (UTF-8). */ int length, /* The length of the string in bytes. */ int flags) /* Compilation flags. */ { TclRegexp *regexpPtr; - CONST Tcl_UniChar *uniString; - int numChars; + const Tcl_UniChar *uniString; + int numChars, status, i, exact; Tcl_DString stringBuf; - int status, i; ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); if (!tsdPtr->initialized) { @@ -882,7 +905,7 @@ CompileRegexp( * This is a new expression, so compile it and add it to the cache. 
*/ - regexpPtr = (TclRegexp *) ckalloc(sizeof(TclRegexp)); + regexpPtr = ckalloc(sizeof(TclRegexp)); regexpPtr->objPtr = NULL; regexpPtr->string = NULL; regexpPtr->details.rm_extend.rm_so = -1; @@ -909,22 +932,34 @@ CompileRegexp( * Clean up and report errors in the interpreter, if possible. */ - ckfree((char *)regexpPtr); + ckfree(regexpPtr); if (interp) { TclRegError(interp, - "couldn't compile regular expression pattern: ", - status); + "couldn't compile regular expression pattern: ", status); } return NULL; } /* + * Convert RE to a glob pattern equivalent, if any, and cache it. If this + * is not possible, then globObjPtr will be NULL. This is used by + * Tcl_RegExpExecObj to optionally do a fast match (avoids RE engine). + */ + + if (TclReToGlob(NULL, string, length, &stringBuf, &exact) == TCL_OK) { + regexpPtr->globObjPtr = TclDStringToObj(&stringBuf); + Tcl_IncrRefCount(regexpPtr->globObjPtr); + } else { + regexpPtr->globObjPtr = NULL; + } + + /* * Allocate enough space for all of the subexpressions, plus one extra for * the entire pattern. */ - regexpPtr->matches = (regmatch_t *) ckalloc( - sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1)); + regexpPtr->matches = + ckalloc(sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1)); /* * Initialize the refcount to one initially, since it is in the cache. 
@@ -939,6 +974,7 @@ CompileRegexp( if (tsdPtr->patterns[NUM_REGEXPS-1] != NULL) { TclRegexp *oldRegexpPtr = tsdPtr->regexps[NUM_REGEXPS-1]; + if (--(oldRegexpPtr->refCount) <= 0) { FreeRegexp(oldRegexpPtr); } @@ -949,8 +985,8 @@ CompileRegexp( tsdPtr->patLengths[i+1] = tsdPtr->patLengths[i]; tsdPtr->regexps[i+1] = tsdPtr->regexps[i]; } - tsdPtr->patterns[0] = (char *) ckalloc((unsigned) (length+1)); - strcpy(tsdPtr->patterns[0], string); + tsdPtr->patterns[0] = ckalloc(length + 1); + memcpy(tsdPtr->patterns[0], string, (unsigned) length + 1); tsdPtr->patLengths[0] = length; tsdPtr->regexps[0] = regexpPtr; @@ -978,10 +1014,13 @@ FreeRegexp( TclRegexp *regexpPtr) /* Compiled regular expression to free. */ { TclReFree(&regexpPtr->re); + if (regexpPtr->globObjPtr) { + TclDecrRefCount(regexpPtr->globObjPtr); + } if (regexpPtr->matches) { - ckfree((char *) regexpPtr->matches); + ckfree(regexpPtr->matches); } - ckfree((char *) regexpPtr); + ckfree(regexpPtr); } /* @@ -1014,7 +1053,15 @@ FinalizeRegexp( FreeRegexp(regexpPtr); } ckfree(tsdPtr->patterns[i]); + tsdPtr->patterns[i] = NULL; } + + /* + * We may find ourselves reinitialized if another finalization routine + * invokes regexps. + */ + + tsdPtr->initialized = 0; } /* |