summaryrefslogtreecommitdiffstats
path: root/generic/tclRegexp.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclRegexp.c')
-rw-r--r--generic/tclRegexp.c713
1 files changed, 432 insertions, 281 deletions
diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c
index 2318780..6348e4a 100644
--- a/generic/tclRegexp.c
+++ b/generic/tclRegexp.c
@@ -1,26 +1,23 @@
-/*
+/*
* tclRegexp.c --
*
- * This file contains the public interfaces to the Tcl regular
- * expression mechanism.
+ * This file contains the public interfaces to the Tcl regular expression
+ * mechanism.
*
* Copyright (c) 1998 by Sun Microsystems, Inc.
* Copyright (c) 1998-1999 by Scriptics Corporation.
*
- * See the file "license.terms" for information on usage and redistribution
- * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
- *
- * RCS: @(#) $Id: tclRegexp.c,v 1.6 1999/06/02 01:53:31 stanton Exp $
+ * See the file "license.terms" for information on usage and redistribution of
+ * this file, and for a DISCLAIMER OF ALL WARRANTIES.
*/
#include "tclInt.h"
-#include "tclPort.h"
#include "tclRegexp.h"
/*
*----------------------------------------------------------------------
- * The routines in this file use Henry Spencer's regular expression
- * package contained in the following additional source files:
+ * The routines in this file use Henry Spencer's regular expression package
+ * contained in the following additional source files:
*
* regc_color.c regc_cvec.c regc_lex.c
* regc_nfa.c regcomp.c regcustom.h
@@ -29,23 +26,23 @@
* regfronts.c regguts.h
*
* Copyright (c) 1998 Henry Spencer. All rights reserved.
- *
+ *
* Development of this software was funded, in part, by Cray Research Inc.,
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
- * Corporation, none of whom are responsible for the results. The author
- * thanks all of them.
- *
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
* Redistribution and use in source and binary forms -- with or without
* modification -- are permitted for any purpose, provided that
* redistributions in source form retain this entire copyright notice and
* indicate the origin and nature of any modifications.
- *
- * I'd appreciate being given credit for this package in the documentation
- * of software which uses it, but that is not a requirement.
- *
+ *
+ * I'd appreciate being given credit for this package in the documentation of
+ * software which uses it, but that is not a requirement.
+ *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
- * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
@@ -56,8 +53,8 @@
*
* *** NOTE: this code has been altered slightly for use in Tcl: ***
* *** 1. Names have been changed, e.g. from re_comp to ***
- * *** TclRegComp, to avoid clashes with other ***
- * *** regexp implementations used by applications. ***
+ * *** TclRegComp, to avoid clashes with other ***
+ * *** regexp implementations used by applications. ***
*/
/*
@@ -69,15 +66,14 @@
typedef struct ThreadSpecificData {
int initialized; /* Set to 1 when the module is initialized. */
- char *patterns[NUM_REGEXPS];/* Strings corresponding to compiled
- * regular expression patterns. NULL
- * means that this slot isn't used.
- * Malloc-ed. */
+ char *patterns[NUM_REGEXPS];/* Strings corresponding to compiled regular
+ * expression patterns. NULL means that this
+ * slot isn't used. Malloc-ed. */
int patLengths[NUM_REGEXPS];/* Number of non-null characters in
- * corresponding entry in patterns.
- * -1 means entry isn't used. */
+ * corresponding entry in patterns. -1 means
+ * entry isn't used. */
struct TclRegexp *regexps[NUM_REGEXPS];
- /* Compiled forms of above strings. Also
+ /* Compiled forms of above strings. Also
* malloc-ed, or NULL if not in use yet. */
} ThreadSpecificData;
@@ -87,46 +83,46 @@ static Tcl_ThreadDataKey dataKey;
* Declarations for functions used only in this file.
*/
-static TclRegexp * CompileRegexp _ANSI_ARGS_((Tcl_Interp *interp,
- char *pattern, int length, int flags));
-static void DupRegexpInternalRep _ANSI_ARGS_((Tcl_Obj *srcPtr,
- Tcl_Obj *copyPtr));
-static void FinalizeRegexp _ANSI_ARGS_((ClientData clientData));
-static void FreeRegexp _ANSI_ARGS_((TclRegexp *regexpPtr));
-static void FreeRegexpInternalRep _ANSI_ARGS_((Tcl_Obj *objPtr));
-static int SetRegexpFromAny _ANSI_ARGS_((Tcl_Interp *interp,
- Tcl_Obj *objPtr));
+static TclRegexp * CompileRegexp(Tcl_Interp *interp, const char *pattern,
+ int length, int flags);
+static void DupRegexpInternalRep(Tcl_Obj *srcPtr,
+ Tcl_Obj *copyPtr);
+static void FinalizeRegexp(ClientData clientData);
+static void FreeRegexp(TclRegexp *regexpPtr);
+static void FreeRegexpInternalRep(Tcl_Obj *objPtr);
+static int RegExpExecUniChar(Tcl_Interp *interp, Tcl_RegExp re,
+ const Tcl_UniChar *uniString, int numChars,
+ int nmatches, int flags);
+static int SetRegexpFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr);
/*
- * The regular expression Tcl object type. This serves as a cache
- * of the compiled form of the regular expression.
+ * The regular expression Tcl object type. This serves as a cache of the
+ * compiled form of the regular expression.
*/
-Tcl_ObjType tclRegexpType = {
+const Tcl_ObjType tclRegexpType = {
"regexp", /* name */
FreeRegexpInternalRep, /* freeIntRepProc */
DupRegexpInternalRep, /* dupIntRepProc */
NULL, /* updateStringProc */
SetRegexpFromAny /* setFromAnyProc */
};
-
/*
*----------------------------------------------------------------------
*
* Tcl_RegExpCompile --
*
- * Compile a regular expression into a form suitable for fast
- * matching. This procedure is DEPRECATED in favor of the
- * object version of the command.
+ * Compile a regular expression into a form suitable for fast matching.
+ * This function is DEPRECATED in favor of the object version of the
+ * command.
*
* Results:
- * The return value is a pointer to the compiled form of string,
- * suitable for passing to Tcl_RegExpExec. This compiled form
- * is only valid up until the next call to this procedure, so
- * don't keep these around for a long time! If an error occurred
- * while compiling the pattern, then NULL is returned and an error
- * message is left in the interp's result.
+ * The return value is a pointer to the compiled form of string, suitable
+ * for passing to Tcl_RegExpExec. This compiled form is only valid up
+ * until the next call to this function, so don't keep these around for a
+ * long time! If an error occurred while compiling the pattern, then NULL
+ * is returned and an error message is left in the interp's result.
*
* Side effects:
* Updates the cache of compiled regexps.
@@ -135,13 +131,13 @@ Tcl_ObjType tclRegexpType = {
*/
Tcl_RegExp
-Tcl_RegExpCompile(interp, string)
- Tcl_Interp *interp; /* For use in error reporting and
- * to access the interp regexp cache. */
- char *string; /* String for which to produce
- * compiled regular expression. */
+Tcl_RegExpCompile(
+ Tcl_Interp *interp, /* For use in error reporting and to access
+ * the interp regexp cache. */
+ const char *pattern) /* String for which to produce compiled
+ * regular expression. */
{
- return (Tcl_RegExp) CompileRegexp(interp, string, (int) strlen(string),
+ return (Tcl_RegExp) CompileRegexp(interp, pattern, (int) strlen(pattern),
REG_ADVANCED);
}
@@ -150,15 +146,14 @@ Tcl_RegExpCompile(interp, string)
*
* Tcl_RegExpExec --
*
- * Execute the regular expression matcher using a compiled form
- * of a regular expression and save information about any match
- * that is found.
+ * Execute the regular expression matcher using a compiled form of a
+ * regular expression and save information about any match that is found.
*
* Results:
- * If an error occurs during the matching operation then -1
- * is returned and the interp's result contains an error message.
- * Otherwise the return value is 1 if a matching range is
- * found and 0 if there is no matching range.
+ * If an error occurs during the matching operation then -1 is returned
+ * and the interp's result contains an error message. Otherwise the
+ * return value is 1 if a matching range is found and 0 if there is no
+ * matching range.
*
* Side effects:
* None.
@@ -167,41 +162,49 @@ Tcl_RegExpCompile(interp, string)
*/
int
-Tcl_RegExpExec(interp, re, string, start)
- Tcl_Interp *interp; /* Interpreter to use for error reporting. */
- Tcl_RegExp re; /* Compiled regular expression; must have
- * been returned by previous call to
+Tcl_RegExpExec(
+ Tcl_Interp *interp, /* Interpreter to use for error reporting. */
+ Tcl_RegExp re, /* Compiled regular expression; must have been
+ * returned by previous call to
* Tcl_GetRegExpFromObj. */
- CONST char *string; /* String against which to match re. */
- CONST char *start; /* If string is part of a larger string,
- * this identifies beginning of larger
- * string, so that "^" won't match. */
+ const char *text, /* Text against which to match re. */
+ const char *start) /* If text is part of a larger string, this
+ * identifies beginning of larger string, so
+ * that "^" won't match. */
{
- int result, numChars;
- Tcl_DString stringBuffer;
- Tcl_UniChar *uniString;
-
- TclRegexp *regexpPtr = (TclRegexp *) re;
+ int flags, result, numChars;
+ TclRegexp *regexp = (TclRegexp *) re;
+ Tcl_DString ds;
+ const Tcl_UniChar *ustr;
/*
- * Remember the UTF-8 string so Tcl_RegExpRange() can convert the
- * matches from character to byte offsets.
+ * If the starting point is offset from the beginning of the buffer, then
+ * we need to tell the regexp engine not to match "^".
*/
- regexpPtr->string = string;
-
- Tcl_DStringInit(&stringBuffer);
- uniString = Tcl_UtfToUniCharDString(string, -1, &stringBuffer);
- numChars = Tcl_DStringLength(&stringBuffer) / sizeof(Tcl_UniChar);
+ if (text > start) {
+ flags = REG_NOTBOL;
+ } else {
+ flags = 0;
+ }
/*
- * Perform the regexp match.
+ * Remember the string for use by Tcl_RegExpRange().
*/
- result = TclRegExpExecUniChar(interp, re, uniString, numChars, -1,
- ((string > start) ? REG_NOTBOL : 0));
+ regexp->string = text;
+ regexp->objPtr = NULL;
- Tcl_DStringFree(&stringBuffer);
+ /*
+ * Convert the string to Unicode and perform the match.
+ */
+
+ Tcl_DStringInit(&ds);
+ ustr = Tcl_UtfToUniCharDString(text, -1, &ds);
+ numChars = Tcl_DStringLength(&ds) / sizeof(Tcl_UniChar);
+ result = RegExpExecUniChar(interp, re, ustr, numChars, -1 /* nmatches */,
+ flags);
+ Tcl_DStringFree(&ds);
return result;
}
@@ -216,7 +219,7 @@ Tcl_RegExpExec(interp, re, string, start)
*
* Results:
* The variables at *startPtr and *endPtr are modified to hold the
- * addresses of the endpoints of the range given by index. If the
+ * addresses of the endpoints of the range given by index. If the
* specified range doesn't exist then NULLs are returned.
*
* Side effects:
@@ -226,45 +229,48 @@ Tcl_RegExpExec(interp, re, string, start)
*/
void
-Tcl_RegExpRange(re, index, startPtr, endPtr)
- Tcl_RegExp re; /* Compiled regular expression that has
- * been passed to Tcl_RegExpExec. */
- int index; /* 0 means give the range of the entire
- * match, > 0 means give the range of
- * a matching subrange. */
- char **startPtr; /* Store address of first character in
- * (sub-) range here. */
- char **endPtr; /* Store address of character just after last
- * in (sub-) range here. */
+Tcl_RegExpRange(
+ Tcl_RegExp re, /* Compiled regular expression that has been
+ * passed to Tcl_RegExpExec. */
+ int index, /* 0 means give the range of the entire match,
+ * > 0 means give the range of a matching
+ * subrange. */
+ const char **startPtr, /* Store address of first character in
+ * (sub-)range here. */
+ const char **endPtr) /* Store address of character just after last
+ * in (sub-)range here. */
{
TclRegexp *regexpPtr = (TclRegexp *) re;
+ const char *string;
if ((size_t) index > regexpPtr->re.re_nsub) {
*startPtr = *endPtr = NULL;
} else if (regexpPtr->matches[index].rm_so < 0) {
*startPtr = *endPtr = NULL;
} else {
- *startPtr = Tcl_UtfAtIndex(regexpPtr->string,
- regexpPtr->matches[index].rm_so);
- *endPtr = Tcl_UtfAtIndex(regexpPtr->string,
- regexpPtr->matches[index].rm_eo);
+ if (regexpPtr->objPtr) {
+ string = TclGetString(regexpPtr->objPtr);
+ } else {
+ string = regexpPtr->string;
+ }
+ *startPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_so);
+ *endPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_eo);
}
}
/*
*---------------------------------------------------------------------------
*
- * TclRegExpExecUniChar --
+ * RegExpExecUniChar --
*
* Execute the regular expression matcher using a compiled form of a
- * regular expression and save information about any match that is
- * found.
+ * regular expression and save information about any match that is found.
*
* Results:
- * If an error occurs during the matching operation then -1 is
- * returned and an error message is left in interp's result.
- * Otherwise the return value is 1 if a matching range was found or
- * 0 if there was no matching range.
+ * If an error occurs during the matching operation then -1 is returned
+ * and an error message is left in interp's result. Otherwise the return
+ * value is 1 if a matching range was found or 0 if there was no matching
+ * range.
*
* Side effects:
* None.
@@ -272,26 +278,27 @@ Tcl_RegExpRange(re, index, startPtr, endPtr)
*----------------------------------------------------------------------
*/
-int
-TclRegExpExecUniChar(interp, re, wString, numChars, nmatches, flags)
- Tcl_Interp *interp; /* Interpreter to use for error reporting. */
- Tcl_RegExp re; /* Compiled regular expression; returned by
- * a previous call to Tcl_GetRegExpFromObj */
- CONST Tcl_UniChar *wString; /* String against which to match re. */
- int numChars; /* Length of Tcl_UniChar string (must
- * be >= 0). */
- int nmatches; /* How many subexpression matches (counting
- * the whole match as subexpression 0) are
- * of interest. -1 means "don't know". */
- int flags; /* Regular expression flags. */
+static int
+RegExpExecUniChar(
+ Tcl_Interp *interp, /* Interpreter to use for error reporting. */
+ Tcl_RegExp re, /* Compiled regular expression; returned by a
+ * previous call to Tcl_GetRegExpFromObj */
+ const Tcl_UniChar *wString, /* String against which to match re. */
+ int numChars, /* Length of Tcl_UniChar string (must be
+ * >=0). */
+ int nmatches, /* How many subexpression matches (counting
+ * the whole match as subexpression 0) are of
+ * interest. -1 means "don't know". */
+ int flags) /* Regular expression flags. */
{
int status;
TclRegexp *regexpPtr = (TclRegexp *) re;
size_t last = regexpPtr->re.re_nsub + 1;
size_t nm = last;
- if (nmatches >= 0 && (size_t) nmatches < nm)
+ if (nmatches >= 0 && (size_t) nmatches < nm) {
nm = (size_t) nmatches;
+ }
status = TclReExec(&regexpPtr->re, wString, (size_t) numChars,
&regexpPtr->details, nm, regexpPtr->matches, flags);
@@ -324,8 +331,8 @@ TclRegExpExecUniChar(interp, re, wString, numChars, nmatches, flags)
*
* Results:
* The variables at *startPtr and *endPtr are modified to hold the
- * offsets of the endpoints of the range given by index. If the
- * specified range doesn't exist then -1s are supplied.
+ * offsets of the endpoints of the range given by index. If the specified
+ * range doesn't exist then -1s are supplied.
*
* Side effects:
* None.
@@ -334,17 +341,17 @@ TclRegExpExecUniChar(interp, re, wString, numChars, nmatches, flags)
*/
void
-TclRegExpRangeUniChar(re, index, startPtr, endPtr)
- Tcl_RegExp re; /* Compiled regular expression that has
- * been passed to Tcl_RegExpExec. */
- int index; /* 0 means give the range of the entire
- * match, > 0 means give the range of
- * a matching subrange, -1 means the
- * range of the rm_extend field. */
- int *startPtr; /* Store address of first character in
- * (sub-) range here. */
- int *endPtr; /* Store address of character just after last
- * in (sub-) range here. */
+TclRegExpRangeUniChar(
+ Tcl_RegExp re, /* Compiled regular expression that has been
+ * passed to Tcl_RegExpExec. */
+ int index, /* 0 means give the range of the entire match,
+ * > 0 means give the range of a matching
+ * subrange, -1 means the range of the
+ * rm_extend field. */
+ int *startPtr, /* Store address of first character in
+ * (sub-)range here. */
+ int *endPtr) /* Store address of character just after last
+ * in (sub-)range here. */
{
TclRegexp *regexpPtr = (TclRegexp *) re;
@@ -368,10 +375,9 @@ TclRegExpRangeUniChar(re, index, startPtr, endPtr)
* See if a string matches a regular expression.
*
* Results:
- * If an error occurs during the matching operation then -1
- * is returned and the interp's result contains an error message.
- * Otherwise the return value is 1 if "string" matches "pattern"
- * and 0 otherwise.
+ * If an error occurs during the matching operation then -1 is returned
+ * and the interp's result contains an error message. Otherwise the
+ * return value is 1 if "text" matches "pattern" and 0 otherwise.
*
* Side effects:
* None.
@@ -380,54 +386,157 @@ TclRegExpRangeUniChar(re, index, startPtr, endPtr)
*/
int
-Tcl_RegExpMatch(interp, string, pattern)
- Tcl_Interp *interp; /* Used for error reporting. */
- char *string; /* String. */
- char *pattern; /* Regular expression to match against
- * string. */
+Tcl_RegExpMatch(
+ Tcl_Interp *interp, /* Used for error reporting. May be NULL. */
+ const char *text, /* Text to search for pattern matches. */
+ const char *pattern) /* Regular expression to match against text. */
{
- Tcl_RegExp re;
+ Tcl_RegExp re = Tcl_RegExpCompile(interp, pattern);
- re = Tcl_RegExpCompile(interp, pattern);
if (re == NULL) {
return -1;
}
- return Tcl_RegExpExec(interp, re, string, string);
+ return Tcl_RegExpExec(interp, re, text, text);
}
/*
*----------------------------------------------------------------------
*
- * TclRegExpMatchObj --
+ * Tcl_RegExpExecObj --
*
- * See if a string matches a regular expression pattern object.
+ * Execute a precompiled regexp against the given object.
*
* Results:
- * If an error occurs during the matching operation then -1
- * is returned and the interp's result contains an error message.
- * Otherwise the return value is 1 if "string" matches "pattern"
- * and 0 otherwise.
+ * If an error occurs during the matching operation then -1 is returned
+ * and the interp's result contains an error message. Otherwise the
+ * return value is 1 if "string" matches "pattern" and 0 otherwise.
*
* Side effects:
- * None.
+ * Converts the object to a Unicode object.
*
*----------------------------------------------------------------------
*/
int
-TclRegExpMatchObj(interp, string, patObj)
- Tcl_Interp *interp; /* Used for error reporting. */
- char *string; /* String. */
- Tcl_Obj *patObj; /* Regular expression to match against
+Tcl_RegExpExecObj(
+ Tcl_Interp *interp, /* Interpreter to use for error reporting. */
+ Tcl_RegExp re, /* Compiled regular expression; must have been
+ * returned by previous call to
+ * Tcl_GetRegExpFromObj. */
+ Tcl_Obj *textObj, /* Text against which to match re. */
+ int offset, /* Character index that marks where matching
+ * should begin. */
+ int nmatches, /* How many subexpression matches (counting
+ * the whole match as subexpression 0) are of
+ * interest. -1 means all of them. */
+ int flags) /* Regular expression execution flags. */
+{
+ TclRegexp *regexpPtr = (TclRegexp *) re;
+ Tcl_UniChar *udata;
+ int length;
+ int reflags = regexpPtr->flags;
+#define TCL_REG_GLOBOK_FLAGS \
+ (TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE)
+
+ /*
+ * Take advantage of the equivalent glob pattern, if one exists.
+ * This is possible based only on the right mix of incoming flags (0)
+ * and regexp compile flags.
+ */
+ if ((offset == 0) && (nmatches == 0) && (flags == 0)
+ && !(reflags & ~TCL_REG_GLOBOK_FLAGS)
+ && (regexpPtr->globObjPtr != NULL)) {
+ int nocase = (reflags & TCL_REG_NOCASE) ? TCL_MATCH_NOCASE : 0;
+
+ /*
+ * Pass to TclStringMatchObj for obj-specific handling.
+ * XXX: Currently doesn't take advantage of exact-ness that
+ * XXX: TclReToGlob tells us about
+ */
+
+ return TclStringMatchObj(textObj, regexpPtr->globObjPtr, nocase);
+ }
+
+ /*
+ * Save the target object so we can extract strings from it later.
+ */
+
+ regexpPtr->string = NULL;
+ regexpPtr->objPtr = textObj;
+
+ udata = Tcl_GetUnicodeFromObj(textObj, &length);
+
+ if (offset > length) {
+ offset = length;
+ }
+ udata += offset;
+ length -= offset;
+
+ return RegExpExecUniChar(interp, re, udata, length, nmatches, flags);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_RegExpMatchObj --
+ *
+ * See if an object matches a regular expression.
+ *
+ * Results:
+ * If an error occurs during the matching operation then -1 is returned
+ * and the interp's result contains an error message. Otherwise the
+ * return value is 1 if "text" matches "pattern" and 0 otherwise.
+ *
+ * Side effects:
+ * Changes the internal rep of the pattern and string objects.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_RegExpMatchObj(
+ Tcl_Interp *interp, /* Used for error reporting. May be NULL. */
+ Tcl_Obj *textObj, /* Object containing the String to search. */
+ Tcl_Obj *patternObj) /* Regular expression to match against
* string. */
{
Tcl_RegExp re;
- re = Tcl_GetRegExpFromObj(interp, patObj, REG_ADVANCED);
+ re = Tcl_GetRegExpFromObj(interp, patternObj,
+ TCL_REG_ADVANCED | TCL_REG_NOSUB);
if (re == NULL) {
return -1;
}
- return Tcl_RegExpExec(interp, re, string, string);
+ return Tcl_RegExpExecObj(interp, re, textObj, 0 /* offset */,
+ 0 /* nmatches */, 0 /* flags */);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_RegExpGetInfo --
+ *
+ * Retrieve information about the current match.
+ *
+ * Results:
+ * None.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+void
+Tcl_RegExpGetInfo(
+ Tcl_RegExp regexp, /* Pattern from which to get subexpressions. */
+ Tcl_RegExpInfo *infoPtr) /* Match information is stored here. */
+{
+ TclRegexp *regexpPtr = (TclRegexp *) regexp;
+
+ infoPtr->nsubs = regexpPtr->re.re_nsub;
+ infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches;
+ infoPtr->extendStart = regexpPtr->details.rm_extend.rm_so;
}
/*
@@ -435,14 +544,14 @@ TclRegExpMatchObj(interp, string, patObj)
*
* Tcl_GetRegExpFromObj --
*
- * Compile a regular expression into a form suitable for fast
- * matching. This procedure caches the result in a Tcl_Obj.
+ * Compile a regular expression into a form suitable for fast matching.
+ * This function caches the result in a Tcl_Obj.
*
* Results:
- * The return value is a pointer to the compiled form of string,
- * suitable for passing to Tcl_RegExpExec. If an error occurred
- * while compiling the pattern, then NULL is returned and an error
- * message is left in the interp's result.
+ * The return value is a pointer to the compiled form of string, suitable
+ * for passing to Tcl_RegExpExec. If an error occurred while compiling
+ * the pattern, then NULL is returned and an error message is left in the
+ * interp's result.
*
* Side effects:
* Updates the native rep of the Tcl_Obj.
@@ -451,25 +560,28 @@ TclRegExpMatchObj(interp, string, patObj)
*/
Tcl_RegExp
-Tcl_GetRegExpFromObj(interp, objPtr, flags)
- Tcl_Interp *interp; /* For use in error reporting, and to access
+Tcl_GetRegExpFromObj(
+ Tcl_Interp *interp, /* For use in error reporting, and to access
* the interp regexp cache. */
- Tcl_Obj *objPtr; /* Object whose string rep contains regular
- * expression pattern. Internal rep will be
+ Tcl_Obj *objPtr, /* Object whose string rep contains regular
+ * expression pattern. Internal rep will be
* changed to compiled form of this regular
* expression. */
- int flags; /* Regular expression compilation flags. */
+ int flags) /* Regular expression compilation flags. */
{
int length;
- Tcl_ObjType *typePtr;
TclRegexp *regexpPtr;
- char *pattern;
+ const char *pattern;
+
+ /*
+ * This is OK because we only actually interpret this value properly as a
+ * TclRegexp* when the type is tclRegexpType.
+ */
- typePtr = objPtr->typePtr;
- regexpPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr;
+ regexpPtr = objPtr->internalRep.twoPtrValue.ptr1;
- if ((typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) {
- pattern = Tcl_GetStringFromObj(objPtr, &length);
+ if ((objPtr->typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) {
+ pattern = TclGetStringFromObj(objPtr, &length);
regexpPtr = CompileRegexp(interp, pattern, length, flags);
if (regexpPtr == NULL) {
@@ -478,7 +590,7 @@ Tcl_GetRegExpFromObj(interp, objPtr, flags)
/*
* Add a reference to the regexp so it will persist even if it is
- * pushed out of the current thread's regexp cache. This reference
+ * pushed out of the current thread's regexp cache. This reference
* will be removed when the object's internal rep is freed.
*/
@@ -488,10 +600,8 @@ Tcl_GetRegExpFromObj(interp, objPtr, flags)
* Free the old representation and set our type.
*/
- if ((typePtr != NULL) && (typePtr->freeIntRepProc != NULL)) {
- (*typePtr->freeIntRepProc)(objPtr);
- }
- objPtr->internalRep.otherValuePtr = (VOID *) regexpPtr;
+ TclFreeIntRep(objPtr);
+ objPtr->internalRep.twoPtrValue.ptr1 = regexpPtr;
objPtr->typePtr = &tclRegexpType;
}
return (Tcl_RegExp) regexpPtr;
@@ -505,10 +615,10 @@ Tcl_GetRegExpFromObj(interp, objPtr, flags)
* Return information about a compiled regular expression.
*
* Results:
- * The return value is -1 for failure, 0 for success, although at
- * the moment there's nothing that could fail. On success, a list
- * is left in the interp's result: first element is the subexpression
- * count, second is a list of re_info bit names.
+ * The return value is -1 for failure, 0 for success, although at the
+ * moment there's nothing that could fail. On success, a list is left in
+ * the interp's result: first element is the subexpression count, second
+ * is a list of re_info bit names.
*
* Side effects:
* None.
@@ -517,16 +627,16 @@ Tcl_GetRegExpFromObj(interp, objPtr, flags)
*/
int
-TclRegAbout(interp, re)
- Tcl_Interp *interp; /* For use in variable assignment. */
- Tcl_RegExp re; /* The compiled regular expression. */
+TclRegAbout(
+ Tcl_Interp *interp, /* For use in variable assignment. */
+ Tcl_RegExp re) /* The compiled regular expression. */
{
- TclRegexp *regexpPtr = (TclRegexp *)re;
- char buf[TCL_INTEGER_SPACE];
- static struct infoname {
+ TclRegexp *regexpPtr = (TclRegexp *) re;
+ struct infoname {
int bit;
- char *text;
- } infonames[] = {
+ const char *text;
+ };
+ static const struct infoname infonames[] = {
{REG_UBACKREF, "REG_UBACKREF"},
{REG_ULOOKAHEAD, "REG_ULOOKAHEAD"},
{REG_UBOUNDS, "REG_UBOUNDS"},
@@ -540,37 +650,43 @@ TclRegAbout(interp, re)
{REG_ULOCALE, "REG_ULOCALE"},
{REG_UEMPTYMATCH, "REG_UEMPTYMATCH"},
{REG_UIMPOSSIBLE, "REG_UIMPOSSIBLE"},
- {0, ""}
+ {REG_USHORTEST, "REG_USHORTEST"},
+ {0, NULL}
};
- struct infoname *inf;
- int n;
+ const struct infoname *inf;
+ Tcl_Obj *infoObj, *resultObj;
+
+ /*
+ * The reset here guarantees that the interpreter result is empty and
+ * unshared. This means that we can use Tcl_ListObjAppendElement on the
+ * result object quite safely.
+ */
Tcl_ResetResult(interp);
- sprintf(buf, "%u", (unsigned)(regexpPtr->re.re_nsub));
- Tcl_AppendElement(interp, buf);
+ /*
+ * Assume that there will never be more than INT_MAX subexpressions. This
+ * is a pretty reasonable assumption; the RE engine doesn't scale _that_
+ * well and Tcl has other limits that constrain things as well...
+ */
+
+ resultObj = Tcl_NewObj();
+ Tcl_ListObjAppendElement(NULL, resultObj,
+ Tcl_NewIntObj((int) regexpPtr->re.re_nsub));
/*
- * Must count bits before generating list, because we must know
- * whether {} are needed before we start appending names.
+ * Now append a list of all the bit-flags set for the RE.
*/
- n = 0;
- for (inf = infonames; inf->bit != 0; inf++) {
- if (regexpPtr->re.re_info&inf->bit) {
- n++;
- }
- }
- if (n != 1) {
- Tcl_AppendResult(interp, " {", NULL);
- }
- for (inf = infonames; inf->bit != 0; inf++) {
- if (regexpPtr->re.re_info&inf->bit) {
- Tcl_AppendElement(interp, inf->text);
+
+ TclNewObj(infoObj);
+ for (inf=infonames ; inf->bit != 0 ; inf++) {
+ if (regexpPtr->re.re_info & inf->bit) {
+ Tcl_ListObjAppendElement(NULL, infoObj,
+ Tcl_NewStringObj(inf->text, -1));
}
}
- if (n != 1) {
- Tcl_AppendResult(interp, "}", NULL);
- }
+ Tcl_ListObjAppendElement(NULL, resultObj, infoObj);
+ Tcl_SetObjResult(interp, resultObj);
return 0;
}
@@ -592,26 +708,25 @@ TclRegAbout(interp, re)
*/
void
-TclRegError(interp, msg, status)
- Tcl_Interp *interp; /* Interpreter for error reporting. */
- char *msg; /* Message to prepend to error. */
- int status; /* Status code to report. */
+TclRegError(
+ Tcl_Interp *interp, /* Interpreter for error reporting. */
+ const char *msg, /* Message to prepend to error. */
+ int status) /* Status code to report. */
{
char buf[100]; /* ample in practice */
- char cbuf[100]; /* lots in practice */
+ char cbuf[TCL_INTEGER_SPACE];
size_t n;
- char *p;
+ const char *p;
Tcl_ResetResult(interp);
- n = TclReError(status, (regex_t *)NULL, buf, sizeof(buf));
+ n = TclReError(status, NULL, buf, sizeof(buf));
p = (n > sizeof(buf)) ? "..." : "";
- Tcl_AppendResult(interp, msg, buf, p, NULL);
+ Tcl_SetObjResult(interp, Tcl_ObjPrintf("%s%s%s", msg, buf, p));
sprintf(cbuf, "%d", status);
- (VOID) TclReError(REG_ITOA, (regex_t *)NULL, cbuf, sizeof(cbuf));
+ (void) TclReError(REG_ITOA, NULL, cbuf, sizeof(cbuf));
Tcl_SetErrorCode(interp, "REGEXP", cbuf, buf, NULL);
}
-
/*
*----------------------------------------------------------------------
@@ -631,10 +746,10 @@ TclRegError(interp, msg, status)
*/
static void
-FreeRegexpInternalRep(objPtr)
- Tcl_Obj *objPtr; /* Regexp object with internal rep to free. */
+FreeRegexpInternalRep(
+ Tcl_Obj *objPtr) /* Regexp object with internal rep to free. */
{
- TclRegexp *regexpRepPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr;
+ TclRegexp *regexpRepPtr = objPtr->internalRep.twoPtrValue.ptr1;
/*
* If this is the last reference to the regexp, free it.
@@ -643,6 +758,7 @@ FreeRegexpInternalRep(objPtr)
if (--(regexpRepPtr->refCount) <= 0) {
FreeRegexp(regexpRepPtr);
}
+ objPtr->typePtr = NULL;
}
/*
@@ -650,8 +766,8 @@ FreeRegexpInternalRep(objPtr)
*
* DupRegexpInternalRep --
*
- * We copy the reference to the compiled regexp and bump its
- * reference count.
+ * We copy the reference to the compiled regexp and bump its reference
+ * count.
*
* Results:
* None.
@@ -663,13 +779,14 @@ FreeRegexpInternalRep(objPtr)
*/
static void
-DupRegexpInternalRep(srcPtr, copyPtr)
- Tcl_Obj *srcPtr; /* Object with internal rep to copy. */
- Tcl_Obj *copyPtr; /* Object with internal rep to set. */
+DupRegexpInternalRep(
+ Tcl_Obj *srcPtr, /* Object with internal rep to copy. */
+ Tcl_Obj *copyPtr) /* Object with internal rep to set. */
{
- TclRegexp *regexpPtr = (TclRegexp *) srcPtr->internalRep.otherValuePtr;
+ TclRegexp *regexpPtr = srcPtr->internalRep.twoPtrValue.ptr1;
+
regexpPtr->refCount++;
- copyPtr->internalRep.otherValuePtr = srcPtr->internalRep.otherValuePtr;
+ copyPtr->internalRep.twoPtrValue.ptr1 = srcPtr->internalRep.twoPtrValue.ptr1;
copyPtr->typePtr = &tclRegexpType;
}
@@ -694,9 +811,9 @@ DupRegexpInternalRep(srcPtr, copyPtr)
*/
static int
-SetRegexpFromAny(interp, objPtr)
- Tcl_Interp *interp; /* Used for error reporting if not NULL. */
- Tcl_Obj *objPtr; /* The object to convert. */
+SetRegexpFromAny(
+ Tcl_Interp *interp, /* Used for error reporting if not NULL. */
+ Tcl_Obj *objPtr) /* The object to convert. */
{
if (Tcl_GetRegExpFromObj(interp, objPtr, REG_ADVANCED) == NULL) {
return TCL_ERROR;
@@ -709,37 +826,36 @@ SetRegexpFromAny(interp, objPtr)
*
* CompileRegexp --
*
- * Attempt to compile the given regexp pattern. If the compiled
- * regular expression can be found in the per-thread cache, it
- * will be used instead of compiling a new copy.
+ * Attempt to compile the given regexp pattern. If the compiled regular
+ * expression can be found in the per-thread cache, it will be used
+ * instead of compiling a new copy.
*
* Results:
- * The return value is a pointer to a newly allocated TclRegexp
- * that represents the compiled pattern, or NULL if the pattern
- * could not be compiled. If NULL is returned, an error message is
- * left in the interp's result.
+ * The return value is a pointer to a newly allocated TclRegexp that
+ * represents the compiled pattern, or NULL if the pattern could not be
+ * compiled. If NULL is returned, an error message is left in the
+ * interp's result.
*
* Side effects:
- * The thread-local regexp cache is updated and a new TclRegexp may
- * be allocated.
+ * The thread-local regexp cache is updated and a new TclRegexp may be
+ * allocated.
*
*----------------------------------------------------------------------
*/
static TclRegexp *
-CompileRegexp(interp, string, length, flags)
- Tcl_Interp *interp; /* Used for error reporting if not NULL. */
- char *string; /* The regexp to compile (UTF-8). */
- int length; /* The length of the string in bytes. */
- int flags; /* Compilation flags. */
+CompileRegexp(
+ Tcl_Interp *interp, /* Used for error reporting if not NULL. */
+ const char *string, /* The regexp to compile (UTF-8). */
+ int length, /* The length of the string in bytes. */
+ int flags) /* Compilation flags. */
{
TclRegexp *regexpPtr;
- Tcl_UniChar *uniString;
- int numChars;
+ const Tcl_UniChar *uniString;
+ int numChars, status, i, exact;
Tcl_DString stringBuf;
- int status, i;
ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey);
-
+
if (!tsdPtr->initialized) {
tsdPtr->initialized = 1;
Tcl_CreateThreadExitHandler(FinalizeRegexp, NULL);
@@ -747,14 +863,14 @@ CompileRegexp(interp, string, length, flags)
/*
* This routine maintains a second-level regular expression cache in
- * addition to the per-object regexp cache. The per-thread cache is needed
+ * addition to the per-object regexp cache. The per-thread cache is needed
* to handle the case where for various reasons the object is lost between
* invocations of the regexp command, but the literal pattern is the same.
*/
/*
- * Check the per-thread compiled regexp cache. We can only reuse
- * a regexp if it has the same pattern and the same flags.
+ * Check the per-thread compiled regexp cache. We can only reuse a regexp
+ * if it has the same pattern and the same flags.
*/
for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) {
@@ -762,8 +878,8 @@ CompileRegexp(interp, string, length, flags)
&& (tsdPtr->regexps[i]->flags == flags)
&& (strcmp(string, tsdPtr->patterns[i]) == 0)) {
/*
- * Move the matched pattern to the first slot in the
- * cache and shift the other patterns down one position.
+ * Move the matched pattern to the first slot in the cache and
+ * shift the other patterns down one position.
*/
if (i != 0) {
@@ -788,8 +904,12 @@ CompileRegexp(interp, string, length, flags)
/*
* This is a new expression, so compile it and add it to the cache.
*/
-
- regexpPtr = (TclRegexp *) ckalloc(sizeof(TclRegexp));
+
+ regexpPtr = ckalloc(sizeof(TclRegexp));
+ regexpPtr->objPtr = NULL;
+ regexpPtr->string = NULL;
+ regexpPtr->details.rm_extend.rm_so = -1;
+ regexpPtr->details.rm_extend.rm_eo = -1;
/*
* Get the up-to-date string representation and map to unicode.
@@ -812,22 +932,34 @@ CompileRegexp(interp, string, length, flags)
* Clean up and report errors in the interpreter, if possible.
*/
- ckfree((char *)regexpPtr);
+ ckfree(regexpPtr);
if (interp) {
TclRegError(interp,
- "couldn't compile regular expression pattern: ",
- status);
+ "couldn't compile regular expression pattern: ", status);
}
return NULL;
}
/*
- * Allocate enough space for all of the subexpressions, plus one
- * extra for the entire pattern.
+ * Convert RE to a glob pattern equivalent, if any, and cache it. If this
+ * is not possible, then globObjPtr will be NULL. This is used by
+ * Tcl_RegExpExecObj to optionally do a fast match (avoids RE engine).
+ */
+
+ if (TclReToGlob(NULL, string, length, &stringBuf, &exact) == TCL_OK) {
+ regexpPtr->globObjPtr = TclDStringToObj(&stringBuf);
+ Tcl_IncrRefCount(regexpPtr->globObjPtr);
+ } else {
+ regexpPtr->globObjPtr = NULL;
+ }
+
+ /*
+ * Allocate enough space for all of the subexpressions, plus one extra for
+ * the entire pattern.
*/
- regexpPtr->matches = (regmatch_t *) ckalloc(
- sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1));
+ regexpPtr->matches =
+ ckalloc(sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1));
/*
* Initialize the refcount to one initially, since it is in the cache.
@@ -842,6 +974,7 @@ CompileRegexp(interp, string, length, flags)
if (tsdPtr->patterns[NUM_REGEXPS-1] != NULL) {
TclRegexp *oldRegexpPtr = tsdPtr->regexps[NUM_REGEXPS-1];
+
if (--(oldRegexpPtr->refCount) <= 0) {
FreeRegexp(oldRegexpPtr);
}
@@ -852,8 +985,8 @@ CompileRegexp(interp, string, length, flags)
tsdPtr->patLengths[i+1] = tsdPtr->patLengths[i];
tsdPtr->regexps[i+1] = tsdPtr->regexps[i];
}
- tsdPtr->patterns[0] = (char *) ckalloc((unsigned) (length+1));
- strcpy(tsdPtr->patterns[0], string);
+ tsdPtr->patterns[0] = ckalloc(length + 1);
+ memcpy(tsdPtr->patterns[0], string, (unsigned) length + 1);
tsdPtr->patLengths[0] = length;
tsdPtr->regexps[0] = regexpPtr;
@@ -877,14 +1010,17 @@ CompileRegexp(interp, string, length, flags)
*/
static void
-FreeRegexp(regexpPtr)
- TclRegexp *regexpPtr; /* Compiled regular expression to free. */
+FreeRegexp(
+ TclRegexp *regexpPtr) /* Compiled regular expression to free. */
{
TclReFree(&regexpPtr->re);
+ if (regexpPtr->globObjPtr) {
+ TclDecrRefCount(regexpPtr->globObjPtr);
+ }
if (regexpPtr->matches) {
- ckfree((char *) regexpPtr->matches);
+ ckfree(regexpPtr->matches);
}
- ckfree((char *) regexpPtr);
+ ckfree(regexpPtr);
}
/*
@@ -892,8 +1028,7 @@ FreeRegexp(regexpPtr)
*
* FinalizeRegexp --
*
- * Release the storage associated with the per-thread regexp
- * cache.
+ * Release the storage associated with the per-thread regexp cache.
*
* Results:
* None.
@@ -905,8 +1040,8 @@ FreeRegexp(regexpPtr)
*/
static void
-FinalizeRegexp(clientData)
- ClientData clientData; /* Not used. */
+FinalizeRegexp(
+ ClientData clientData) /* Not used. */
{
int i;
TclRegexp *regexpPtr;
@@ -918,5 +1053,21 @@ FinalizeRegexp(clientData)
FreeRegexp(regexpPtr);
}
ckfree(tsdPtr->patterns[i]);
+ tsdPtr->patterns[i] = NULL;
}
+
+ /*
+ * We may find ourselves reinitialized if another finalization routine
+ * invokes regexps.
+ */
+
+ tsdPtr->initialized = 0;
}
+
+/*
+ * Local Variables:
+ * mode: c
+ * c-basic-offset: 4
+ * fill-column: 78
+ * End:
+ */