diff options
Diffstat (limited to 'generic/tclRegexp.c')
| -rw-r--r-- | generic/tclRegexp.c | 621 | 
1 files changed, 333 insertions, 288 deletions
| diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 2a32675..6348e4a 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -1,26 +1,23 @@ -/*  +/*   * tclRegexp.c --   * - *	This file contains the public interfaces to the Tcl regular - *	expression mechanism. + *	This file contains the public interfaces to the Tcl regular expression + *	mechanism.   *   * Copyright (c) 1998 by Sun Microsystems, Inc.   * Copyright (c) 1998-1999 by Scriptics Corporation.   * - * See the file "license.terms" for information on usage and redistribution - * of this file, and for a DISCLAIMER OF ALL WARRANTIES. - * - * RCS: @(#) $Id: tclRegexp.c,v 1.11 2001/05/21 22:32:01 hobbs Exp $ + * See the file "license.terms" for information on usage and redistribution of + * this file, and for a DISCLAIMER OF ALL WARRANTIES.   */  #include "tclInt.h" -#include "tclPort.h"  #include "tclRegexp.h"  /*   *---------------------------------------------------------------------- - * The routines in this file use Henry Spencer's regular expression - * package contained in the following additional source files: + * The routines in this file use Henry Spencer's regular expression package + * contained in the following additional source files:   *   *	regc_color.c	regc_cvec.c	regc_lex.c   *	regc_nfa.c	regcomp.c	regcustom.h @@ -29,23 +26,23 @@   *	regfronts.c	regguts.h   *   * Copyright (c) 1998 Henry Spencer.  All rights reserved. - *  + *   * Development of this software was funded, in part, by Cray Research Inc.,   * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics - * Corporation, none of whom are responsible for the results.  The author - * thanks all of them.  - *  + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + *   * Redistribution and use in source and binary forms -- with or without   * modification -- are permitted for any purpose, provided that   * redistributions in source form retain this entire copyright notice and   * indicate the origin and nature of any modifications. - *  - * I'd appreciate being given credit for this package in the documentation - * of software which uses it, but that is not a requirement. - *  + * + * I'd appreciate being given credit for this package in the documentation of + * software which uses it, but that is not a requirement. + *   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY - * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL   * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,   * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,   * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; @@ -56,8 +53,8 @@   *   * *** NOTE: this code has been altered slightly for use in Tcl: ***   * *** 1. Names have been changed, e.g. from re_comp to		 *** - * ***    TclRegComp, to avoid clashes with other 		 *** - * ***    regexp implementations used by applications. 		 *** + * ***    TclRegComp, to avoid clashes with other		 *** + * ***    regexp implementations used by applications.		 ***   */  /* @@ -69,15 +66,14 @@  typedef struct ThreadSpecificData {      int initialized;		/* Set to 1 when the module is initialized. */ -    char *patterns[NUM_REGEXPS];/* Strings corresponding to compiled -				 * regular expression patterns.	 NULL -				 * means that this slot isn't used. -				 * Malloc-ed. */ +    char *patterns[NUM_REGEXPS];/* Strings corresponding to compiled regular +				 * expression patterns. NULL means that this +				 * slot isn't used. Malloc-ed. */      int patLengths[NUM_REGEXPS];/* Number of non-null characters in -				 * corresponding entry in patterns. -				 * -1 means entry isn't used. */ +				 * corresponding entry in patterns. -1 means +				 * entry isn't used. */      struct TclRegexp *regexps[NUM_REGEXPS]; -				/* Compiled forms of above strings.  Also +				/* Compiled forms of above strings. Also  				 * malloc-ed, or NULL if not in use yet. */  } ThreadSpecificData; @@ -87,49 +83,46 @@ static Tcl_ThreadDataKey dataKey;   * Declarations for functions used only in this file.   */ -static TclRegexp *	CompileRegexp _ANSI_ARGS_((Tcl_Interp *interp, -			    char *pattern, int length, int flags)); -static void		DupRegexpInternalRep _ANSI_ARGS_((Tcl_Obj *srcPtr, -			    Tcl_Obj *copyPtr)); -static void		FinalizeRegexp _ANSI_ARGS_((ClientData clientData)); -static void		FreeRegexp _ANSI_ARGS_((TclRegexp *regexpPtr)); -static void		FreeRegexpInternalRep _ANSI_ARGS_((Tcl_Obj *objPtr)); -static int		RegExpExecUniChar _ANSI_ARGS_((Tcl_Interp *interp, -			    Tcl_RegExp re, CONST Tcl_UniChar *uniString, -			    int numChars, int nmatches, int flags)); -static int		SetRegexpFromAny _ANSI_ARGS_((Tcl_Interp *interp, -			    Tcl_Obj *objPtr)); +static TclRegexp *	CompileRegexp(Tcl_Interp *interp, const char *pattern, +			    int length, int flags); +static void		DupRegexpInternalRep(Tcl_Obj *srcPtr, +			    Tcl_Obj *copyPtr); +static void		FinalizeRegexp(ClientData clientData); +static void		FreeRegexp(TclRegexp *regexpPtr); +static void		FreeRegexpInternalRep(Tcl_Obj *objPtr); +static int		RegExpExecUniChar(Tcl_Interp *interp, Tcl_RegExp re, +			    const Tcl_UniChar *uniString, int numChars, +			    int nmatches, int flags); +static int		SetRegexpFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr);  /* - * The regular expression Tcl object type.  This serves as a cache - * of the compiled form of the regular expression. + * The regular expression Tcl object type. This serves as a cache of the + * compiled form of the regular expression.   */ -Tcl_ObjType tclRegexpType = { +const Tcl_ObjType tclRegexpType = {      "regexp",				/* name */      FreeRegexpInternalRep,		/* freeIntRepProc */      DupRegexpInternalRep,		/* dupIntRepProc */      NULL,				/* updateStringProc */      SetRegexpFromAny			/* setFromAnyProc */  }; -  /*   *----------------------------------------------------------------------   *   * Tcl_RegExpCompile --   * - *	Compile a regular expression into a form suitable for fast - *	matching.  This procedure is DEPRECATED in favor of the - *	object version of the command. + *	Compile a regular expression into a form suitable for fast matching. + *	This function is DEPRECATED in favor of the object version of the + *	command.   *   * Results: - *	The return value is a pointer to the compiled form of string, - *	suitable for passing to Tcl_RegExpExec.  This compiled form - *	is only valid up until the next call to this procedure, so - *	don't keep these around for a long time!  If an error occurred - *	while compiling the pattern, then NULL is returned and an error - *	message is left in the interp's result. + *	The return value is a pointer to the compiled form of string, suitable + *	for passing to Tcl_RegExpExec. This compiled form is only valid up + *	until the next call to this function, so don't keep these around for a + *	long time! If an error occurred while compiling the pattern, then NULL + *	is returned and an error message is left in the interp's result.   *   * Side effects:   *	Updates the cache of compiled regexps. @@ -138,13 +131,13 @@ Tcl_ObjType tclRegexpType = {   */  Tcl_RegExp -Tcl_RegExpCompile(interp, string) -    Tcl_Interp *interp;		/* For use in error reporting and -				 * to access the interp regexp cache. */ -    char *string;		/* String for which to produce -				 * compiled regular expression. */ +Tcl_RegExpCompile( +    Tcl_Interp *interp,		/* For use in error reporting and to access +				 * the interp regexp cache. */ +    const char *pattern)	/* String for which to produce compiled +				 * regular expression. */  { -    return (Tcl_RegExp) CompileRegexp(interp, string, (int) strlen(string), +    return (Tcl_RegExp) CompileRegexp(interp, pattern, (int) strlen(pattern),  	    REG_ADVANCED);  } @@ -153,15 +146,14 @@ Tcl_RegExpCompile(interp, string)   *   * Tcl_RegExpExec --   * - *	Execute the regular expression matcher using a compiled form - *	of a regular expression and save information about any match - *	that is found. + *	Execute the regular expression matcher using a compiled form of a + *	regular expression and save information about any match that is found.   *   * Results: - *	If an error occurs during the matching operation then -1 - *	is returned and the interp's result contains an error message. - *	Otherwise the return value is 1 if a matching range is - *	found and 0 if there is no matching range. + *	If an error occurs during the matching operation then -1 is returned + *	and the interp's result contains an error message. Otherwise the + *	return value is 1 if a matching range is found and 0 if there is no + *	matching range.   *   * Side effects:   *	None. @@ -170,27 +162,27 @@ Tcl_RegExpCompile(interp, string)   */  int -Tcl_RegExpExec(interp, re, string, start) -    Tcl_Interp *interp;		/* Interpreter to use for error reporting. */ -    Tcl_RegExp re;		/* Compiled regular expression;  must have -				 * been returned by previous call to +Tcl_RegExpExec( +    Tcl_Interp *interp,		/* Interpreter to use for error reporting. */ +    Tcl_RegExp re,		/* Compiled regular expression; must have been +				 * returned by previous call to  				 * Tcl_GetRegExpFromObj. */ -    CONST char *string;		/* String against which to match re. */ -    CONST char *start;		/* If string is part of a larger string, -				 * this identifies beginning of larger -				 * string, so that "^" won't match. */ +    const char *text,		/* Text against which to match re. */ +    const char *start)		/* If text is part of a larger string, this +				 * identifies beginning of larger string, so +				 * that "^" won't match. */  {      int flags, result, numChars; -    TclRegexp *regexp = (TclRegexp *)re; +    TclRegexp *regexp = (TclRegexp *) re;      Tcl_DString ds; -    Tcl_UniChar *ustr; +    const Tcl_UniChar *ustr;      /* -     * If the starting point is offset from the beginning of the buffer, -     * then we need to tell the regexp engine not to match "^". +     * If the starting point is offset from the beginning of the buffer, then +     * we need to tell the regexp engine not to match "^".       */ -    if (string > start) { +    if (text > start) {  	flags = REG_NOTBOL;      } else {  	flags = 0; @@ -200,7 +192,7 @@ Tcl_RegExpExec(interp, re, string, start)       * Remember the string for use by Tcl_RegExpRange().       */ -    regexp->string = string; +    regexp->string = text;      regexp->objPtr = NULL;      /* @@ -208,10 +200,10 @@ Tcl_RegExpExec(interp, re, string, start)       */      Tcl_DStringInit(&ds); -    ustr = Tcl_UtfToUniCharDString(string, -1, &ds); +    ustr = Tcl_UtfToUniCharDString(text, -1, &ds);      numChars = Tcl_DStringLength(&ds) / sizeof(Tcl_UniChar); -    result = RegExpExecUniChar(interp, re, ustr, numChars, -	    -1 /* nmatches */, flags); +    result = RegExpExecUniChar(interp, re, ustr, numChars, -1 /* nmatches */, +	    flags);      Tcl_DStringFree(&ds);      return result; @@ -227,7 +219,7 @@ Tcl_RegExpExec(interp, re, string, start)   *   * Results:   *	The variables at *startPtr and *endPtr are modified to hold the - *	addresses of the endpoints of the range given by index.  If the + *	addresses of the endpoints of the range given by index. If the   *	specified range doesn't exist then NULLs are returned.   *   * Side effects: @@ -237,19 +229,19 @@ Tcl_RegExpExec(interp, re, string, start)   */  void -Tcl_RegExpRange(re, index, startPtr, endPtr) -    Tcl_RegExp re;		/* Compiled regular expression that has -				 * been passed to Tcl_RegExpExec. */ -    int index;			/* 0 means give the range of the entire -				 * match, > 0 means give the range of -				 * a matching subrange. */ -    char **startPtr;		/* Store address of first character in -				 * (sub-) range here. */ -    char **endPtr;		/* Store address of character just after last -				 * in (sub-) range here. */ +Tcl_RegExpRange( +    Tcl_RegExp re,		/* Compiled regular expression that has been +				 * passed to Tcl_RegExpExec. */ +    int index,			/* 0 means give the range of the entire match, +				 * > 0 means give the range of a matching +				 * subrange. */ +    const char **startPtr,	/* Store address of first character in +				 * (sub-)range here. */ +    const char **endPtr)	/* Store address of character just after last +				 * in (sub-)range here. */  {      TclRegexp *regexpPtr = (TclRegexp *) re; -    CONST char *string; +    const char *string;      if ((size_t) index > regexpPtr->re.re_nsub) {  	*startPtr = *endPtr = NULL; @@ -257,7 +249,7 @@ Tcl_RegExpRange(re, index, startPtr, endPtr)  	*startPtr = *endPtr = NULL;      } else {  	if (regexpPtr->objPtr) { -	    string = Tcl_GetString(regexpPtr->objPtr); +	    string = TclGetString(regexpPtr->objPtr);  	} else {  	    string = regexpPtr->string;  	} @@ -272,14 +264,13 @@ Tcl_RegExpRange(re, index, startPtr, endPtr)   * RegExpExecUniChar --   *   *	Execute the regular expression matcher using a compiled form of a - *	regular expression and save information about any match that is - *	found. + *	regular expression and save information about any match that is found.   *   * Results: - *	If an error occurs during the matching operation then -1 is - *	returned and an error message is left in interp's result. - *	Otherwise the return value is 1 if a matching range was found or - *	0 if there was no matching range. + *	If an error occurs during the matching operation then -1 is returned + *	and an error message is left in interp's result. Otherwise the return + *	value is 1 if a matching range was found or 0 if there was no matching + *	range.   *   * Side effects:   *	None. @@ -288,17 +279,17 @@ Tcl_RegExpRange(re, index, startPtr, endPtr)   */  static int -RegExpExecUniChar(interp, re, wString, numChars, nmatches, flags) -    Tcl_Interp *interp;		/* Interpreter to use for error reporting. */ -    Tcl_RegExp re;		/* Compiled regular expression; returned by -				 * a previous call to Tcl_GetRegExpFromObj */ -    CONST Tcl_UniChar *wString;	/* String against which to match re. */ -    int numChars;		/* Length of Tcl_UniChar string (must -				 * be >= 0). */ -    int nmatches;		/* How many subexpression matches (counting -				 * the whole match as subexpression 0) are -				 * of interest.  -1 means "don't know". */ -    int flags;			/* Regular expression flags. */ +RegExpExecUniChar( +    Tcl_Interp *interp,		/* Interpreter to use for error reporting. */ +    Tcl_RegExp re,		/* Compiled regular expression; returned by a +				 * previous call to Tcl_GetRegExpFromObj */ +    const Tcl_UniChar *wString,	/* String against which to match re. */ +    int numChars,		/* Length of Tcl_UniChar string (must be +				 * >=0). */ +    int nmatches,		/* How many subexpression matches (counting +				 * the whole match as subexpression 0) are of +				 * interest. -1 means "don't know". */ +    int flags)			/* Regular expression flags. */  {      int status;      TclRegexp *regexpPtr = (TclRegexp *) re; @@ -340,8 +331,8 @@ RegExpExecUniChar(interp, re, wString, numChars, nmatches, flags)   *   * Results:   *	The variables at *startPtr and *endPtr are modified to hold the - *	offsets of the endpoints of the range given by index.  If the - *	specified range doesn't exist then -1s are supplied. + *	offsets of the endpoints of the range given by index. If the specified + *	range doesn't exist then -1s are supplied.   *   * Side effects:   *	None. @@ -350,17 +341,17 @@ RegExpExecUniChar(interp, re, wString, numChars, nmatches, flags)   */  void -TclRegExpRangeUniChar(re, index, startPtr, endPtr) -    Tcl_RegExp re;		/* Compiled regular expression that has -				 * been passed to Tcl_RegExpExec. */ -    int index;			/* 0 means give the range of the entire -				 * match, > 0 means give the range of -				 * a matching subrange, -1 means the -				 * range of the rm_extend field. */ -    int *startPtr;		/* Store address of first character in -				 * (sub-) range here. */ -    int *endPtr;		/* Store address of character just after last -				 * in (sub-) range here. */ +TclRegExpRangeUniChar( +    Tcl_RegExp re,		/* Compiled regular expression that has been +				 * passed to Tcl_RegExpExec. */ +    int index,			/* 0 means give the range of the entire match, +				 * > 0 means give the range of a matching +				 * subrange, -1 means the range of the +				 * rm_extend field. */ +    int *startPtr,		/* Store address of first character in +				 * (sub-)range here. */ +    int *endPtr)		/* Store address of character just after last +				 * in (sub-)range here. */  {      TclRegexp *regexpPtr = (TclRegexp *) re; @@ -384,10 +375,9 @@ TclRegExpRangeUniChar(re, index, startPtr, endPtr)   *	See if a string matches a regular expression.   *   * Results: - *	If an error occurs during the matching operation then -1 - *	is returned and the interp's result contains an error message. - *	Otherwise the return value is 1 if "string" matches "pattern" - *	and 0 otherwise. + *	If an error occurs during the matching operation then -1 is returned + *	and the interp's result contains an error message. Otherwise the + *	return value is 1 if "text" matches "pattern" and 0 otherwise.   *   * Side effects:   *	None. @@ -396,19 +386,17 @@ TclRegExpRangeUniChar(re, index, startPtr, endPtr)   */  int -Tcl_RegExpMatch(interp, string, pattern) -    Tcl_Interp *interp;		/* Used for error reporting. May be NULL. */ -    char *string;		/* String. */ -    char *pattern;		/* Regular expression to match against -				 * string. */ +Tcl_RegExpMatch( +    Tcl_Interp *interp,		/* Used for error reporting. May be NULL. */ +    const char *text,		/* Text to search for pattern matches. */ +    const char *pattern)	/* Regular expression to match against text. */  { -    Tcl_RegExp re; +    Tcl_RegExp re = Tcl_RegExpCompile(interp, pattern); -    re = Tcl_RegExpCompile(interp, pattern);      if (re == NULL) {  	return -1;      } -    return Tcl_RegExpExec(interp, re, string, string); +    return Tcl_RegExpExec(interp, re, text, text);  }  /* @@ -419,10 +407,9 @@ Tcl_RegExpMatch(interp, string, pattern)   *	Execute a precompiled regexp against the given object.   *   * Results: - *	If an error occurs during the matching operation then -1 - *	is returned and the interp's result contains an error message. - *	Otherwise the return value is 1 if "string" matches "pattern" - *	and 0 otherwise. + *	If an error occurs during the matching operation then -1 is returned + *	and the interp's result contains an error message. Otherwise the + *	return value is 1 if "string" matches "pattern" and 0 otherwise.   *   * Side effects:   *	Converts the object to a Unicode object. @@ -431,38 +418,60 @@ Tcl_RegExpMatch(interp, string, pattern)   */  int -Tcl_RegExpExecObj(interp, re, objPtr, offset, nmatches, flags) -    Tcl_Interp *interp;		/* Interpreter to use for error reporting. */ -    Tcl_RegExp re;		/* Compiled regular expression;  must have -				 * been returned by previous call to +Tcl_RegExpExecObj( +    Tcl_Interp *interp,		/* Interpreter to use for error reporting. */ +    Tcl_RegExp re,		/* Compiled regular expression; must have been +				 * returned by previous call to  				 * Tcl_GetRegExpFromObj. */ -    Tcl_Obj *objPtr;		/* String against which to match re. */ -    int offset;			/* Character index that marks where matching +    Tcl_Obj *textObj,		/* Text against which to match re. */ +    int offset,			/* Character index that marks where matching  				 * should begin. */ -    int nmatches;		/* How many subexpression matches (counting -				 * the whole match as subexpression 0) are -				 * of interest.  -1 means all of them. */ -    int flags;			/* Regular expression execution flags. */ +    int nmatches,		/* How many subexpression matches (counting +				 * the whole match as subexpression 0) are of +				 * interest. -1 means all of them. */ +    int flags)			/* Regular expression execution flags. */  {      TclRegexp *regexpPtr = (TclRegexp *) re;      Tcl_UniChar *udata;      int length; +    int reflags = regexpPtr->flags; +#define TCL_REG_GLOBOK_FLAGS \ +	(TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE) + +    /* +     * Take advantage of the equivalent glob pattern, if one exists. +     * This is possible based only on the right mix of incoming flags (0) +     * and regexp compile flags. +     */ +    if ((offset == 0) && (nmatches == 0) && (flags == 0) +	    && !(reflags & ~TCL_REG_GLOBOK_FLAGS) +	    && (regexpPtr->globObjPtr != NULL)) { +	int nocase = (reflags & TCL_REG_NOCASE) ? TCL_MATCH_NOCASE : 0; + +	/* +	 * Pass to TclStringMatchObj for obj-specific handling. +	 * XXX: Currently doesn't take advantage of exact-ness that +	 * XXX: TclReToGlob tells us about +	 */ + +	return TclStringMatchObj(textObj, regexpPtr->globObjPtr, nocase); +    }      /*       * Save the target object so we can extract strings from it later.       */      regexpPtr->string = NULL; -    regexpPtr->objPtr = objPtr; +    regexpPtr->objPtr = textObj; -    udata = Tcl_GetUnicodeFromObj(objPtr, &length); +    udata = Tcl_GetUnicodeFromObj(textObj, &length);      if (offset > length) {  	offset = length;      }      udata += offset;      length -= offset; -     +      return RegExpExecUniChar(interp, re, udata, length, nmatches, flags);  } @@ -474,10 +483,9 @@ Tcl_RegExpExecObj(interp, re, objPtr, offset, nmatches, flags)   *	See if an object matches a regular expression.   *   * Results: - *	If an error occurs during the matching operation then -1 - *	is returned and the interp's result contains an error message. - *	Otherwise the return value is 1 if "string" matches "pattern" - *	and 0 otherwise. + *	If an error occurs during the matching operation then -1 is returned + *	and the interp's result contains an error message. Otherwise the + *	return value is 1 if "text" matches "pattern" and 0 otherwise.   *   * Side effects:   *	Changes the internal rep of the pattern and string objects. @@ -486,10 +494,10 @@ Tcl_RegExpExecObj(interp, re, objPtr, offset, nmatches, flags)   */  int -Tcl_RegExpMatchObj(interp, stringObj, patternObj) -    Tcl_Interp *interp;		/* Used for error reporting. May be NULL. */ -    Tcl_Obj *stringObj;		/* Object containing the String to search. */ -    Tcl_Obj *patternObj;	/* Regular expression to match against +Tcl_RegExpMatchObj( +    Tcl_Interp *interp,		/* Used for error reporting. May be NULL. */ +    Tcl_Obj *textObj,		/* Object containing the String to search. */ +    Tcl_Obj *patternObj)	/* Regular expression to match against  				 * string. */  {      Tcl_RegExp re; @@ -499,7 +507,7 @@ Tcl_RegExpMatchObj(interp, stringObj, patternObj)      if (re == NULL) {  	return -1;      } -    return Tcl_RegExpExecObj(interp, re, stringObj, 0 /* offset */, +    return Tcl_RegExpExecObj(interp, re, textObj, 0 /* offset */,  	    0 /* nmatches */, 0 /* flags */);  } @@ -520,9 +528,9 @@ Tcl_RegExpMatchObj(interp, stringObj, patternObj)   */  void -Tcl_RegExpGetInfo(regexp, infoPtr) -    Tcl_RegExp regexp;		/* Pattern from which to get subexpressions. */ -    Tcl_RegExpInfo *infoPtr;	/* Match information is stored here.  */ +Tcl_RegExpGetInfo( +    Tcl_RegExp regexp,		/* Pattern from which to get subexpressions. */ +    Tcl_RegExpInfo *infoPtr)	/* Match information is stored here. */  {      TclRegexp *regexpPtr = (TclRegexp *) regexp; @@ -536,14 +544,14 @@ Tcl_RegExpGetInfo(regexp, infoPtr)   *   * Tcl_GetRegExpFromObj --   * - *	Compile a regular expression into a form suitable for fast - *	matching.  This procedure caches the result in a Tcl_Obj. + *	Compile a regular expression into a form suitable for fast matching. + *	This function caches the result in a Tcl_Obj.   *   * Results: - *	The return value is a pointer to the compiled form of string, - *	suitable for passing to Tcl_RegExpExec.  If an error occurred - *	while compiling the pattern, then NULL is returned and an error - *	message is left in the interp's result. + *	The return value is a pointer to the compiled form of string, suitable + *	for passing to Tcl_RegExpExec. If an error occurred while compiling + *	the pattern, then NULL is returned and an error message is left in the + *	interp's result.   *   * Side effects:   *	Updates the native rep of the Tcl_Obj. @@ -552,25 +560,28 @@ Tcl_RegExpGetInfo(regexp, infoPtr)   */  Tcl_RegExp -Tcl_GetRegExpFromObj(interp, objPtr, flags) -    Tcl_Interp *interp;		/* For use in error reporting, and to access +Tcl_GetRegExpFromObj( +    Tcl_Interp *interp,		/* For use in error reporting, and to access  				 * the interp regexp cache. */ -    Tcl_Obj *objPtr;		/* Object whose string rep contains regular -				 * expression pattern.  Internal rep will be +    Tcl_Obj *objPtr,		/* Object whose string rep contains regular +				 * expression pattern. Internal rep will be  				 * changed to compiled form of this regular  				 * expression. */ -    int flags;			/* Regular expression compilation flags. */ +    int flags)			/* Regular expression compilation flags. */  {      int length; -    Tcl_ObjType *typePtr;      TclRegexp *regexpPtr; -    char *pattern; +    const char *pattern; + +    /* +     * This is OK because we only actually interpret this value properly as a +     * TclRegexp* when the type is tclRegexpType. +     */ -    typePtr = objPtr->typePtr; -    regexpPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr; +    regexpPtr = objPtr->internalRep.twoPtrValue.ptr1; -    if ((typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) { -	pattern = Tcl_GetStringFromObj(objPtr, &length); +    if ((objPtr->typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) { +	pattern = TclGetStringFromObj(objPtr, &length);  	regexpPtr = CompileRegexp(interp, pattern, length, flags);  	if (regexpPtr == NULL) { @@ -579,7 +590,7 @@ Tcl_GetRegExpFromObj(interp, objPtr, flags)  	/*  	 * Add a reference to the regexp so it will persist even if it is -	 * pushed out of the current thread's regexp cache.  This reference +	 * pushed out of the current thread's regexp cache. This reference  	 * will be removed when the object's internal rep is freed.  	 */ @@ -589,10 +600,8 @@ Tcl_GetRegExpFromObj(interp, objPtr, flags)  	 * Free the old representation and set our type.  	 */ -	if ((typePtr != NULL) && (typePtr->freeIntRepProc != NULL)) { -	    (*typePtr->freeIntRepProc)(objPtr); -	} -	objPtr->internalRep.otherValuePtr = (VOID *) regexpPtr; +	TclFreeIntRep(objPtr); +	objPtr->internalRep.twoPtrValue.ptr1 = regexpPtr;  	objPtr->typePtr = &tclRegexpType;      }      return (Tcl_RegExp) regexpPtr; @@ -606,10 +615,10 @@ Tcl_GetRegExpFromObj(interp, objPtr, flags)   *	Return information about a compiled regular expression.   *   * Results: - *	The return value is -1 for failure, 0 for success, although at - *	the moment there's nothing that could fail.  On success, a list - *	is left in the interp's result:  first element is the subexpression - *	count, second is a list of re_info bit names. + *	The return value is -1 for failure, 0 for success, although at the + *	moment there's nothing that could fail. On success, a list is left in + *	the interp's result: first element is the subexpression count, second + *	is a list of re_info bit names.   *   * Side effects:   *	None. @@ -618,16 +627,16 @@ Tcl_GetRegExpFromObj(interp, objPtr, flags)   */  int -TclRegAbout(interp, re) -    Tcl_Interp *interp;		/* For use in variable assignment. */ -    Tcl_RegExp re;		/* The compiled regular expression. */ +TclRegAbout( +    Tcl_Interp *interp,		/* For use in variable assignment. */ +    Tcl_RegExp re)		/* The compiled regular expression. */  { -    TclRegexp *regexpPtr = (TclRegexp *)re; -    char buf[TCL_INTEGER_SPACE]; -    static struct infoname { +    TclRegexp *regexpPtr = (TclRegexp *) re; +    struct infoname {  	int bit; -	char *text; -    } infonames[] = { +	const char *text; +    }; +    static const struct infoname infonames[] = {  	{REG_UBACKREF,		"REG_UBACKREF"},  	{REG_ULOOKAHEAD,	"REG_ULOOKAHEAD"},  	{REG_UBOUNDS,		"REG_UBOUNDS"}, @@ -642,37 +651,42 @@ TclRegAbout(interp, re)  	{REG_UEMPTYMATCH,	"REG_UEMPTYMATCH"},  	{REG_UIMPOSSIBLE,	"REG_UIMPOSSIBLE"},  	{REG_USHORTEST,		"REG_USHORTEST"}, -	{0,			""} +	{0,			NULL}      }; -    struct infoname *inf; -    int n; +    const struct infoname *inf; +    Tcl_Obj *infoObj, *resultObj; + +    /* +     * The reset here guarantees that the interpreter result is empty and +     * unshared. This means that we can use Tcl_ListObjAppendElement on the +     * result object quite safely. +     */      Tcl_ResetResult(interp); -    sprintf(buf, "%u", (unsigned)(regexpPtr->re.re_nsub)); -    Tcl_AppendElement(interp, buf); +    /* +     * Assume that there will never be more than INT_MAX subexpressions. This +     * is a pretty reasonable assumption; the RE engine doesn't scale _that_ +     * well and Tcl has other limits that constrain things as well... +     */ + +    resultObj = Tcl_NewObj(); +    Tcl_ListObjAppendElement(NULL, resultObj, +	    Tcl_NewIntObj((int) regexpPtr->re.re_nsub));      /* -     * Must count bits before generating list, because we must know -     * whether {} are needed before we start appending names. +     * Now append a list of all the bit-flags set for the RE.       */ -    n = 0; -    for (inf = infonames; inf->bit != 0; inf++) { -	if (regexpPtr->re.re_info&inf->bit) { -	    n++; -	} -    } -    if (n != 1) { -	Tcl_AppendResult(interp, " {", NULL); -    } -    for (inf = infonames; inf->bit != 0; inf++) { -	if (regexpPtr->re.re_info&inf->bit) { -	    Tcl_AppendElement(interp, inf->text); + +    TclNewObj(infoObj); +    for (inf=infonames ; inf->bit != 0 ; inf++) { +	if (regexpPtr->re.re_info & inf->bit) { +	    Tcl_ListObjAppendElement(NULL, infoObj, +		    Tcl_NewStringObj(inf->text, -1));  	}      } -    if (n != 1) { -	Tcl_AppendResult(interp, "}", NULL); -    } +    Tcl_ListObjAppendElement(NULL, resultObj, infoObj); +    Tcl_SetObjResult(interp, resultObj);      return 0;  } @@ -694,26 +708,25 @@ TclRegAbout(interp, re)   */  void -TclRegError(interp, msg, status) -    Tcl_Interp *interp;		/* Interpreter for error reporting. */ -    char *msg;			/* Message to prepend to error. */ -    int status;			/* Status code to report. */ +TclRegError( +    Tcl_Interp *interp,		/* Interpreter for error reporting. */ +    const char *msg,		/* Message to prepend to error. */ +    int status)			/* Status code to report. */  {      char buf[100];		/* ample in practice */ -    char cbuf[100];		/* lots in practice */ +    char cbuf[TCL_INTEGER_SPACE];      size_t n; -    char *p; +    const char *p;      Tcl_ResetResult(interp); -    n = TclReError(status, (regex_t *)NULL, buf, sizeof(buf)); +    n = TclReError(status, NULL, buf, sizeof(buf));      p = (n > sizeof(buf)) ? "..." : ""; -    Tcl_AppendResult(interp, msg, buf, p, NULL); +    Tcl_SetObjResult(interp, Tcl_ObjPrintf("%s%s%s", msg, buf, p));      sprintf(cbuf, "%d", status); -    (VOID) TclReError(REG_ITOA, (regex_t *)NULL, cbuf, sizeof(cbuf)); +    (void) TclReError(REG_ITOA, NULL, cbuf, sizeof(cbuf));      Tcl_SetErrorCode(interp, "REGEXP", cbuf, buf, NULL);  } -  /*   *---------------------------------------------------------------------- @@ -733,10 +746,10 @@ TclRegError(interp, msg, status)   */  static void -FreeRegexpInternalRep(objPtr) -    Tcl_Obj *objPtr;		/* Regexp object with internal rep to free. */ +FreeRegexpInternalRep( +    Tcl_Obj *objPtr)		/* Regexp object with internal rep to free. */  { -    TclRegexp *regexpRepPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr; +    TclRegexp *regexpRepPtr = objPtr->internalRep.twoPtrValue.ptr1;      /*       * If this is the last reference to the regexp, free it. @@ -745,6 +758,7 @@ FreeRegexpInternalRep(objPtr)      if (--(regexpRepPtr->refCount) <= 0) {  	FreeRegexp(regexpRepPtr);      } +    objPtr->typePtr = NULL;  }  /* @@ -752,8 +766,8 @@ FreeRegexpInternalRep(objPtr)   *   * DupRegexpInternalRep --   * - *	We copy the reference to the compiled regexp and bump its - *	reference count. + *	We copy the reference to the compiled regexp and bump its reference + *	count.   *   * Results:   *	None. @@ -765,13 +779,14 @@ FreeRegexpInternalRep(objPtr)   */  static void -DupRegexpInternalRep(srcPtr, copyPtr) -    Tcl_Obj *srcPtr;		/* Object with internal rep to copy. */ -    Tcl_Obj *copyPtr;		/* Object with internal rep to set. */ +DupRegexpInternalRep( +    Tcl_Obj *srcPtr,		/* Object with internal rep to copy. */ +    Tcl_Obj *copyPtr)		/* Object with internal rep to set. */  { -    TclRegexp *regexpPtr = (TclRegexp *) srcPtr->internalRep.otherValuePtr; +    TclRegexp *regexpPtr = srcPtr->internalRep.twoPtrValue.ptr1; +      regexpPtr->refCount++; -    copyPtr->internalRep.otherValuePtr = srcPtr->internalRep.otherValuePtr; +    copyPtr->internalRep.twoPtrValue.ptr1 = srcPtr->internalRep.twoPtrValue.ptr1;      copyPtr->typePtr = &tclRegexpType;  } @@ -796,9 +811,9 @@ DupRegexpInternalRep(srcPtr, copyPtr)   */  static int -SetRegexpFromAny(interp, objPtr) -    Tcl_Interp *interp;		/* Used for error reporting if not NULL. */ -    Tcl_Obj *objPtr;		/* The object to convert. */ +SetRegexpFromAny( +    Tcl_Interp *interp,		/* Used for error reporting if not NULL. */ +    Tcl_Obj *objPtr)		/* The object to convert. */  {      if (Tcl_GetRegExpFromObj(interp, objPtr, REG_ADVANCED) == NULL) {  	return TCL_ERROR; @@ -811,37 +826,36 @@ SetRegexpFromAny(interp, objPtr)   *   * CompileRegexp --   * - *	Attempt to compile the given regexp pattern.  If the compiled - *	regular expression can be found in the per-thread cache, it - *	will be used instead of compiling a new copy. + *	Attempt to compile the given regexp pattern. If the compiled regular + *	expression can be found in the per-thread cache, it will be used + *	instead of compiling a new copy.   *   * Results: - *	The return value is a pointer to a newly allocated TclRegexp - *	that represents the compiled pattern, or NULL if the pattern - *	could not be compiled.  If NULL is returned, an error message is - *	left in the interp's result. + *	The return value is a pointer to a newly allocated TclRegexp that + *	represents the compiled pattern, or NULL if the pattern could not be + *	compiled. If NULL is returned, an error message is left in the + *	interp's result.   *   * Side effects: - *	The thread-local regexp cache is updated and a new TclRegexp may - *	be allocated. + *	The thread-local regexp cache is updated and a new TclRegexp may be + *	allocated.   *   *----------------------------------------------------------------------   */  static TclRegexp * -CompileRegexp(interp, string, length, flags) -    Tcl_Interp *interp;		/* Used for error reporting if not NULL. */ -    char *string;		/* The regexp to compile (UTF-8). */ -    int length;			/* The length of the string in bytes. */ -    int flags;			/* Compilation flags. */ +CompileRegexp( +    Tcl_Interp *interp,		/* Used for error reporting if not NULL. */ +    const char *string,		/* The regexp to compile (UTF-8). */ +    int length,			/* The length of the string in bytes. */ +    int flags)			/* Compilation flags. */  {      TclRegexp *regexpPtr; -    Tcl_UniChar *uniString; -    int numChars; +    const Tcl_UniChar *uniString; +    int numChars, status, i, exact;      Tcl_DString stringBuf; -    int status, i;      ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); -  +      if (!tsdPtr->initialized) {  	tsdPtr->initialized = 1;  	Tcl_CreateThreadExitHandler(FinalizeRegexp, NULL); @@ -849,14 +863,14 @@ CompileRegexp(interp, string, length, flags)      /*       * This routine maintains a second-level regular expression cache in -     * addition to the per-object regexp cache.  The per-thread cache is needed +     * addition to the per-object regexp cache. The per-thread cache is needed       * to handle the case where for various reasons the object is lost between       * invocations of the regexp command, but the literal pattern is the same.       */      /* -     * Check the per-thread compiled regexp cache.  We can only reuse -     * a regexp if it has the same pattern and the same flags. +     * Check the per-thread compiled regexp cache. We can only reuse a regexp +     * if it has the same pattern and the same flags.       */      for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) { @@ -864,8 +878,8 @@ CompileRegexp(interp, string, length, flags)  		&& (tsdPtr->regexps[i]->flags == flags)  		&& (strcmp(string, tsdPtr->patterns[i]) == 0)) {  	    /* -	     * Move the matched pattern to the first slot in the -	     * cache and shift the other patterns down one position. +	     * Move the matched pattern to the first slot in the cache and +	     * shift the other patterns down one position.  	     */  	    if (i != 0) { @@ -890,8 +904,8 @@ CompileRegexp(interp, string, length, flags)      /*       * This is a new expression, so compile it and add it to the cache.       */ -     -    regexpPtr = (TclRegexp *) ckalloc(sizeof(TclRegexp)); + +    regexpPtr = ckalloc(sizeof(TclRegexp));      regexpPtr->objPtr = NULL;      regexpPtr->string = NULL;      regexpPtr->details.rm_extend.rm_so = -1; @@ -918,22 +932,34 @@ CompileRegexp(interp, string, length, flags)  	 * Clean up and report errors in the interpreter, if possible.  	 */ -	ckfree((char *)regexpPtr); +	ckfree(regexpPtr);  	if (interp) {  	    TclRegError(interp, -		    "couldn't compile regular expression pattern: ", -		    status); +		    "couldn't compile regular expression pattern: ", status);  	}  	return NULL;      }      /* -     * Allocate enough space for all of the subexpressions, plus one -     * extra for the entire pattern. +     * Convert RE to a glob pattern equivalent, if any, and cache it.  If this +     * is not possible, then globObjPtr will be NULL.  This is used by +     * Tcl_RegExpExecObj to optionally do a fast match (avoids RE engine). +     */ + +    if (TclReToGlob(NULL, string, length, &stringBuf, &exact) == TCL_OK) { +	regexpPtr->globObjPtr = TclDStringToObj(&stringBuf); +	Tcl_IncrRefCount(regexpPtr->globObjPtr); +    } else { +	regexpPtr->globObjPtr = NULL; +    } + +    /* +     * Allocate enough space for all of the subexpressions, plus one extra for +     * the entire pattern.       */ -    regexpPtr->matches = (regmatch_t *) ckalloc( -	    sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1)); +    regexpPtr->matches = +	    ckalloc(sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1));      /*       * Initialize the refcount to one initially, since it is in the cache. @@ -948,6 +974,7 @@ CompileRegexp(interp, string, length, flags)      if (tsdPtr->patterns[NUM_REGEXPS-1] != NULL) {  	TclRegexp *oldRegexpPtr = tsdPtr->regexps[NUM_REGEXPS-1]; +  	if (--(oldRegexpPtr->refCount) <= 0) {  	    FreeRegexp(oldRegexpPtr);  	} @@ -958,8 +985,8 @@ CompileRegexp(interp, string, length, flags)  	tsdPtr->patLengths[i+1] = tsdPtr->patLengths[i];  	tsdPtr->regexps[i+1] = tsdPtr->regexps[i];      } -    tsdPtr->patterns[0] = (char *) ckalloc((unsigned) (length+1)); -    strcpy(tsdPtr->patterns[0], string); +    tsdPtr->patterns[0] = ckalloc(length + 1); +    memcpy(tsdPtr->patterns[0], string, (unsigned) length + 1);      tsdPtr->patLengths[0] = length;      tsdPtr->regexps[0] = regexpPtr; @@ -983,14 +1010,17 @@ CompileRegexp(interp, string, length, flags)   */  static void -FreeRegexp(regexpPtr) -    TclRegexp *regexpPtr;	/* Compiled regular expression to free. */ +FreeRegexp( +    TclRegexp *regexpPtr)	/* Compiled regular expression to free. */  {      TclReFree(®expPtr->re); +    if (regexpPtr->globObjPtr) { +	TclDecrRefCount(regexpPtr->globObjPtr); +    }      if (regexpPtr->matches) { -	ckfree((char *) regexpPtr->matches); +	ckfree(regexpPtr->matches);      } -    ckfree((char *) regexpPtr); +    ckfree(regexpPtr);  }  /* @@ -998,8 +1028,7 @@ FreeRegexp(regexpPtr)   *   * FinalizeRegexp --   * - *	Release the storage associated with the per-thread regexp - *	cache. + *	Release the storage associated with the per-thread regexp cache.   *   * Results:   *	None. @@ -1011,8 +1040,8 @@ FreeRegexp(regexpPtr)   */  static void -FinalizeRegexp(clientData) -    ClientData clientData;	/* Not used. */ +FinalizeRegexp( +    ClientData clientData)	/* Not used. */  {      int i;      TclRegexp *regexpPtr; @@ -1024,5 +1053,21 @@ FinalizeRegexp(clientData)  	    FreeRegexp(regexpPtr);  	}  	ckfree(tsdPtr->patterns[i]); +	tsdPtr->patterns[i] = NULL;      } + +    /* +     * We may find ourselves reinitialized if another finalization routine +     * invokes regexps. +     */ + +    tsdPtr->initialized = 0;  } + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ | 
