From 81e8aa10e3e8c90c32e241356ce9bc68b5e4b3dc Mon Sep 17 00:00:00 2001 From: stanton Date: Thu, 10 Jun 1999 04:28:49 +0000 Subject: * generic/tclUnicodeObj.c: Lots of cleanup and simplification. Fixed several memory bugs. Added TclAppendUnicodeToObj. * generic/tclInt.h: Added declarations for various Unicode string functions. * generic/tclRegexp.c: * generic/tclCmdMZ.c: Changed to use new Unicode string interfaces for better performance. * generic/tclRegexp.h: * generic/tclRegexp.c: * generic/tcl.h: * generic/tcl.decls: Added Tcl_RegExpMatchObj and Tcl_RegExpGetInfo calls to access lower level regexp API. These features are needed by Expect. This is a preliminary implementation pending final review and cleanup. * generic/tclCmdMZ.c: * tests/string.test: Fixed bug where string map failed on null strings. --- generic/tcl.decls | 10 ++- generic/tcl.h | 42 +++++++++- generic/tclCmdMZ.c | 178 +++++++++++++++++++++------------------- generic/tclDecls.h | 19 ++++- generic/tclInt.h | 7 +- generic/tclRegexp.c | 142 +++++++++++++++++++++++++------- generic/tclRegexp.h | 12 ++- generic/tclStubInit.c | 4 +- generic/tclUnicodeObj.c | 214 +++++++++++++++++++++++++++++++++--------------- tests/string.test | 5 +- 10 files changed, 439 insertions(+), 194 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index 68d50db..8447520 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -10,7 +10,7 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # -# RCS: @(#) $Id: tcl.decls,v 1.14 1999/05/25 01:00:24 stanton Exp $ +# RCS: @(#) $Id: tcl.decls,v 1.15 1999/06/10 04:28:49 stanton Exp $ library tcl @@ -1277,6 +1277,14 @@ declare 374 generic { declare 375 generic { int Tcl_UniCharIsPunct(int ch) } +declare 376 generic { + int Tcl_RegExpMatchObj(Tcl_Interp *interp, Tcl_RegExp regexp, \ + Tcl_Obj *objPtr, int offset, int nmatches, int flags) +} +declare 377 generic { + void Tcl_RegExpGetInfo(Tcl_RegExp regexp, Tcl_RegExpInfo *infoPtr) +} + ############################################################################## diff --git a/generic/tcl.h b/generic/tcl.h index 829f0b7..2c7ea05 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -12,7 +12,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tcl.h,v 1.44 1999/05/22 01:20:11 stanton Exp $ + * RCS: @(#) $Id: tcl.h,v 1.45 1999/06/10 04:28:49 stanton Exp $ */ #ifndef _TCL @@ -373,6 +373,46 @@ typedef struct Tcl_Trace_ *Tcl_Trace; typedef struct Tcl_Var_ *Tcl_Var; /* + * Flag values passed to Tcl_GetRegExpFromObj. + */ + +#define TCL_REG_BASIC 000000 /* BREs (convenience) */ +#define TCL_REG_EXTENDED 000001 /* EREs */ +#define TCL_REG_ADVF 000002 /* advanced features in EREs */ +#define TCL_REG_ADVANCED 000003 /* AREs (which are also EREs) */ +#define TCL_REG_QUOTE 000004 /* no special characters, none */ +#define TCL_REG_NOCASE 000010 /* ignore case */ +#define TCL_REG_NOSUB 000020 /* don't care about subexpressions */ +#define TCL_REG_EXPANDED 000040 /* expanded format, white space & + * comments */ +#define TCL_REG_NLSTOP 000100 /* \n doesn't match . or [^ ] */ +#define TCL_REG_NLANCH 000200 /* ^ matches after \n, $ before */ +#define TCL_REG_NEWLINE 000300 /* newlines are line terminators */ +#define TCL_REG_CANMATCH 001000 /* report details on partial/limited + * matches */ + +/* + * Structures filled in by Tcl_RegExpInfo. Note that all offset values are + * relative to the start of the match string, not the beginning of the + * entire string. + */ + +typedef struct Tcl_RegExpIndices { + long start; /* character offset of first character in match */ + long end; /* character offset of first character after the + * match. */ +} Tcl_RegExpIndices; + +typedef struct Tcl_RegExpInfo { + int nsubs; /* number of subexpressions in the + * compiled expression*/ + Tcl_RegExpIndices *matches; /* array of nsubs match offset + * pairs */ + long extendStart; /* The offset at which a subsequent + * match might begin. */ +} Tcl_RegExpInfo; + +/* * Picky compilers complain if this typdef doesn't appear before the * struct's reference in tclDecls.h. */ diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index ebea22b..4f20815 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -13,7 +13,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclCmdMZ.c,v 1.13 1999/06/08 02:59:23 hershey Exp $ + * RCS: @(#) $Id: tclCmdMZ.c,v 1.14 1999/06/10 04:28:50 stanton Exp $ */ #include "tclInt.h" @@ -126,12 +126,11 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv) int objc; /* Number of arguments. */ Tcl_Obj *CONST objv[]; /* Argument objects. */ { - int i, result, indices, stringLength, wLen, match, about; + int i, indices, match, about; int cflags, eflags; Tcl_RegExp regExpr; - char *string; - Tcl_DString stringBuffer, valueBuffer; - Tcl_UniChar *wStart; + Tcl_Obj *objPtr; + Tcl_RegExpInfo info; static char *options[] = { "-indices", "-nocase", "-about", "-expanded", "-line", "-linestop", "-lineanchor", @@ -209,6 +208,7 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv) if (regExpr == NULL) { return TCL_ERROR; } + objPtr = objv[1]; if (about) { if (TclRegAbout(interp, regExpr) < 0) { @@ -217,27 +217,21 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv) return TCL_OK; } - result = TCL_OK; - string = Tcl_GetStringFromObj(objv[1], &stringLength); + match = Tcl_RegExpMatchObj(interp, regExpr, objPtr, 0 /* offset */, + objc-2 /* nmatches */, eflags); - Tcl_DStringInit(&valueBuffer); - - Tcl_DStringInit(&stringBuffer); - wStart = Tcl_UtfToUniCharDString(string, stringLength, &stringBuffer); - wLen = Tcl_DStringLength(&stringBuffer) / sizeof(Tcl_UniChar); - - match = TclRegExpExecUniChar(interp, regExpr, wStart, wLen, objc-2, eflags); if (match < 0) { - result = TCL_ERROR; - goto done; + return TCL_ERROR; } + if (match == 0) { /* - * Set the interpreter's object result to an integer object w/ value 0. + * Set the interpreter's object result to an integer object w/ + * value 0. */ Tcl_SetIntObj(Tcl_GetObjResult(interp), 0); - goto done; + return TCL_OK; } /* @@ -248,37 +242,51 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv) objc -= 2; objv += 2; + Tcl_RegExpGetInfo(regExpr, &info); for (i = 0; i < objc; i++) { - char *varName, *value; - int start, end; + Tcl_Obj *varPtr, *valuePtr, *newPtr; - varName = Tcl_GetString(objv[i]); + varPtr = objv[i]; + if (indices) { + int start, end; + Tcl_Obj *objs[2]; + + if (i <= info.nsubs) { + start = info.matches[i].start; + end = info.matches[i].end; - TclRegExpRangeUniChar(regExpr, i, &start, &end); - if (start < 0) { - if (indices) { - value = Tcl_SetVar(interp, varName, "-1 -1", 0); + /* + * Adjust index so it refers to the last character in the + * match instead of the first character after the match. + */ + + if (end >= 0) { + end--; + } } else { - value = Tcl_SetVar(interp, varName, "", 0); + start = -1; + end = -1; } + + objs[0] = Tcl_NewLongObj(start); + objs[1] = Tcl_NewLongObj(end); + + newPtr = Tcl_NewListObj(2, objs); } else { - if (indices) { - char info[TCL_INTEGER_SPACE * 2]; - - sprintf(info, "%d %d", start, end - 1); - value = Tcl_SetVar(interp, varName, info, 0); + if (i <= info.nsubs) { + newPtr = TclGetRangeFromObj(objPtr, info.matches[i].start, + info.matches[i].end - 1); } else { - value = Tcl_UniCharToUtfDString(wStart + start, end - start, - &valueBuffer); - value = Tcl_SetVar(interp, varName, value, 0); - Tcl_DStringSetLength(&valueBuffer, 0); + newPtr = Tcl_NewObj(); + } } - if (value == NULL) { + valuePtr = Tcl_ObjSetVar2(interp, varPtr, NULL, newPtr, 0); + if (valuePtr == NULL) { + Tcl_DecrRefCount(newPtr); Tcl_AppendResult(interp, "couldn't set variable \"", - varName, "\"", (char *) NULL); - result = TCL_ERROR; - goto done; + Tcl_GetString(varPtr), "\"", (char *) NULL); + return TCL_ERROR; } } @@ -287,11 +295,7 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv) */ Tcl_SetIntObj(Tcl_GetObjResult(interp), 1); - - done: - Tcl_DStringFree(&stringBuffer); - Tcl_DStringFree(&valueBuffer); - return result; + return TCL_OK; } /* @@ -319,11 +323,12 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv) int objc; /* Number of arguments. */ Tcl_Obj *CONST objv[]; /* Argument objects. */ { - int i, result, flags, all, stringLength, numMatches; + int i, result, cflags, all, wlen, numMatches, offset; Tcl_RegExp regExpr; - Tcl_DString resultBuffer, stringBuffer; - CONST Tcl_UniChar *w, *wStart, *wEnd; - char *string, *subspec, *varname; + Tcl_Obj *resultPtr, *varPtr, *objPtr; + Tcl_UniChar *wstring; + char *subspec; + static char *options[] = { "-all", "-nocase", "--", NULL }; @@ -331,7 +336,7 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv) REGSUB_ALL, REGSUB_NOCASE, REGSUB_LAST }; - flags = 0; + cflags = REG_ADVANCED; all = 0; for (i = 1; i < objc; i++) { @@ -352,7 +357,7 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv) break; } case REGSUB_NOCASE: { - flags |= REG_ICASE; + cflags |= REG_ICASE; break; } case REGSUB_LAST: { @@ -369,17 +374,21 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv) } objv += i; - regExpr = Tcl_GetRegExpFromObj(interp, objv[0], flags | REG_ADVANCED); + + regExpr = Tcl_GetRegExpFromObj(interp, objv[0], cflags); if (regExpr == NULL) { return TCL_ERROR; } result = TCL_OK; - string = Tcl_GetStringFromObj(objv[1], &stringLength); - subspec = Tcl_GetString(objv[2]); - varname = Tcl_GetString(objv[3]); + resultPtr = Tcl_NewObj(); + Tcl_IncrRefCount(resultPtr); - Tcl_DStringInit(&resultBuffer); + objPtr = objv[1]; + wlen = TclGetUnicodeLengthFromObj(objPtr); + wstring = TclGetUnicodeFromObj(objPtr); + subspec = Tcl_GetString(objv[2]); + varPtr = objv[3]; /* * The following loop is to handle multiple matches within the @@ -388,23 +397,22 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv) * then the loop body only gets executed once. */ - Tcl_DStringInit(&stringBuffer); - wStart = Tcl_UtfToUniCharDString(string, stringLength, &stringBuffer); - wEnd = wStart + Tcl_DStringLength(&stringBuffer) / sizeof(Tcl_UniChar); - numMatches = 0; - for (w = wStart; w < wEnd; ) { + offset = 0; + for (offset = 0; offset < wlen; ) { int start, end, subStart, subEnd, match; char *src, *firstChar; char c; + Tcl_RegExpInfo info; /* * The flags argument is set if string is part of a larger string, * so that "^" won't match. */ - match = TclRegExpExecUniChar(interp, regExpr, w, wEnd - w, 10, - ((w > wStart) ? REG_NOTBOL : 0)); + match = Tcl_RegExpMatchObj(interp, regExpr, objPtr, offset, + 10 /* matches */, ((offset > 0) ? REG_NOTBOL : 0)); + if (match < 0) { result = TCL_ERROR; goto done; @@ -419,9 +427,11 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv) * result variable. */ - TclRegExpRangeUniChar(regExpr, 0, &start, &end); - Tcl_UniCharToUtfDString(w, start, &resultBuffer); - + Tcl_RegExpGetInfo(regExpr, &info); + start = info.matches[0].start; + end = info.matches[0].end; + TclAppendUnicodeToObj(resultPtr, wstring + offset, start); + /* * Append the subSpec argument to the variable, making appropriate * substitutions. This code is a bit hairy because of the backslash @@ -441,9 +451,8 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv) if ((c >= '0') && (c <= '9')) { index = c - '0'; } else if ((c == '\\') || (c == '&')) { - Tcl_DStringAppend(&resultBuffer, firstChar, - src - firstChar); - Tcl_DStringAppend(&resultBuffer, &c, 1); + Tcl_AppendToObj(resultPtr, firstChar, src - firstChar); + Tcl_AppendToObj(resultPtr, &c, 1); firstChar = src + 2; src++; continue; @@ -454,12 +463,13 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv) continue; } if (firstChar != src) { - Tcl_DStringAppend(&resultBuffer, firstChar, src - firstChar); + Tcl_AppendToObj(resultPtr, firstChar, src - firstChar); } - TclRegExpRangeUniChar(regExpr, index, &subStart, &subEnd); + subStart = info.matches[index].start; + subEnd = info.matches[index].end; if ((subStart >= 0) && (subEnd >= 0)) { - Tcl_UniCharToUtfDString(w + subStart, subEnd - subStart, - &resultBuffer); + TclAppendUnicodeToObj(resultPtr, wstring + offset + subStart, + subEnd - subStart); } if (*src == '\\') { src++; @@ -467,7 +477,7 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv) firstChar = src + 1; } if (firstChar != src) { - Tcl_DStringAppend(&resultBuffer, firstChar, src - firstChar); + Tcl_AppendToObj(resultPtr, firstChar, src - firstChar); } if (end == 0) { /* @@ -475,10 +485,10 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv) * in order to prevent infinite loops. */ - Tcl_UniCharToUtfDString(w, 1, &resultBuffer); - w++; + TclAppendUnicodeToObj(resultPtr, wstring + offset, 1); + offset++; } - w += end; + offset += end; if (!all) { break; } @@ -489,13 +499,12 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv) * result variable. */ - if ((w < wEnd) || (numMatches == 0)) { - Tcl_UniCharToUtfDString(w, wEnd - w, &resultBuffer); + if ((offset < wlen) || (numMatches == 0)) { + TclAppendUnicodeToObj(resultPtr, wstring + offset, wlen - offset); } - if (Tcl_SetVar(interp, varname, Tcl_DStringValue(&resultBuffer), - 0) == NULL) { - Tcl_AppendResult(interp, "couldn't set variable \"", varname, "\"", - (char *) NULL); + if (Tcl_ObjSetVar2(interp, varPtr, NULL, resultPtr, 0) == NULL) { + Tcl_AppendResult(interp, "couldn't set variable \"", + Tcl_GetString(varPtr), "\"", (char *) NULL); result = TCL_ERROR; } else { /* @@ -507,8 +516,7 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv) } done: - Tcl_DStringFree(&stringBuffer); - Tcl_DStringFree(&resultBuffer); + Tcl_DecrRefCount(resultPtr); return result; } @@ -1496,7 +1504,7 @@ Tcl_StringObjCmd(dummy, interp, objc, objv) } else { uselen = length2; } - if ((uselen <= length1) && + if ((uselen > 0) && (uselen <= length1) && (str_comp_fn(string2, string1, uselen) == 0)) { /* * Adjust len to be full length of matched string diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 9eaf421..2a7ac93 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclDecls.h,v 1.14 1999/05/25 01:00:25 stanton Exp $ + * RCS: @(#) $Id: tclDecls.h,v 1.15 1999/06/10 04:28:50 stanton Exp $ */ #ifndef _TCLDECLS @@ -1168,6 +1168,13 @@ EXTERN int Tcl_UniCharIsGraph _ANSI_ARGS_((int ch)); EXTERN int Tcl_UniCharIsPrint _ANSI_ARGS_((int ch)); /* 375 */ EXTERN int Tcl_UniCharIsPunct _ANSI_ARGS_((int ch)); +/* 376 */ +EXTERN int Tcl_RegExpMatchObj _ANSI_ARGS_((Tcl_Interp * interp, + Tcl_RegExp regexp, Tcl_Obj * objPtr, + int offset, int nmatches, int flags)); +/* 377 */ +EXTERN void Tcl_RegExpGetInfo _ANSI_ARGS_((Tcl_RegExp regexp, + Tcl_RegExpInfo * infoPtr)); typedef struct TclStubHooks { struct TclPlatStubs *tclPlatStubs; @@ -1611,6 +1618,8 @@ typedef struct TclStubs { int (*tcl_UniCharIsGraph) _ANSI_ARGS_((int ch)); /* 373 */ int (*tcl_UniCharIsPrint) _ANSI_ARGS_((int ch)); /* 374 */ int (*tcl_UniCharIsPunct) _ANSI_ARGS_((int ch)); /* 375 */ + int (*tcl_RegExpMatchObj) _ANSI_ARGS_((Tcl_Interp * interp, Tcl_RegExp regexp, Tcl_Obj * objPtr, int offset, int nmatches, int flags)); /* 376 */ + void (*tcl_RegExpGetInfo) _ANSI_ARGS_((Tcl_RegExp regexp, Tcl_RegExpInfo * infoPtr)); /* 377 */ } TclStubs; #ifdef __cplusplus @@ -3151,6 +3160,14 @@ extern TclStubs *tclStubsPtr; #define Tcl_UniCharIsPunct \ (tclStubsPtr->tcl_UniCharIsPunct) /* 375 */ #endif +#ifndef Tcl_RegExpMatchObj +#define Tcl_RegExpMatchObj \ + (tclStubsPtr->tcl_RegExpMatchObj) /* 376 */ +#endif +#ifndef Tcl_RegExpGetInfo +#define Tcl_RegExpGetInfo \ + (tclStubsPtr->tcl_RegExpGetInfo) /* 377 */ +#endif #endif /* defined(USE_TCL_STUBS) && !defined(USE_TCL_STUB_PROCS) */ diff --git a/generic/tclInt.h b/generic/tclInt.h index 506c953..d30d439 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -11,7 +11,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclInt.h,v 1.31 1999/06/08 23:30:24 hershey Exp $ + * RCS: @(#) $Id: tclInt.h,v 1.32 1999/06/10 04:28:51 stanton Exp $ */ #ifndef _TCLINT @@ -1546,6 +1546,9 @@ EXTERN void TclAllocateFreeObjects _ANSI_ARGS_((void)); EXTERN Tcl_Obj * TclAppendObjToUnicodeObj _ANSI_ARGS_(( register Tcl_Obj *targetObjPtr, register Tcl_Obj *srcObjPtr)); +EXTERN void TclAppendUnicodeToObj _ANSI_ARGS_(( + register Tcl_Obj *objPtr, Tcl_UniChar *unichars, + int length)); EXTERN int TclArraySet _ANSI_ARGS_((Tcl_Interp *interp, Tcl_Obj *arrayNameObj, Tcl_Obj *arrayElemObj)); EXTERN int TclCleanupChildren _ANSI_ARGS_((Tcl_Interp *interp, @@ -1692,6 +1695,8 @@ EXTERN Var * TclLookupVar _ANSI_ARGS_((Tcl_Interp *interp, EXTERN int TclMathInProgress _ANSI_ARGS_((void)); EXTERN int TclNeedSpace _ANSI_ARGS_((char *start, char *end)); EXTERN Tcl_Obj * TclNewProcBodyObj _ANSI_ARGS_((Proc *procPtr)); +EXTERN Tcl_Obj * TclNewUnicodeObj _ANSI_ARGS_((Tcl_UniChar *unichars, + int numChars)); EXTERN int TclObjCommandComplete _ANSI_ARGS_((Tcl_Obj *cmdPtr)); EXTERN int TclObjInterpProc _ANSI_ARGS_((ClientData clientData, Tcl_Interp *interp, int objc, diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 2318780..7590c8d 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -10,7 +10,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclRegexp.c,v 1.6 1999/06/02 01:53:31 stanton Exp $ + * RCS: @(#) $Id: tclRegexp.c,v 1.7 1999/06/10 04:28:51 stanton Exp $ */ #include "tclInt.h" @@ -177,33 +177,21 @@ Tcl_RegExpExec(interp, re, string, start) * this identifies beginning of larger * string, so that "^" won't match. */ { - int result, numChars; - Tcl_DString stringBuffer; - Tcl_UniChar *uniString; - - TclRegexp *regexpPtr = (TclRegexp *) re; + int flags; /* - * Remember the UTF-8 string so Tcl_RegExpRange() can convert the - * matches from character to byte offsets. + * If the starting point is offset from the beginning of the buffer, + * then we need to tell the regexp engine not to match "^". */ - regexpPtr->string = string; - - Tcl_DStringInit(&stringBuffer); - uniString = Tcl_UtfToUniCharDString(string, -1, &stringBuffer); - numChars = Tcl_DStringLength(&stringBuffer) / sizeof(Tcl_UniChar); - - /* - * Perform the regexp match. - */ - - result = TclRegExpExecUniChar(interp, re, uniString, numChars, -1, - ((string > start) ? REG_NOTBOL : 0)); - - Tcl_DStringFree(&stringBuffer); + if (string > start) { + flags = REG_NOTBOL; + } else { + flags = 0; + } - return result; + return Tcl_RegExpMatchObj(interp, re, Tcl_NewStringObj(string, -1), + 0 /* offset */, -1 /* nmatches */, flags); } /* @@ -238,16 +226,16 @@ Tcl_RegExpRange(re, index, startPtr, endPtr) * in (sub-) range here. */ { TclRegexp *regexpPtr = (TclRegexp *) re; + char *string; if ((size_t) index > regexpPtr->re.re_nsub) { *startPtr = *endPtr = NULL; } else if (regexpPtr->matches[index].rm_so < 0) { *startPtr = *endPtr = NULL; } else { - *startPtr = Tcl_UtfAtIndex(regexpPtr->string, - regexpPtr->matches[index].rm_so); - *endPtr = Tcl_UtfAtIndex(regexpPtr->string, - regexpPtr->matches[index].rm_eo); + string = Tcl_GetString(regexpPtr->objPtr); + *startPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_so); + *endPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_eo); } } @@ -290,8 +278,9 @@ TclRegExpExecUniChar(interp, re, wString, numChars, nmatches, flags) size_t last = regexpPtr->re.re_nsub + 1; size_t nm = last; - if (nmatches >= 0 && (size_t) nmatches < nm) + if (nmatches >= 0 && (size_t) nmatches < nm) { nm = (size_t) nmatches; + } status = TclReExec(®expPtr->re, wString, (size_t) numChars, ®expPtr->details, nm, regexpPtr->matches, flags); @@ -398,6 +387,100 @@ Tcl_RegExpMatch(interp, string, pattern) /* *---------------------------------------------------------------------- * + * Tcl_RegExpMatchObj -- + * + * Match a precompiled regexp against the given object. + * + * Results: + * If an error occurs during the matching operation then -1 + * is returned and the interp's result contains an error message. + * Otherwise the return value is 1 if "string" matches "pattern" + * and 0 otherwise. + * + * Side effects: + * Converts the object to a Unicode object. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_RegExpMatchObj(interp, re, objPtr, offset, nmatches, flags) + Tcl_Interp *interp; /* Interpreter to use for error reporting. */ + Tcl_RegExp re; /* Compiled regular expression; must have + * been returned by previous call to + * Tcl_GetRegExpFromObj. */ + Tcl_Obj *objPtr; /* String against which to match re. */ + int offset; /* Character index that marks where matching + * should begin. */ + int nmatches; /* How many subexpression matches (counting + * the whole match as subexpression 0) are + * of interest. -1 means all of them. */ + int flags; /* Regular expression execution flags. */ +{ + TclRegexp *regexpPtr = (TclRegexp *) re; + Tcl_Obj *oldPtr = regexpPtr->objPtr; + Tcl_UniChar *udata; + int length; + + /* + * Bump the refcount before we do anything in case the object + * was newly created. + */ + + Tcl_IncrRefCount(objPtr); + + udata = TclGetUnicodeFromObj(objPtr); + length = TclGetUnicodeLengthFromObj(objPtr); + + /* + * Save the target object so we can extract strings from it later. + */ + + regexpPtr->objPtr = objPtr; + if (oldPtr) { + Tcl_DecrRefCount(oldPtr); + } + + if (offset > length) { + offset = length; + } + udata += offset; + length -= offset; + + return TclRegExpExecUniChar(interp, re, udata, length, nmatches, flags); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_RegExpGetInfo -- + * + * Retrieve information about the current match. + * + * Results: + * None. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +void +Tcl_RegExpGetInfo(regexp, infoPtr) + Tcl_RegExp regexp; /* Pattern from which to get subexpressions. */ + Tcl_RegExpInfo *infoPtr; /* Match information is stored here. */ +{ + TclRegexp *regexpPtr = (TclRegexp *) regexp; + + infoPtr->nsubs = regexpPtr->re.re_nsub; + infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches; + infoPtr->extendStart = regexpPtr->details.rm_extend.rm_so; +} + +/* + *---------------------------------------------------------------------- + * * TclRegExpMatchObj -- * * See if a string matches a regular expression pattern object. @@ -790,6 +873,9 @@ CompileRegexp(interp, string, length, flags) */ regexpPtr = (TclRegexp *) ckalloc(sizeof(TclRegexp)); + regexpPtr->objPtr = NULL; + regexpPtr->details.rm_extend.rm_so = -1; + regexpPtr->details.rm_extend.rm_eo = -1; /* * Get the up-to-date string representation and map to unicode. diff --git a/generic/tclRegexp.h b/generic/tclRegexp.h index 7a6fb2a..5cee78e 100644 --- a/generic/tclRegexp.h +++ b/generic/tclRegexp.h @@ -10,7 +10,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclRegexp.h,v 1.8 1999/06/02 01:53:31 stanton Exp $ + * RCS: @(#) $Id: tclRegexp.h,v 1.9 1999/06/10 04:28:51 stanton Exp $ */ #ifndef _TCLREGEXP @@ -33,10 +33,10 @@ typedef struct TclRegexp { int flags; /* Regexp compile flags. */ regex_t re; /* Compiled re, includes number of * subexpressions. */ - CONST char *string; /* Last string matched with this regexp - * (UTF-8), so Tcl_RegExpRange() can convert - * the matches from character indices to UTF-8 - * byte offsets. */ + Tcl_Obj *objPtr; /* Last object match with this regexp, so + * Tcl_RegExpRange() can convert the matches + * from character indices to UTF-8 byte + * offsets. */ regmatch_t *matches; /* Array of indices into the Tcl_UniChar * representation of the last string matched * with this regexp to indicate the location @@ -53,8 +53,6 @@ typedef struct TclRegexp { EXTERN int TclRegAbout _ANSI_ARGS_((Tcl_Interp *interp, Tcl_RegExp re)); -EXTERN VOID TclRegXflags _ANSI_ARGS_((char *string, int length, - int *cflagsPtr, int *eflagsPtr)); EXTERN int TclRegExpExecUniChar _ANSI_ARGS_((Tcl_Interp *interp, Tcl_RegExp re, CONST Tcl_UniChar *uniString, int numChars, int nmatches, int flags)); diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 1bcfb63..9a3b4f3 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclStubInit.c,v 1.16 1999/05/25 01:00:27 stanton Exp $ + * RCS: @(#) $Id: tclStubInit.c,v 1.17 1999/06/10 04:28:51 stanton Exp $ */ #include "tclInt.h" @@ -756,6 +756,8 @@ TclStubs tclStubs = { Tcl_UniCharIsGraph, /* 373 */ Tcl_UniCharIsPrint, /* 374 */ Tcl_UniCharIsPunct, /* 375 */ + Tcl_RegExpMatchObj, /* 376 */ + Tcl_RegExpGetInfo, /* 377 */ }; /* !END!: Do not edit above this line. */ diff --git a/generic/tclUnicodeObj.c b/generic/tclUnicodeObj.c index 3a4709b..315644d 100644 --- a/generic/tclUnicodeObj.c +++ b/generic/tclUnicodeObj.c @@ -9,7 +9,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclUnicodeObj.c,v 1.4 1999/06/09 17:06:57 hershey Exp $ + * RCS: @(#) $Id: tclUnicodeObj.c,v 1.5 1999/06/10 04:28:51 stanton Exp $ */ #include @@ -20,23 +20,19 @@ * Prototypes for local procedures defined in this file: */ +static int AllSingleByteChars _ANSI_ARGS_((Tcl_Obj *objPtr)); +static void AppendUniCharStrToObj _ANSI_ARGS_((Tcl_Obj *objPtr, + Tcl_UniChar *unichars, int numNewChars)); static void DupUnicodeInternalRep _ANSI_ARGS_((Tcl_Obj *srcPtr, Tcl_Obj *copyPtr)); static void FreeUnicodeInternalRep _ANSI_ARGS_((Tcl_Obj *objPtr)); static void UpdateStringOfUnicode _ANSI_ARGS_((Tcl_Obj *objPtr)); -static int SetUnicodeFromAny _ANSI_ARGS_((Tcl_Interp *interp, - Tcl_Obj *objPtr)); - -static int AllSingleByteChars _ANSI_ARGS_((Tcl_Obj *objPtr)); -static void TclAppendUniCharStrToObj _ANSI_ARGS_(( - register Tcl_Obj *objPtr, Tcl_UniChar *unichars, - int numChars)); -static Tcl_Obj * TclNewUnicodeObj _ANSI_ARGS_((Tcl_UniChar *unichars, - int numChars)); static void SetOptUnicodeFromAny _ANSI_ARGS_((Tcl_Obj *objPtr, int numChars)); static void SetFullUnicodeFromAny _ANSI_ARGS_((Tcl_Obj *objPtr, char *src, int numBytes, int numChars)); +static int SetUnicodeFromAny _ANSI_ARGS_((Tcl_Interp *interp, + Tcl_Obj *objPtr)); /* * The following object type represents a Unicode string. A Unicode string @@ -68,13 +64,13 @@ typedef struct Unicode { int numChars; /* The number of chars in the unicode * string. */ size_t allocated; /* The amount of space actually allocated. */ - unsigned char chars[4]; /* The array of chars. The actual size of + Tcl_UniChar chars[2]; /* The array of chars. The actual size of * this field depends on the 'allocated' field * above. */ } Unicode; #define UNICODE_SIZE(len) \ - ((unsigned) (sizeof(Unicode) - 4 + (len))) + ((unsigned) (sizeof(Unicode) - (sizeof(Tcl_UniChar)*2) + (len))) #define GET_UNICODE(objPtr) \ ((Unicode *) (objPtr)->internalRep.otherValuePtr) #define SET_UNICODE(objPtr, unicodePtr) \ @@ -104,7 +100,6 @@ Tcl_UniChar * TclGetUnicodeFromObj(objPtr) Tcl_Obj *objPtr; /* The object to find the unicode string for. */ { - Tcl_UniChar *unicharPtr; Unicode *unicodePtr; int numBytes; char *src; @@ -124,9 +119,15 @@ TclGetUnicodeFromObj(objPtr) src = Tcl_GetStringFromObj(objPtr, &numBytes); SetFullUnicodeFromAny(objPtr, src, numBytes, unicodePtr->numChars); + + /* + * We need to fetch the pointer again because we have just + * reallocated the structure to make room for the Unicode data. + */ + + unicodePtr = GET_UNICODE(objPtr); } - unicharPtr = (Tcl_UniChar *)unicodePtr->chars; - return unicharPtr; + return unicodePtr->chars; } /* @@ -185,7 +186,7 @@ TclGetUniCharFromObj(objPtr, index) Tcl_Obj *objPtr; /* The Unicode object. */ int index; /* Get the index'th character. */ { - Tcl_UniChar *unicharPtr, unichar; + Tcl_UniChar unichar; Unicode *unicodePtr; int length; @@ -206,8 +207,7 @@ TclGetUniCharFromObj(objPtr, index) str = Tcl_GetStringFromObj(objPtr, &length); Tcl_UtfToUniChar(&str[index], &unichar); } else { - unicharPtr = (Tcl_UniChar *)unicodePtr->chars; - unichar = unicharPtr[index]; + unichar = unicodePtr->chars[index]; } return unichar; } @@ -217,11 +217,11 @@ TclGetUniCharFromObj(objPtr, index) * * TclGetRangeFromObj -- * - * Create a Tcl Object that contains the chars between first and - * last of the object indicated by "objPtr". If the object is not - * already a Unicode object, an attempt will be made to convert it - * to one. The first and last indices are assumed to be in the - * appropriate range. + * Create a Tcl Object that contains the chars between first and last + * of the object indicated by "objPtr". If the object is not already + * a Unicode object, an attempt will be made to convert it to one. + * The first and last indices are assumed to be in the appropriate + * range. * * Results: * Returns a new Tcl Object of either "string" or "unicode" type, @@ -241,7 +241,6 @@ TclGetRangeFromObj(objPtr, first, last) int last; /* Last index of the range. */ { Tcl_Obj *newObjPtr; /* The Tcl object to find the range of. */ - Tcl_UniChar *unicharPtr; Unicode *unicodePtr; int length; @@ -250,8 +249,7 @@ TclGetRangeFromObj(objPtr, first, last) length = objPtr->length; if (unicodePtr->numChars != length) { - unicharPtr = (Tcl_UniChar *)unicodePtr->chars; - newObjPtr = TclNewUnicodeObj(&unicharPtr[first], last-first+1); + newObjPtr = TclNewUnicodeObj(unicodePtr->chars + first, last-first+1); } else { int length; char *str; @@ -273,7 +271,7 @@ TclGetRangeFromObj(objPtr, first, last) * * TclAppendObjToUnicodeObj -- * - * This procedure appends the contest of "srcObjPtr" to the Unicode + * This procedure appends the contents of "srcObjPtr" to the Unicode * object "destPtr". * * Results: @@ -367,7 +365,7 @@ TclAppendObjToUnicodeObj(targetObjPtr, srcObjPtr) } else { unicodePtr = GET_UNICODE(srcObjPtr); numChars = unicodePtr->numChars; - unicharSrcStr = (Tcl_UniChar *)unicodePtr->chars; + unicharSrcStr = unicodePtr->chars; } } else { utfSrcStr = Tcl_GetStringFromObj(srcObjPtr, &numBytes); @@ -383,7 +381,7 @@ TclAppendObjToUnicodeObj(targetObjPtr, srcObjPtr) * Append the unichar src string to the result object. */ - TclAppendUniCharStrToObj(resultObjPtr, unicharSrcStr, numChars); + AppendUniCharStrToObj(resultObjPtr, unicharSrcStr, numChars); Tcl_DStringFree(&dsPtr); return resultObjPtr; } @@ -391,7 +389,7 @@ TclAppendObjToUnicodeObj(targetObjPtr, srcObjPtr) /* *---------------------------------------------------------------------- * - * TclAppendUniCharStrToObj -- + * AppendUniCharStrToObj -- * * This procedure appends the contents of "srcObjPtr" to the * Unicode object "objPtr". @@ -406,31 +404,25 @@ TclAppendObjToUnicodeObj(targetObjPtr, srcObjPtr) *---------------------------------------------------------------------- */ -void -TclAppendUniCharStrToObj(objPtr, unichars, numNewChars) +static void +AppendUniCharStrToObj(objPtr, unichars, numNewChars) register Tcl_Obj *objPtr; /* Points to the object to append to. */ Tcl_UniChar *unichars; /* The unicode string to append to the * object. */ int numNewChars; /* Number of chars in "unichars". */ { Unicode *unicodePtr; - int usedBytes, numNewBytes, totalNumBytes, totalNumChars; - - /* - * Invalidate the StringRep. - */ - - Tcl_InvalidateStringRep(objPtr); + int numChars; + size_t numBytes; + SetUnicodeFromAny(NULL, objPtr); unicodePtr = GET_UNICODE(objPtr); - usedBytes = unicodePtr->numChars * sizeof(Tcl_UniChar); - totalNumChars = numNewChars + unicodePtr->numChars; - totalNumBytes = totalNumChars * sizeof(Tcl_UniChar); - numNewBytes = numNewChars * sizeof(Tcl_UniChar); + numChars = numNewChars + unicodePtr->numChars; + numBytes = (numChars + 1) * sizeof(Tcl_UniChar); - if (unicodePtr->allocated <= totalNumBytes) { - int allocatedBytes = totalNumBytes * 2; + if (unicodePtr->allocated < numBytes) { + int allocatedBytes = numBytes * 2; /* * There isn't currently enough space in the Unicode @@ -439,15 +431,101 @@ TclAppendUniCharStrToObj(objPtr, unichars, numNewChars) * having to reallocate again. */ - unicodePtr = (Unicode *) ckrealloc(unicodePtr, + unicodePtr = (Unicode *) ckrealloc((char*) unicodePtr, UNICODE_SIZE(allocatedBytes)); unicodePtr->allocated = allocatedBytes; unicodePtr = SET_UNICODE(objPtr, unicodePtr); } - memcpy((VOID *) (unicodePtr->chars + usedBytes), - (VOID *) unichars, (size_t) numNewBytes); - *((Tcl_UniChar *)unicodePtr->chars + totalNumChars) = 0; - unicodePtr->numChars = totalNumChars; + memcpy((VOID *) (unicodePtr->chars + unicodePtr->numChars), + (VOID *) unichars, (size_t) numNewChars * sizeof(Tcl_UniChar)); + unicodePtr->chars[numChars] = 0; + unicodePtr->numChars = numChars; + + /* + * Invalidate the StringRep. + */ + + Tcl_InvalidateStringRep(objPtr); +} + +/* + *---------------------------------------------------------------------- + * + * TclAppendUnicodeToObj -- + * + * This procedure appends a Unicode string to an object in the + * most efficient manner possible. + * + * Results: + * None. + * + * Side effects: + * Invalidates the string rep and creates a new Unicode string. + * + *---------------------------------------------------------------------- + */ + +void +TclAppendUnicodeToObj(objPtr, unichars, length) + register Tcl_Obj *objPtr; /* Points to the object to append to. */ + Tcl_UniChar *unichars; /* The unicode string to append to the + * object. */ + int length; /* Number of chars in "unichars". */ +{ + Unicode *unicodePtr; + int numChars, i; + size_t newSize; + char *src; + Tcl_UniChar *dst; + + if (Tcl_IsShared(objPtr)) { + panic("TclAppendUnicodeToObj called with shared object"); + } + + SetUnicodeFromAny(NULL, objPtr); + unicodePtr = GET_UNICODE(objPtr); + + /* + * Make the buffer big enough for the result. + */ + + numChars = unicodePtr->numChars + length; + newSize = (numChars + 1) * sizeof(Tcl_UniChar); + + if (newSize > unicodePtr->allocated) { + int allocated = newSize * 2; + + unicodePtr = (Unicode *) ckrealloc((char*)unicodePtr, + UNICODE_SIZE(allocated)); + + if (unicodePtr->allocated == 0) { + /* + * If the original string was not in Unicode form, add it to the + * beginning of the buffer. + */ + + src = objPtr->bytes; + dst = unicodePtr->chars; + for (i = 0; i < unicodePtr->numChars; i++) { + src += Tcl_UtfToUniChar(src, dst++); + } + } + unicodePtr->allocated = allocated; + } + + /* + * Copy the new string onto the end of the old string, then add the + * trailing null. + */ + + memcpy((VOID*) (unicodePtr->chars + unicodePtr->numChars), unichars, + length * sizeof(Tcl_UniChar)); + unicodePtr->numChars = numChars; + unicodePtr->chars[numChars] = 0; + + SET_UNICODE(objPtr, unicodePtr); + + Tcl_InvalidateStringRep(objPtr); } /* @@ -497,7 +575,7 @@ TclNewUnicodeObj(unichars, numChars) unicodePtr->numChars = numChars; unicodePtr->allocated = allocated; memcpy((VOID *) unicodePtr->chars, (VOID *) unichars, (size_t) numBytes); - *((Tcl_UniChar *)unicodePtr->chars + numChars) = 0; + unicodePtr->chars[numChars] = 0; SET_UNICODE(objPtr, unicodePtr); return objPtr; } @@ -572,12 +650,10 @@ DupUnicodeInternalRep(srcPtr, copyPtr) */ if (AllSingleByteChars(srcPtr)) { - copyUnicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(4)); + copyUnicodePtr = (Unicode *) ckalloc(sizeof(Unicode)); + copyUnicodePtr->allocated = 0; } else { int allocated = srcUnicodePtr->allocated; - Tcl_UniChar *unichars; - - unichars = (Tcl_UniChar *)srcUnicodePtr->chars; copyUnicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(allocated)); @@ -624,7 +700,7 @@ UpdateStringOfUnicode(objPtr) Unicode *unicodePtr; unicodePtr = GET_UNICODE(objPtr); - src = (Tcl_UniChar *) unicodePtr->chars; + src = unicodePtr->chars; length = unicodePtr->numChars * sizeof(Tcl_UniChar); /* @@ -672,16 +748,20 @@ SetOptUnicodeFromAny(objPtr, numChars) { Tcl_ObjType *typePtr; Unicode *unicodePtr; - - unicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(4)); - unicodePtr->numChars = numChars; - unicodePtr->allocated = 0; typePtr = objPtr->typePtr; if ((typePtr != NULL) && (typePtr->freeIntRepProc) != NULL) { (*typePtr->freeIntRepProc)(objPtr); } objPtr->typePtr = &tclUnicodeType; + + /* + * Allocate enough space for the basic Unicode structure. + */ + + unicodePtr = (Unicode *) ckalloc(sizeof(Unicode)); + unicodePtr->numChars = numChars; + unicodePtr->allocated = 0; SET_UNICODE(objPtr, unicodePtr); } @@ -719,7 +799,7 @@ SetFullUnicodeFromAny(objPtr, src, numBytes, numChars) unicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(length)); srcEnd = src + numBytes; - for (dst = (Tcl_UniChar *) unicodePtr->chars; src < srcEnd; dst++) { + for (dst = unicodePtr->chars; src < srcEnd; dst++) { src += Tcl_UtfToUniChar(src, dst); } *dst = 0; @@ -747,10 +827,10 @@ SetFullUnicodeFromAny(objPtr, src, numBytes, numChars) * * Side effects: * A Unicode object is stored as the internal rep of objPtr. The Unicode - * ojbect is opitmized for the case where each UTF char in a string is only - * one byte. In this case, we store the value of numChars, but we don't copy - * the bytes to the unicodeObj->chars. Before accessing obj->chars, check if - * all chars are 1 byte long. + * object is opitmized for the case where each UTF char in a string is + * only one byte. In this case, we store the value of numChars, but we + * don't copy the bytes to the unicodeObj->chars. Before accessing + * obj->chars, check if all chars are 1 byte long. * *--------------------------------------------------------------------------- */ @@ -760,12 +840,10 @@ SetUnicodeFromAny(interp, objPtr) Tcl_Interp *interp; /* Not used. */ Tcl_Obj *objPtr; /* The object to convert to type Unicode. */ { - Tcl_ObjType *typePtr; int numBytes, numChars; char *src; - typePtr = objPtr->typePtr; - if (typePtr != &tclUnicodeType) { + if (objPtr->typePtr != &tclUnicodeType) { src = Tcl_GetStringFromObj(objPtr, &numBytes); numChars = Tcl_NumUtfChars(src, numBytes); diff --git a/tests/string.test b/tests/string.test index 235dba8..5200b48 100644 --- a/tests/string.test +++ b/tests/string.test @@ -11,7 +11,7 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # -# RCS: @(#) $Id: string.test,v 1.12 1999/06/08 02:59:28 hershey Exp $ +# RCS: @(#) $Id: string.test,v 1.13 1999/06/10 04:28:52 stanton Exp $ if {[lsearch [namespace children] ::tcltest] == -1} { source [file join [pwd] [file dirname [info script]] defs.tcl] @@ -648,6 +648,9 @@ test string-10.12 {string map, unicode} { test string-10.13 {string map, -nocase unicode} { string map -nocase [list \374 ue UE \334] "a\374ueUE\000EU" } aue\334\334\0EU +test string-10.14 {string map, -nocase null arguments} { + string map -nocase {{} abc} foo +} foo test string-11.1 {string match, too few args} { list [catch {string match a} msg] $msg -- cgit v0.12