From f819f7990a8794c9429eb83b8f912950b85d0a91 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 13 Feb 2009 03:22:51 +0000 Subject: * generic/tclStringObj.c: Rewrites of the routines Tcl_GetCharLength, Tcl_GetUniChar, Tcl_GetUnicodeFromObj, Tcl_GetRange, and TclStringObjReverse to use the new macro, and to more simply and clearly split the cases depending on whether a valid unicode rep is present or needs to be created. * generic/tclInt.h: New macro TclNumUtfChars meant to be a faster replacement for a full Tcl_NumUtfChars() call when the string has all single-byte characters. --- ChangeLog | 10 ++ generic/tclInt.h | 27 +++++- generic/tclStringObj.c | 250 +++++++++++++++++++------------------------------ 3 files changed, 134 insertions(+), 153 deletions(-) diff --git a/ChangeLog b/ChangeLog index 3bae1b1..803ee95 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,15 @@ 2009-02-12 Don Porter + * generic/tclStringObj.c: Rewrites of the routines + Tcl_GetCharLength, Tcl_GetUniChar, Tcl_GetUnicodeFromObj, + Tcl_GetRange, and TclStringObjReverse to use the new macro, and + to more simply and clearly split the cases depending on whether + a valid unicode rep is present or needs to be created. + + * generic/tclInt.h: New macro TclNumUtfChars meant to be a faster + replacement for a full Tcl_NumUtfChars() call when the string has all + single-byte characters. + * generic/tclStringObj.c: Simplified Tcl_GetCharLength by * generic/tclTestObj.c: removing code that did nothing. Added early returns from Tcl_*SetObjLength when the desired length diff --git a/generic/tclInt.h b/generic/tclInt.h index dba84fb..3de0ea2 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -15,7 +15,7 @@ * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. 
* - * RCS: @(#) $Id: tclInt.h,v 1.416 2009/02/03 18:48:25 dkf Exp $ + * RCS: @(#) $Id: tclInt.h,v 1.417 2009/02/13 03:22:52 dgp Exp $ */ #ifndef _TCLINT @@ -3805,6 +3805,31 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr); /* *---------------------------------------------------------------- + * Macro counterpart of the Tcl_NumUtfChars() function, for speed-sensitive + * points: it avoids the function call when all characters are one byte. + * numChars is an lvalue receiving the count; bytes and numBytes are each + * evaluated more than once. The ANSI C "prototype" for this macro is: + * + * MODULE_SCOPE void TclNumUtfChars(int numChars, const char *bytes, + * int numBytes); + *---------------------------------------------------------------- + */ + +#define TclNumUtfChars(numChars, bytes, numBytes) \ + do { \ + int _count, _i = (numBytes); \ + unsigned char *_str = (unsigned char *) (bytes); \ + while (_i && (*_str < 0xC0)) { _i--; _str++; } \ + _count = (numBytes) - _i; \ + if (_i) { \ + _count += Tcl_NumUtfChars((bytes) + _count, _i); \ + } \ + (numChars) = _count; \ + } while (0) + + +/* + *---------------------------------------------------------------- * Macro used by the Tcl core to compare Unicode strings. On big-endian * systems we can use the more efficient memcmp, but this would not be * lexically correct on little-endian systems. The ANSI C "prototype" for diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 1a8a395..f6c3bc8 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -33,7 +33,7 @@ * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. 
* - * RCS: @(#) $Id: tclStringObj.c,v 1.101 2009/02/12 17:08:45 dgp Exp $ */ + * RCS: @(#) $Id: tclStringObj.c,v 1.102 2009/02/13 03:22:52 dgp Exp $ */ #include "tclInt.h" #include "tommath.h" @@ -397,44 +397,28 @@ Tcl_GetCharLength( stringPtr = GET_STRING(objPtr); numChars = stringPtr->numChars; - /* - * If numChars is unknown, then calculate the number of characaters while - * populating the Unicode string. - */ - + /* If numChars is unknown, compute it. */ if (numChars == -1) { - register int i = objPtr->length; - register unsigned char *str = (unsigned char *) objPtr->bytes; + TclNumUtfChars(numChars, objPtr->bytes, objPtr->length); + stringPtr->numChars = numChars; /* - * This is a speed sensitive function, so run specially over the - * string to count continuous ascii characters before resorting to the - * Tcl_NumUtfChars call. This is a long form of: - stringPtr->numChars = Tcl_NumUtfChars(objPtr->bytes,objPtr->length); - * - * TODO: Consider macro-izing this. + * Disabled the auto-fill of the unicode rep when multi-byte + * characters have been detected, on the YAGNI principle. */ - - while (i && (*str < 0xC0)) { - i--; - str++; - } - numChars = objPtr->length - i; - if (i) { - numChars += Tcl_NumUtfChars(objPtr->bytes - + (objPtr->length - i), i); - } - - stringPtr->numChars = numChars; +#if 0 if (numChars < objPtr->length) { /* - * Since we've just calucalated the number of chars, and not all + * Since we've just computed the number of chars, and not all * UTF chars are 1-byte long, go ahead and populate the unicode * string. + * + * TODO: Examine does this really help? How? */ FillUnicodeRep(objPtr); } +#endif } return numChars; } @@ -462,7 +446,6 @@ Tcl_GetUniChar( * from. */ int index) /* Get the index'th Unicode character. 
*/ { - Tcl_UniChar unichar; String *stringPtr; /* @@ -484,33 +467,18 @@ Tcl_GetUniChar( SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); - if (stringPtr->numChars == -1) { - /* - * We haven't yet calculated the length, so we don't have the Unicode - * str. We need to know the number of chars before we can do indexing. - */ - - Tcl_GetCharLength(objPtr); - - /* - * We need to fetch the pointer again because we may have just - * reallocated the structure. - */ - - stringPtr = GET_STRING(objPtr); - } if (stringPtr->hasUnicode == 0) { - /* - * All of the characters in the Utf string are 1 byte chars, so we - * don't store the unicode char. We get the Utf string and convert the - * index'th byte to a Unicode character. - */ - - unichar = (Tcl_UniChar) objPtr->bytes[index]; - } else { - unichar = stringPtr->unicode[index]; + /* If numChars is unknown, compute it. */ + if (stringPtr->numChars == -1) { + TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); + } + if (stringPtr->numChars == objPtr->length) { + return (Tcl_UniChar) objPtr->bytes[index]; + } + FillUnicodeRep(objPtr); + stringPtr = GET_STRING(objPtr); } - return unichar; + return stringPtr->unicode[index]; } /* @@ -572,22 +540,8 @@ Tcl_GetUnicodeFromObj( SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); - if ((stringPtr->numChars == -1) || (stringPtr->hasUnicode == 0)) { - /* - * We haven't yet calculated the length, or all of the characters in - * the Utf string are 1 byte chars (so we didn't store the unicode - * str). Since this function must return a unicode string, and one has - * not yet been stored, force the Unicode to be calculated and stored - * now. - */ - + if (stringPtr->hasUnicode == 0) { FillUnicodeRep(objPtr); - - /* - * We need to fetch the pointer again because we have just reallocated - * the structure to make room for the Unicode data. 
- */ - stringPtr = GET_STRING(objPtr); } @@ -644,47 +598,25 @@ Tcl_GetRange( SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); - if (stringPtr->numChars == -1) { - /* - * We haven't yet calculated the length, so we don't have the Unicode - * str. We need to know the number of chars before we can do indexing. - */ - - Tcl_GetCharLength(objPtr); - - /* - * We need to fetch the pointer again because we may have just - * reallocated the structure. - */ - + if (stringPtr->hasUnicode == 0) { + /* If numChars is unknown, compute it. */ + if (stringPtr->numChars == -1) { + TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); + } + if (stringPtr->numChars == objPtr->length) { + newObjPtr = Tcl_NewStringObj(objPtr->bytes + first, last-first+1); + + /* Since we know the char length of the result, store it. */ + SetStringFromAny(NULL, newObjPtr); + stringPtr = GET_STRING(newObjPtr); + stringPtr->numChars = newObjPtr->length; + return newObjPtr; + } + FillUnicodeRep(objPtr); stringPtr = GET_STRING(objPtr); } - if (objPtr->bytes && (stringPtr->numChars == objPtr->length)) { - const char *str = TclGetString(objPtr); - - /* - * All of the characters in the Utf string are 1 byte chars, so we - * don't store the unicode char. Create a new string object containing - * the specified range of chars. - */ - - newObjPtr = Tcl_NewStringObj(str+first, last-first+1); - - /* - * Since we know the new string only has 1-byte chars, we can set it's - * numChars field. - */ - - SetStringFromAny(NULL, newObjPtr); - stringPtr = GET_STRING(newObjPtr); - /* TODO: validity check! 
*/ - stringPtr->numChars = last-first+1; - } else { - newObjPtr = Tcl_NewUnicodeObj(stringPtr->unicode + first, - last-first+1); - } - return newObjPtr; + return Tcl_NewUnicodeObj(stringPtr->unicode + first, last-first+1); } /* @@ -2615,65 +2547,79 @@ TclStringObjReverse( Tcl_Obj *objPtr) { String *stringPtr; - int numChars = Tcl_GetCharLength(objPtr); - int i = 0, lastCharIdx = numChars - 1; - char *bytes; - - if (numChars <= 1) { - return objPtr; - } + char *src = NULL, *dest = NULL; + Tcl_UniChar *usrc = NULL, *udest = NULL; + Tcl_Obj *resultPtr = NULL; + SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); - if (stringPtr->hasUnicode) { - Tcl_UniChar *source = stringPtr->unicode; - - if (Tcl_IsShared(objPtr)) { - Tcl_UniChar *dest, ch = 0; - - /* - * Create a non-empty, pure unicode value, so we can coax - * Tcl_SetObjLength into growing the unicode rep buffer. - */ - Tcl_Obj *resultPtr = Tcl_NewUnicodeObj(&ch, 1); - Tcl_SetObjLength(resultPtr, numChars); - dest = Tcl_GetUnicode(resultPtr); - - while (i < numChars) { - dest[i++] = source[lastCharIdx--]; - } - return resultPtr; + if (stringPtr->hasUnicode == 0) { + if (stringPtr->numChars == -1) { + TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); } - - while (i < lastCharIdx) { - Tcl_UniChar tmp = source[lastCharIdx]; - source[lastCharIdx--] = source[i]; - source[i++] = tmp; + if (stringPtr->numChars <= 1) { + return objPtr; } - TclInvalidateStringRep(objPtr); - stringPtr->allocated = 0; + if (stringPtr->numChars == objPtr->length) { + /* All one-byte chars. Reverse in objPtr->bytes. */ + if (Tcl_IsShared(objPtr)) { + resultPtr = Tcl_NewObj(); + Tcl_SetObjLength(resultPtr, objPtr->length); + dest = TclGetString(resultPtr); + src = objPtr->bytes + objPtr->length - 1; + while (src >= objPtr->bytes) { + *dest++ = *src--; + } + return resultPtr; + } + /* Unshared. Reverse objPtr->bytes in place. 
*/ + dest = objPtr->bytes; + src = dest + objPtr->length - 1; + while (dest < src) { + char tmp = *src; + *src-- = *dest; + *dest++ = tmp; + } + return objPtr; + } + FillUnicodeRep(objPtr); + stringPtr = GET_STRING(objPtr); + } + if (stringPtr->numChars <= 1) { return objPtr; } - /* TODO: Document the dangers here! */ - - bytes = TclGetString(objPtr); + /* Reverse the Unicode rep. */ if (Tcl_IsShared(objPtr)) { - char *dest; - Tcl_Obj *resultPtr = Tcl_NewObj(); - Tcl_SetObjLength(resultPtr, numChars); - dest = TclGetString(resultPtr); - while (i < numChars) { - dest[i++] = bytes[lastCharIdx--]; + Tcl_UniChar ch = 0; + + /* + * Create a non-empty, pure unicode value, so we can coax + * Tcl_SetObjLength into growing the unicode rep buffer. + */ + + resultPtr = Tcl_NewUnicodeObj(&ch, 1); + Tcl_SetObjLength(resultPtr, stringPtr->numChars); + udest = Tcl_GetUnicode(resultPtr); + usrc = stringPtr->unicode + stringPtr->numChars - 1; + while (usrc >= stringPtr->unicode) { + *udest++ = *usrc--; } return resultPtr; } - while (i < lastCharIdx) { - char tmp = bytes[lastCharIdx]; - bytes[lastCharIdx--] = bytes[i]; - bytes[i++] = tmp; + /* Unshared. Reverse the Unicode rep in place. */ + udest = stringPtr->unicode; + usrc = udest + stringPtr->numChars - 1; + while (udest < usrc) { + Tcl_UniChar tmp = *usrc; + *usrc-- = *udest; + *udest++ = tmp; } + + TclInvalidateStringRep(objPtr); + stringPtr->allocated = 0; return objPtr; } @@ -2720,7 +2666,7 @@ ExtendUnicodeRepWithString( numOrigChars = stringPtr->numChars; } if (numAppendChars == -1) { - numAppendChars = Tcl_NumUtfChars(bytes, numBytes); + TclNumUtfChars(numAppendChars, bytes, numBytes); } needed = numOrigChars + numAppendChars; if (needed < 0) { -- cgit v0.12