diff options
Diffstat (limited to 'generic/tclUtf.c')
| -rw-r--r-- | generic/tclUtf.c | 189 |
1 files changed, 114 insertions, 75 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index b9e1226..e5497a4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -59,7 +59,7 @@ * UTF-8. */ -static const unsigned char totalBytes[256] = { +static CONST unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -231,13 +231,13 @@ Tcl_UniCharToUtf( char * Tcl_UniCharToUtfDString( - const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ - size_t uniLength, /* Length of Unicode string in Tcl_UniChars + CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ + int uniLength, /* Length of Unicode string in Tcl_UniChars * (must be >= 0). */ Tcl_DString *dsPtr) /* UTF-8 representation of string is appended * to this previously initialized DString. */ { - const Tcl_UniChar *w, *wEnd; + CONST Tcl_UniChar *w, *wEnd; char *p, *string; int oldLength; @@ -289,7 +289,7 @@ Tcl_UniCharToUtfDString( int Tcl_UtfToUniChar( - register const char *src, /* The UTF-8 string. */ + register CONST char *src, /* The UTF-8 string. */ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { @@ -393,18 +393,18 @@ Tcl_UtfToUniChar( Tcl_UniChar * Tcl_UtfToUniCharDString( - const char *src, /* UTF-8 string to convert to Unicode. */ - size_t length, /* Length of UTF-8 string in bytes, or - * TCL_STRLEN for strlen(). */ + CONST char *src, /* UTF-8 string to convert to Unicode. */ + int length, /* Length of UTF-8 string in bytes, or -1 for + * strlen(). */ Tcl_DString *dsPtr) /* Unicode representation of string is * appended to this previously initialized * DString. 
*/ { Tcl_UniChar *w, *wString; - const char *p, *end; - size_t oldLength; + CONST char *p, *end; + int oldLength; - if (length == TCL_STRLEN) { + if (length < 0) { length = strlen(src); } @@ -414,9 +414,8 @@ Tcl_UtfToUniCharDString( */ oldLength = Tcl_DStringLength(dsPtr); -/* TODO: fix overreach! */ Tcl_DStringSetLength(dsPtr, - (oldLength + length + 1) * sizeof(Tcl_UniChar)); + (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); w = wString; @@ -427,7 +426,7 @@ Tcl_UtfToUniCharDString( } *w = '\0'; Tcl_DStringSetLength(dsPtr, - oldLength + ((char *) w - (char *) wString)); + (oldLength + ((char *) w - (char *) wString))); return wString; } @@ -453,9 +452,9 @@ Tcl_UtfToUniCharDString( int Tcl_UtfCharComplete( - const char *src, /* String to check if first few bytes contain + CONST char *src, /* String to check if first few bytes contain * a complete UTF-8 character. */ - size_t length) /* Length of above string in bytes. */ + int length) /* Length of above string in bytes. */ { int ch; @@ -481,11 +480,11 @@ Tcl_UtfCharComplete( *--------------------------------------------------------------------------- */ -size_t +int Tcl_NumUtfChars( - register const char *src, /* The UTF-8 string to measure. */ - size_t length) /* The length of the string in bytes, or - * TCL_STRLEN for strlen(string). */ + register CONST char *src, /* The UTF-8 string to measure. */ + int length) /* The length of the string in bytes, or -1 + * for strlen(string). */ { Tcl_UniChar ch; register Tcl_UniChar *chPtr = &ch; @@ -499,7 +498,7 @@ Tcl_NumUtfChars( */ i = 0; - if (length == TCL_STRLEN) { + if (length < 0) { while (*src != '\0') { src += TclUtfToUniChar(src, chPtr); i++; @@ -541,9 +540,9 @@ Tcl_NumUtfChars( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfFindFirst( - const char *src, /* The UTF-8 string to be searched. 
*/ + CONST char *src, /* The UTF-8 string to be searched. */ int ch) /* The Tcl_UniChar to search for. */ { int len; @@ -580,14 +579,14 @@ Tcl_UtfFindFirst( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfFindLast( - const char *src, /* The UTF-8 string to be searched. */ + CONST char *src, /* The UTF-8 string to be searched. */ int ch) /* The Tcl_UniChar to search for. */ { int len; Tcl_UniChar find; - const char *last; + CONST char *last; last = NULL; while (1) { @@ -622,9 +621,9 @@ Tcl_UtfFindLast( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfNext( - const char *src) /* The current location in the string. */ + CONST char *src) /* The current location in the string. */ { Tcl_UniChar ch; @@ -652,13 +651,13 @@ Tcl_UtfNext( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfPrev( - const char *src, /* The current location in the string. */ - const char *start) /* Pointer to the beginning of the string, to + CONST char *src, /* The current location in the string. */ + CONST char *start) /* Pointer to the beginning of the string, to * avoid going backwards too far. */ { - const char *look; + CONST char *look; int i, byte; src--; @@ -701,10 +700,10 @@ Tcl_UtfPrev( Tcl_UniChar Tcl_UniCharAtIndex( - register const char *src, /* The UTF-8 string to dereference. */ - register size_t index) /* The position of the desired character. */ + register CONST char *src, /* The UTF-8 string to dereference. */ + register int index) /* The position of the desired character. */ { - Tcl_UniChar ch = 0; + Tcl_UniChar ch; while (index >= 0) { index--; @@ -730,10 +729,10 @@ Tcl_UniCharAtIndex( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfAtIndex( - register const char *src, /* The UTF-8 string. 
*/ - register size_t index) /* The position of the desired character. */ + register CONST char *src, /* The UTF-8 string. */ + register int index) /* The position of the desired character. */ { Tcl_UniChar ch; @@ -770,17 +769,18 @@ Tcl_UtfAtIndex( *--------------------------------------------------------------------------- */ -size_t +int Tcl_UtfBackslash( - const char *src, /* Points to the backslash character of a + CONST char *src, /* Points to the backslash character of a * backslash sequence. */ - size_t *readPtr, /* Fill in with number of characters read from + int *readPtr, /* Fill in with number of characters read from * src, unless NULL. */ char *dst) /* Filled with the bytes represented by the * backslash sequence. */ { #define LINE_LENGTH 128 - size_t numRead, result; + int numRead; + int result; result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst); if (numRead == LINE_LENGTH) { @@ -820,7 +820,7 @@ Tcl_UtfToUpper( { Tcl_UniChar ch, upChar; char *src, *dst; - size_t bytes; + int bytes; /* * Iterate over the string until we hit the terminating null. @@ -838,7 +838,7 @@ Tcl_UtfToUpper( */ if (bytes < UtfCount(upChar)) { - memcpy(dst, src, bytes); + memcpy(dst, src, (size_t) bytes); dst += bytes; } else { dst += Tcl_UniCharToUtf(upChar, dst); @@ -873,7 +873,7 @@ Tcl_UtfToLower( { Tcl_UniChar ch, lowChar; char *src, *dst; - size_t bytes; + int bytes; /* * Iterate over the string until we hit the terminating null. 
@@ -891,7 +891,7 @@ Tcl_UtfToLower( */ if (bytes < UtfCount(lowChar)) { - memcpy(dst, src, bytes); + memcpy(dst, src, (size_t) bytes); dst += bytes; } else { dst += Tcl_UniCharToUtf(lowChar, dst); @@ -927,7 +927,7 @@ Tcl_UtfToTitle( { Tcl_UniChar ch, titleChar, lowChar; char *src, *dst; - size_t bytes; + int bytes; /* * Capitalize the first character and then lowercase the rest of the @@ -941,7 +941,7 @@ Tcl_UtfToTitle( titleChar = Tcl_UniCharToTitle(ch); if (bytes < UtfCount(titleChar)) { - memcpy(dst, src, bytes); + memcpy(dst, src, (size_t) bytes); dst += bytes; } else { dst += Tcl_UniCharToUtf(titleChar, dst); @@ -953,7 +953,7 @@ Tcl_UtfToTitle( lowChar = Tcl_UniCharToLower(ch); if (bytes < UtfCount(lowChar)) { - memcpy(dst, src, bytes); + memcpy(dst, src, (size_t) bytes); dst += bytes; } else { dst += Tcl_UniCharToUtf(lowChar, dst); @@ -983,9 +983,9 @@ Tcl_UtfToTitle( int TclpUtfNcmp2( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ - size_t numBytes) /* Number of *bytes* to compare. */ + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct, /* UTF string cs is compared to. */ + unsigned long numBytes) /* Number of *bytes* to compare. */ { /* * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to @@ -1030,9 +1030,9 @@ TclpUtfNcmp2( int Tcl_UtfNcmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ - size_t numChars) /* Number of UTF chars to compare. */ + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct, /* UTF string cs is compared to. */ + unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1, ch2; @@ -1078,9 +1078,9 @@ Tcl_UtfNcmp( int Tcl_UtfNcasecmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ - size_t numChars) /* Number of UTF chars to compare. */ + CONST char *cs, /* UTF string to compare to ct. 
*/ + CONST char *ct, /* UTF string cs is compared to. */ + unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1, ch2; while (numChars-- > 0) { @@ -1105,6 +1105,46 @@ Tcl_UtfNcasecmp( /* *---------------------------------------------------------------------- * + * TclUtfCasecmp -- + * + * Compare UTF chars of string cs to string ct case insensitively. + * Replacement for strcasecmp in Tcl core, in places where UTF-8 should + * be handled. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCasecmp( + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct) /* UTF string cs is compared to. */ +{ + while (*cs && *ct) { + Tcl_UniChar ch1, ch2; + + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { + ch1 = Tcl_UniCharToLower(ch1); + ch2 = Tcl_UniCharToLower(ch2); + if (ch1 != ch2) { + return ch1 - ch2; + } + } + } + return UCHAR(*cs) - UCHAR(*ct); +} + + +/* + *---------------------------------------------------------------------- + * * Tcl_UniCharToUpper -- * * Compute the uppercase equivalent of the given Unicode character. @@ -1212,7 +1252,7 @@ Tcl_UniCharLen( int Tcl_UniCharLen( - const Tcl_UniChar *uniStr) /* Unicode string to find length of. */ + CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. */ { int len = 0; @@ -1242,9 +1282,9 @@ Tcl_UniCharLen( int Tcl_UniCharNcmp( - const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ - size_t numChars) /* Number of unichars to compare. */ + CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. 
*/ { #ifdef WORDS_BIGENDIAN /* @@ -1287,9 +1327,9 @@ Tcl_UniCharNcmp( int Tcl_UniCharNcasecmp( - const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ - size_t numChars) /* Number of unichars to compare. */ + CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. */ { for ( ; numChars != 0; numChars--, ucs++, uct++) { if (*ucs != *uct) { @@ -1514,9 +1554,8 @@ Tcl_UniCharIsSpace( */ if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { - return isspace(UCHAR(ch)); /* INTL: ISO space */ - } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x200b - || (Tcl_UniChar) ch == 0x2060 || (Tcl_UniChar) ch == 0xfeff) { + return TclIsSpaceProc((char) ch); + } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) { return 1; } else { return ((SPACE_BITS >> GetCategory(ch)) & 1); @@ -1594,8 +1633,8 @@ Tcl_UniCharIsWordChar( int Tcl_UniCharCaseMatch( - const Tcl_UniChar *uniStr, /* Unicode String. */ - const Tcl_UniChar *uniPattern, + CONST Tcl_UniChar *uniStr, /* Unicode String. */ + CONST Tcl_UniChar *uniPattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ @@ -1782,14 +1821,14 @@ Tcl_UniCharCaseMatch( int TclUniCharMatch( - const Tcl_UniChar *string, /* Unicode String. */ - size_t strLen, /* Length of String */ - const Tcl_UniChar *pattern, /* Pattern, which may contain special + CONST Tcl_UniChar *string, /* Unicode String. */ + int strLen, /* Length of String */ + CONST Tcl_UniChar *pattern, /* Pattern, which may contain special * characters. 
*/ - size_t ptnLen, /* Length of Pattern */ + int ptnLen, /* Length of Pattern */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { - const Tcl_UniChar *stringEnd, *patternEnd; + CONST Tcl_UniChar *stringEnd, *patternEnd; Tcl_UniChar p; stringEnd = string + strLen; |
