diff options
Diffstat (limited to 'generic/tclUtf.c')
| -rw-r--r-- | generic/tclUtf.c | 202 | 
1 file changed, 115 insertions, 87 deletions
| diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 3c86e1b..15529c7 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -7,8 +7,6 @@   *   * See the file "license.terms" for information on usage and redistribution of   * this file, and for a DISCLAIMER OF ALL WARRANTIES. - * - * RCS: @(#) $Id: tclUtf.c,v 1.37 2005/10/31 15:59:41 dkf Exp $   */  #include "tclInt.h" @@ -28,28 +26,27 @@  #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \  	| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER)) +#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE)) +  #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)  #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \  	| (1 << PARAGRAPH_SEPARATOR)) -#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION) - -#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \ -	(1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ -	(1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ -	(1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \ -	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ -	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ -	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \ -	(1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ -	(1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) +#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION))  #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \  	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \  	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \  	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) +#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \ +	(1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ +	(1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ +	(1 << OTHER_NUMBER) | \ +	(1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ +	(1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) +  /*   * Unicode characters less than this value are 
represented by themselves in   * UTF-8 strings. @@ -62,7 +59,7 @@   * UTF-8.   */ -static CONST unsigned char totalBytes[256] = { +static const unsigned char totalBytes[256] = {      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -234,13 +231,13 @@ Tcl_UniCharToUtf(  char *  Tcl_UniCharToUtfDString( -    CONST Tcl_UniChar *uniStr,	/* Unicode string to convert to UTF-8. */ +    const Tcl_UniChar *uniStr,	/* Unicode string to convert to UTF-8. */      int uniLength,		/* Length of Unicode string in Tcl_UniChars  				 * (must be >= 0). */      Tcl_DString *dsPtr)		/* UTF-8 representation of string is appended  				 * to this previously initialized DString. */  { -    CONST Tcl_UniChar *w, *wEnd; +    const Tcl_UniChar *w, *wEnd;      char *p, *string;      int oldLength; @@ -292,7 +289,7 @@ Tcl_UniCharToUtfDString(  int  Tcl_UtfToUniChar( -    register CONST char *src,	/* The UTF-8 string. */ +    register const char *src,	/* The UTF-8 string. */      register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by  				 * the UTF-8 string. */  { @@ -396,7 +393,7 @@ Tcl_UtfToUniChar(  Tcl_UniChar *  Tcl_UtfToUniCharDString( -    CONST char *src,		/* UTF-8 string to convert to Unicode. */ +    const char *src,		/* UTF-8 string to convert to Unicode. */      int length,			/* Length of UTF-8 string in bytes, or -1 for  				 * strlen(). */      Tcl_DString *dsPtr)		/* Unicode representation of string is @@ -404,7 +401,7 @@ Tcl_UtfToUniCharDString(  				 * DString. */  {      Tcl_UniChar *w, *wString; -    CONST char *p, *end; +    const char *p, *end;      int oldLength;      if (length < 0) { @@ -417,6 +414,7 @@ Tcl_UtfToUniCharDString(       */      oldLength = Tcl_DStringLength(dsPtr); +/* TODO: fix overreach! 
*/      Tcl_DStringSetLength(dsPtr,  	    (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));      wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); @@ -455,7 +453,7 @@ Tcl_UtfToUniCharDString(  int  Tcl_UtfCharComplete( -    CONST char *src,		/* String to check if first few bytes contain +    const char *src,		/* String to check if first few bytes contain  				 * a complete UTF-8 character. */      int length)			/* Length of above string in bytes. */  { @@ -485,7 +483,7 @@ Tcl_UtfCharComplete(  int  Tcl_NumUtfChars( -    register CONST char *src,	/* The UTF-8 string to measure. */ +    register const char *src,	/* The UTF-8 string to measure. */      int length)			/* The length of the string in bytes, or -1  				 * for strlen(string). */  { @@ -543,9 +541,9 @@ Tcl_NumUtfChars(   *---------------------------------------------------------------------------   */ -CONST char * +const char *  Tcl_UtfFindFirst( -    CONST char *src,		/* The UTF-8 string to be searched. */ +    const char *src,		/* The UTF-8 string to be searched. */      int ch)			/* The Tcl_UniChar to search for. */  {      int len; @@ -582,14 +580,14 @@ Tcl_UtfFindFirst(   *---------------------------------------------------------------------------   */ -CONST char * +const char *  Tcl_UtfFindLast( -    CONST char *src,		/* The UTF-8 string to be searched. */ +    const char *src,		/* The UTF-8 string to be searched. */      int ch)			/* The Tcl_UniChar to search for. */  {      int len;      Tcl_UniChar find; -    CONST char *last; +    const char *last;      last = NULL;      while (1) { @@ -624,9 +622,9 @@ Tcl_UtfFindLast(   *---------------------------------------------------------------------------   */ -CONST char * +const char *  Tcl_UtfNext( -    CONST char *src)		/* The current location in the string. */ +    const char *src)		/* The current location in the string. 
*/  {      Tcl_UniChar ch; @@ -654,13 +652,13 @@ Tcl_UtfNext(   *---------------------------------------------------------------------------   */ -CONST char * +const char *  Tcl_UtfPrev( -    CONST char *src,		/* The current location in the string. */ -    CONST char *start)		/* Pointer to the beginning of the string, to +    const char *src,		/* The current location in the string. */ +    const char *start)		/* Pointer to the beginning of the string, to  				 * avoid going backwards too far. */  { -    CONST char *look; +    const char *look;      int i, byte;      src--; @@ -703,10 +701,10 @@ Tcl_UtfPrev(  Tcl_UniChar  Tcl_UniCharAtIndex( -    register CONST char *src,	/* The UTF-8 string to dereference. */ +    register const char *src,	/* The UTF-8 string to dereference. */      register int index)		/* The position of the desired character. */  { -    Tcl_UniChar ch; +    Tcl_UniChar ch = 0;      while (index >= 0) {  	index--; @@ -732,9 +730,9 @@ Tcl_UniCharAtIndex(   *---------------------------------------------------------------------------   */ -CONST char * +const char *  Tcl_UtfAtIndex( -    register CONST char *src,	/* The UTF-8 string. */ +    register const char *src,	/* The UTF-8 string. */      register int index)		/* The position of the desired character. */  {      Tcl_UniChar ch; @@ -774,7 +772,7 @@ Tcl_UtfAtIndex(  int  Tcl_UtfBackslash( -    CONST char *src,		/* Points to the backslash character of a +    const char *src,		/* Points to the backslash character of a  				 * backslash sequence. */      int *readPtr,		/* Fill in with number of characters read from  				 * src, unless NULL. */ @@ -986,8 +984,8 @@ Tcl_UtfToTitle(  int  TclpUtfNcmp2( -    CONST char *cs,		/* UTF string to compare to ct. */ -    CONST char *ct,		/* UTF string cs is compared to. */ +    const char *cs,		/* UTF string to compare to ct. */ +    const char *ct,		/* UTF string cs is compared to. */      unsigned long numBytes)	/* Number of *bytes* to compare. 
*/  {      /* @@ -1033,8 +1031,8 @@ TclpUtfNcmp2(  int  Tcl_UtfNcmp( -    CONST char *cs,		/* UTF string to compare to ct. */ -    CONST char *ct,		/* UTF string cs is compared to. */ +    const char *cs,		/* UTF string to compare to ct. */ +    const char *ct,		/* UTF string cs is compared to. */      unsigned long numChars)	/* Number of UTF chars to compare. */  {      Tcl_UniChar ch1, ch2; @@ -1081,8 +1079,8 @@ Tcl_UtfNcmp(  int  Tcl_UtfNcasecmp( -    CONST char *cs,		/* UTF string to compare to ct. */ -    CONST char *ct,		/* UTF string cs is compared to. */ +    const char *cs,		/* UTF string to compare to ct. */ +    const char *ct,		/* UTF string cs is compared to. */      unsigned long numChars)	/* Number of UTF chars to compare. */  {      Tcl_UniChar ch1, ch2; @@ -1108,6 +1106,46 @@ Tcl_UtfNcasecmp(  /*   *----------------------------------------------------------------------   * + * TclUtfCasecmp -- + * + *	Compare UTF chars of string cs to string ct case insensitively. + *	Replacement for strcasecmp in Tcl core, in places where UTF-8 should + *	be handled. + * + * Results: + *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + *	None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCasecmp( +    const char *cs,		/* UTF string to compare to ct. */ +    const char *ct)		/* UTF string cs is compared to. */ +{ +    while (*cs && *ct) { +	Tcl_UniChar ch1, ch2; + +	cs += TclUtfToUniChar(cs, &ch1); +	ct += TclUtfToUniChar(ct, &ch2); +	if (ch1 != ch2) { +	    ch1 = Tcl_UniCharToLower(ch1); +	    ch2 = Tcl_UniCharToLower(ch2); +	    if (ch1 != ch2) { +		return ch1 - ch2; +	    } +	} +    } +    return UCHAR(*cs) - UCHAR(*ct); +} + + +/* + *---------------------------------------------------------------------- + *   * Tcl_UniCharToUpper --   *   *	Compute the uppercase equivalent of the given Unicode character. 
@@ -1128,10 +1166,9 @@ Tcl_UniCharToUpper(      int info = GetUniCharInfo(ch);      if (GetCaseType(info) & 0x04) { -	return (Tcl_UniChar) (ch - GetDelta(info)); -    } else { -	return ch; +	ch -= GetDelta(info);      } +    return (Tcl_UniChar) ch;  }  /* @@ -1157,10 +1194,9 @@ Tcl_UniCharToLower(      int info = GetUniCharInfo(ch);      if (GetCaseType(info) & 0x02) { -	return (Tcl_UniChar) (ch + GetDelta(info)); -    } else { -	return ch; +	ch += GetDelta(info);      } +    return (Tcl_UniChar) ch;  }  /* @@ -1191,12 +1227,11 @@ Tcl_UniCharToTitle(  	 * Subtract or add one depending on the original case.  	 */ -	return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); +	ch += ((mode & 0x4) ? -1 : 1);      } else if (mode == 0x4) { -	return (Tcl_UniChar) (ch - GetDelta(info)); -    } else { -	return ch; +	ch -= GetDelta(info);      } +    return (Tcl_UniChar) ch;  }  /* @@ -1218,7 +1253,7 @@ Tcl_UniCharToTitle(  int  Tcl_UniCharLen( -    CONST Tcl_UniChar *uniStr)	/* Unicode string to find length of. */ +    const Tcl_UniChar *uniStr)	/* Unicode string to find length of. */  {      int len = 0; @@ -1248,8 +1283,8 @@ Tcl_UniCharLen(  int  Tcl_UniCharNcmp( -    CONST Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */ -    CONST Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */ +    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */ +    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */      unsigned long numChars)	/* Number of unichars to compare. */  {  #ifdef WORDS_BIGENDIAN @@ -1293,8 +1328,8 @@ Tcl_UniCharNcmp(  int  Tcl_UniCharNcasecmp( -    CONST Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */ -    CONST Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */ +    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */ +    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */      unsigned long numChars)	/* Number of unichars to compare. 
*/  {      for ( ; numChars != 0; numChars--, ucs++, uct++) { @@ -1330,9 +1365,7 @@ int  Tcl_UniCharIsAlnum(      int ch)			/* Unicode character to test. */  { -    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - -    return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); +    return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);  }  /* @@ -1355,8 +1388,7 @@ int  Tcl_UniCharIsAlpha(      int ch)			/* Unicode character to test. */  { -    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); -    return ((ALPHA_BITS >> category) & 1); +    return ((ALPHA_BITS >> GetCategory(ch)) & 1);  }  /* @@ -1379,7 +1411,7 @@ int  Tcl_UniCharIsControl(      int ch)			/* Unicode character to test. */  { -    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); +    return ((CONTROL_BITS >> GetCategory(ch)) & 1);  }  /* @@ -1402,7 +1434,7 @@ int  Tcl_UniCharIsDigit(      int ch)			/* Unicode character to test. */  { -    return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER; +    return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);  }  /* @@ -1425,8 +1457,7 @@ int  Tcl_UniCharIsGraph(      int ch)			/* Unicode character to test. */  { -    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); -    return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' ')); +    return ((GRAPH_BITS >> GetCategory(ch)) & 1);  }  /* @@ -1449,7 +1480,7 @@ int  Tcl_UniCharIsLower(      int ch)			/* Unicode character to test. */  { -    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); +    return (GetCategory(ch) == LOWERCASE_LETTER);  }  /* @@ -1472,8 +1503,7 @@ int  Tcl_UniCharIsPrint(      int ch)			/* Unicode character to test. 
*/  { -    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); -    return ((PRINT_BITS >> category) & 1); +    return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);  }  /* @@ -1496,8 +1526,7 @@ int  Tcl_UniCharIsPunct(      int ch)			/* Unicode character to test. */  { -    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); -    return ((PUNCT_BITS >> category) & 1); +    return ((PUNCT_BITS >> GetCategory(ch)) & 1);  }  /* @@ -1520,18 +1549,19 @@ int  Tcl_UniCharIsSpace(      int ch)			/* Unicode character to test. */  { -    register int category; -      /*       * If the character is within the first 127 characters, just use the       * standard C function, otherwise consult the Unicode table.       */ -    if (ch < 0x80) { -	return isspace(UCHAR(ch)); /* INTL: ISO space */ +    if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { +	return TclIsSpaceProc((char) ch); +    } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x180e +	    || (Tcl_UniChar) ch == 0x200b || (Tcl_UniChar) ch == 0x2060 +	    || (Tcl_UniChar) ch == 0x202f || (Tcl_UniChar) ch == 0xfeff) { +	return 1;      } else { -	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); -	return ((SPACE_BITS >> category) & 1); +	return ((SPACE_BITS >> GetCategory(ch)) & 1);      }  } @@ -1555,7 +1585,7 @@ int  Tcl_UniCharIsUpper(      int ch)			/* Unicode character to test. */  { -    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); +    return (GetCategory(ch) == UPPERCASE_LETTER);  }  /* @@ -1578,9 +1608,7 @@ int  Tcl_UniCharIsWordChar(      int ch)			/* Unicode character to test. */  { -    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - -    return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); +    return ((WORD_BITS >> GetCategory(ch)) & 1);  }  /* @@ -1608,8 +1636,8 @@ Tcl_UniCharIsWordChar(  int  Tcl_UniCharCaseMatch( -    CONST Tcl_UniChar *uniStr,	/* Unicode String. 
*/ -    CONST Tcl_UniChar *uniPattern, +    const Tcl_UniChar *uniStr,	/* Unicode String. */ +    const Tcl_UniChar *uniPattern,  				/* Pattern, which may contain special  				 * characters. */      int nocase)			/* 0 for case sensitive, 1 for insensitive */ @@ -1796,14 +1824,14 @@ Tcl_UniCharCaseMatch(  int  TclUniCharMatch( -    CONST Tcl_UniChar *string,	/* Unicode String. */ +    const Tcl_UniChar *string,	/* Unicode String. */      int strLen,			/* Length of String */ -    CONST Tcl_UniChar *pattern,	/* Pattern, which may contain special +    const Tcl_UniChar *pattern,	/* Pattern, which may contain special  				 * characters. */      int ptnLen,			/* Length of Pattern */      int nocase)			/* 0 for case sensitive, 1 for insensitive */  { -    CONST Tcl_UniChar *stringEnd, *patternEnd; +    const Tcl_UniChar *stringEnd, *patternEnd;      Tcl_UniChar p;      stringEnd = string + strLen; | 
